Repository: crwlrsoft/crawler
Branch: main
Commit: d6680f9e698a
Files: 326
Total size: 1.1 MB
Directory structure:
gitextract_d22hbn5_/
├── .editorconfig
├── .gitattributes
├── .github/
│ └── workflows/
│ └── ci.yml
├── .gitignore
├── .php-cs-fixer.php
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── bin/
│ └── add-git-hooks
├── composer.json
├── git-hooks/
│ └── pre-commit
├── phpstan.neon
├── phpunit.xml
├── src/
│ ├── Cache/
│ │ ├── CacheItem.php
│ │ ├── Exceptions/
│ │ │ ├── MissingZlibExtensionException.php
│ │ │ └── ReadingCacheFailedException.php
│ │ └── FileCache.php
│ ├── Crawler.php
│ ├── HttpCrawler/
│ │ └── AnonymousHttpCrawlerBuilder.php
│ ├── HttpCrawler.php
│ ├── Input.php
│ ├── Io.php
│ ├── Loader/
│ │ ├── Http/
│ │ │ ├── Browser/
│ │ │ │ ├── Screenshot.php
│ │ │ │ └── ScreenshotConfig.php
│ │ │ ├── Cache/
│ │ │ │ └── RetryManager.php
│ │ │ ├── Cookies/
│ │ │ │ ├── Cookie.php
│ │ │ │ ├── CookieJar.php
│ │ │ │ ├── Date.php
│ │ │ │ └── Exceptions/
│ │ │ │ └── InvalidCookieException.php
│ │ │ ├── Exceptions/
│ │ │ │ └── LoadingException.php
│ │ │ ├── HeadlessBrowserLoaderHelper.php
│ │ │ ├── HttpLoader.php
│ │ │ ├── Messages/
│ │ │ │ └── RespondedRequest.php
│ │ │ ├── Politeness/
│ │ │ │ ├── RetryErrorResponseHandler.php
│ │ │ │ ├── RobotsTxtHandler.php
│ │ │ │ ├── Throttler.php
│ │ │ │ └── TimingUnits/
│ │ │ │ └── MultipleOf.php
│ │ │ └── ProxyManager.php
│ │ ├── Loader.php
│ │ └── LoaderInterface.php
│ ├── Logger/
│ │ ├── CliLogger.php
│ │ └── PreStepInvocationLogger.php
│ ├── Output.php
│ ├── Result.php
│ ├── Steps/
│ │ ├── BaseStep.php
│ │ ├── Csv.php
│ │ ├── Dom/
│ │ │ ├── DomDocument.php
│ │ │ ├── HtmlDocument.php
│ │ │ ├── HtmlElement.php
│ │ │ ├── Node.php
│ │ │ ├── NodeList.php
│ │ │ ├── XmlDocument.php
│ │ │ └── XmlElement.php
│ │ ├── Dom.php
│ │ ├── Exceptions/
│ │ │ └── PreRunValidationException.php
│ │ ├── Filters/
│ │ │ ├── AbstractFilter.php
│ │ │ ├── ArrayFilter.php
│ │ │ ├── ClosureFilter.php
│ │ │ ├── ComparisonFilter.php
│ │ │ ├── Enums/
│ │ │ │ ├── ComparisonFilterRule.php
│ │ │ │ ├── StringFilterRule.php
│ │ │ │ ├── StringLengthFilterRule.php
│ │ │ │ └── UrlFilterRule.php
│ │ │ ├── Filter.php
│ │ │ ├── FilterInterface.php
│ │ │ ├── Filterable.php
│ │ │ ├── NegatedFilter.php
│ │ │ ├── StringFilter.php
│ │ │ ├── StringLengthFilter.php
│ │ │ └── UrlFilter.php
│ │ ├── Group.php
│ │ ├── Html/
│ │ │ ├── CssSelector.php
│ │ │ ├── DomQuery.php
│ │ │ ├── Exceptions/
│ │ │ │ └── InvalidDomQueryException.php
│ │ │ ├── GetLink.php
│ │ │ ├── GetLinks.php
│ │ │ ├── MetaData.php
│ │ │ ├── SchemaOrg.php
│ │ │ ├── SelectorTarget.php
│ │ │ └── XPathQuery.php
│ │ ├── Html.php
│ │ ├── Json.php
│ │ ├── Loading/
│ │ │ ├── GetSitemapsFromRobotsTxt.php
│ │ │ ├── Http/
│ │ │ │ ├── AbstractPaginator.php
│ │ │ │ ├── Browser/
│ │ │ │ │ └── BrowserAction.php
│ │ │ │ ├── Document.php
│ │ │ │ ├── Paginate.php
│ │ │ │ ├── Paginator.php
│ │ │ │ └── Paginators/
│ │ │ │ ├── QueryParams/
│ │ │ │ │ ├── AbstractQueryParamManipulator.php
│ │ │ │ │ ├── Decrementor.php
│ │ │ │ │ ├── Incrementor.php
│ │ │ │ │ └── QueryParamManipulator.php
│ │ │ │ ├── QueryParamsPaginator.php
│ │ │ │ ├── SimpleWebsitePaginator.php
│ │ │ │ └── StopRules/
│ │ │ │ ├── Contains.php
│ │ │ │ ├── IsEmptyInDom.php
│ │ │ │ ├── IsEmptyInHtml.php
│ │ │ │ ├── IsEmptyInJson.php
│ │ │ │ ├── IsEmptyInXml.php
│ │ │ │ ├── IsEmptyResponse.php
│ │ │ │ ├── NotContains.php
│ │ │ │ ├── PaginatorStopRules.php
│ │ │ │ └── StopRule.php
│ │ │ ├── Http.php
│ │ │ ├── HttpBase.php
│ │ │ ├── HttpCrawl.php
│ │ │ └── LoadingStep.php
│ │ ├── Refiners/
│ │ │ ├── AbstractRefiner.php
│ │ │ ├── DateTime/
│ │ │ │ └── DateTimeFormat.php
│ │ │ ├── DateTimeRefiner.php
│ │ │ ├── Html/
│ │ │ │ └── RemoveFromHtml.php
│ │ │ ├── HtmlRefiner.php
│ │ │ ├── RefinerInterface.php
│ │ │ ├── String/
│ │ │ │ ├── AbstractStringRefiner.php
│ │ │ │ ├── StrAfterFirst.php
│ │ │ │ ├── StrAfterLast.php
│ │ │ │ ├── StrBeforeFirst.php
│ │ │ │ ├── StrBeforeLast.php
│ │ │ │ ├── StrBetweenFirst.php
│ │ │ │ ├── StrBetweenLast.php
│ │ │ │ └── StrReplace.php
│ │ │ ├── StringRefiner.php
│ │ │ ├── Url/
│ │ │ │ ├── AbstractUrlRefiner.php
│ │ │ │ ├── WithFragment.php
│ │ │ │ ├── WithHost.php
│ │ │ │ ├── WithPath.php
│ │ │ │ ├── WithPort.php
│ │ │ │ ├── WithQuery.php
│ │ │ │ ├── WithScheme.php
│ │ │ │ └── WithoutPort.php
│ │ │ └── UrlRefiner.php
│ │ ├── Sitemap/
│ │ │ └── GetUrlsFromSitemap.php
│ │ ├── Sitemap.php
│ │ ├── Step.php
│ │ ├── StepInterface.php
│ │ ├── StepOutputType.php
│ │ └── Xml.php
│ ├── Stores/
│ │ ├── JsonFileStore.php
│ │ ├── SimpleCsvFileStore.php
│ │ ├── Store.php
│ │ └── StoreInterface.php
│ ├── UserAgents/
│ │ ├── BotUserAgent.php
│ │ ├── BotUserAgentInterface.php
│ │ ├── UserAgent.php
│ │ └── UserAgentInterface.php
│ └── Utils/
│ ├── Gzip.php
│ ├── HttpHeaders.php
│ ├── OutputTypeHelper.php
│ ├── RequestKey.php
│ └── TemplateString.php
└── tests/
├── Cache/
│ ├── CacheItemTest.php
│ ├── FileCacheTest.php
│ └── _cachefilecontent
├── CrawlerTest.php
├── HttpCrawler/
│ └── AnonymousHttpCrawlerBuilderTest.php
├── IoTest.php
├── Loader/
│ ├── Http/
│ │ ├── Browser/
│ │ │ └── ScreenshotConfigTest.php
│ │ ├── Cache/
│ │ │ └── RetryManagerTest.php
│ │ ├── Cookies/
│ │ │ ├── CookieJarTest.php
│ │ │ ├── CookieTest.php
│ │ │ └── DateTest.php
│ │ ├── HeadlessBrowserLoaderHelperTest.php
│ │ ├── HttpLoaderPolitenessTest.php
│ │ ├── HttpLoaderTest.php
│ │ ├── Messages/
│ │ │ └── RespondedRequestTest.php
│ │ ├── Politeness/
│ │ │ ├── RobotsTxtHandlerTest.php
│ │ │ ├── ThrottlerTest.php
│ │ │ └── TimingUnits/
│ │ │ └── MultipleOfTest.php
│ │ └── ProxyManagerTest.php
│ └── LoaderTest.php
├── Logger/
│ ├── CliLoggerTest.php
│ └── PreStepInvocationLoggerTest.php
├── Pest.php
├── ResultTest.php
├── Steps/
│ ├── BaseStepTest.php
│ ├── CsvTest.php
│ ├── Dom/
│ │ ├── HtmlDocumentTest.php
│ │ ├── HtmlElementTest.php
│ │ ├── NodeListTest.php
│ │ ├── NodeTest.php
│ │ ├── XmlDocumentTest.php
│ │ ├── XmlElementTest.php
│ │ └── _Stubs/
│ │ ├── HtmlNodeStub.php
│ │ └── XmlNodeStub.php
│ ├── DomTest.php
│ ├── Filters/
│ │ ├── ArrayFilterTest.php
│ │ ├── ClosureFilterTest.php
│ │ ├── ComparisonFilterTest.php
│ │ ├── Enums/
│ │ │ ├── ComparisonFilterRuleTest.php
│ │ │ ├── StringFilterRuleTest.php
│ │ │ ├── StringLengthFilterRuleTest.php
│ │ │ └── UrlFilterRuleTest.php
│ │ ├── FilterTest.php
│ │ ├── NegatedFilterTest.php
│ │ ├── StringFilterTest.php
│ │ ├── StringLengthFilterTest.php
│ │ └── UrlFilterTest.php
│ ├── GroupTest.php
│ ├── Html/
│ │ ├── CssSelectorTest.php
│ │ ├── Exceptions/
│ │ │ └── InvalidDomQueryExceptionTest.php
│ │ ├── GetLinkTest.php
│ │ ├── GetLinksTest.php
│ │ ├── MetaDataTest.php
│ │ ├── SchemaOrgTest.php
│ │ └── XPathQueryTest.php
│ ├── HtmlTest.php
│ ├── JsonTest.php
│ ├── Loading/
│ │ ├── GetSitemapsFromRobotsTxtTest.php
│ │ ├── Http/
│ │ │ ├── DocumentTest.php
│ │ │ └── Paginators/
│ │ │ ├── AbstractPaginatorTest.php
│ │ │ ├── QueryParams/
│ │ │ │ ├── AbstractQueryParamManipulatorTest.php
│ │ │ │ ├── DecrementorTest.php
│ │ │ │ └── IncrementorTest.php
│ │ │ ├── QueryParamsPaginatorTest.php
│ │ │ ├── SimpleWebsitePaginatorTest.php
│ │ │ └── StopRules/
│ │ │ ├── ContainsTest.php
│ │ │ ├── IsEmptyInHtmlTest.php
│ │ │ ├── IsEmptyInJsonTest.php
│ │ │ ├── IsEmptyInXmlTest.php
│ │ │ ├── IsEmptyResponseTest.php
│ │ │ └── NotContainsTest.php
│ │ ├── HttpTest.php
│ │ └── LoadingStepTest.php
│ ├── Refiners/
│ │ ├── AbstractRefinerTest.php
│ │ ├── DateTime/
│ │ │ └── DateTimeFormatTest.php
│ │ ├── Html/
│ │ │ └── RemoveFromHtmlTest.php
│ │ ├── String/
│ │ │ ├── AfterFirstTest.php
│ │ │ ├── AfterLastTest.php
│ │ │ ├── BeforeFirstTest.php
│ │ │ ├── BeforeLastTest.php
│ │ │ ├── BetweenFirstTest.php
│ │ │ ├── BetweenLastTest.php
│ │ │ └── ReplaceTest.php
│ │ └── Url/
│ │ ├── WithFragmentTest.php
│ │ ├── WithHostTest.php
│ │ ├── WithPathTest.php
│ │ ├── WithPortTest.php
│ │ ├── WithQueryTest.php
│ │ ├── WithSchemeTest.php
│ │ └── WithoutPortTest.php
│ ├── Sitemap/
│ │ └── GetUrlsFromSitemapTest.php
│ ├── StepTest.php
│ ├── XmlTest.php
│ └── _Files/
│ ├── Csv/
│ │ ├── basic.csv
│ │ ├── enclosure.csv
│ │ ├── escape.csv
│ │ ├── separator.csv
│ │ └── with-column-headlines.csv
│ ├── Html/
│ │ ├── basic.html
│ │ ├── bookstore.html
│ │ └── event.html
│ └── Xml/
│ ├── bookstore.xml
│ ├── events.xml
│ └── rss-with-bom.xml
├── Stores/
│ ├── JsonFileStoreTest.php
│ ├── SimpleCsvFileStoreTest.php
│ └── _files/
│ └── .gitkeep
├── UserAgents/
│ ├── BotUserAgentTest.php
│ └── UserAgentTest.php
├── Utils/
│ ├── GzipTest.php
│ ├── HttpHeadersTest.php
│ ├── OutputTypeHelperTest.php
│ ├── RequestKeyTest.php
│ └── TemplateStringTest.php
├── _Integration/
│ ├── GroupTest.php
│ ├── Http/
│ │ ├── CharsetTest.php
│ │ ├── CrawlingTest.php
│ │ ├── ErrorResponsesTest.php
│ │ ├── GzipTest.php
│ │ ├── HeadlessBrowserTest.php
│ │ ├── Html/
│ │ │ ├── PaginatedListingTest.php
│ │ │ └── SimpleListingTest.php
│ │ ├── PaginationTest.php
│ │ ├── ProxyingTest.php
│ │ ├── PublisherExampleTest.php
│ │ ├── QueryParamPaginationTest.php
│ │ ├── RedirectTest.php
│ │ ├── RequestParamsFromInputTest.php
│ │ ├── RetryErrorResponsesTest.php
│ │ ├── RobotsTxtTest.php
│ │ └── TimeoutTest.php
│ ├── ProxyServer.php
│ ├── Server.php
│ └── _Server/
│ ├── BlogPostWithJsonLd.php
│ ├── BrokenMimeTypeRss.php
│ ├── BrowserActions/
│ │ ├── ClickAndWaitForReload.php
│ │ ├── EvaluateAndWaitForReload.php
│ │ ├── EvaluateAndWaitForReloadReloaded.php
│ │ ├── Main.php
│ │ └── Wait.php
│ ├── Crawling.php
│ ├── HelloWorld.php
│ ├── JsGeneratedContent.php
│ ├── NonUtf8.php
│ ├── PageInitScript.php
│ ├── PaginatedListing/
│ │ └── Detail.php
│ ├── PaginatedListing.php
│ ├── PrintCookie.php
│ ├── PrintCookies.php
│ ├── PrintHeaders.php
│ ├── Publisher/
│ │ ├── AuthorDetailPage.php
│ │ ├── AuthorsListPage.php
│ │ ├── BookDetailPage.php
│ │ └── EditionDetailPage.php
│ ├── QueryParamPagination.php
│ ├── RssFeed.php
│ ├── ServiceUnavailable.php
│ ├── SetCookie.php
│ ├── SetCookieJs.php
│ ├── SetDelayedCookieJs.php
│ ├── SetMultipleCookiesJs.php
│ ├── SimpleListing/
│ │ └── Detail.php
│ ├── SimpleListing.php
│ └── TooManyRequests.php
├── _Stubs/
│ ├── AbstractTestPaginator.php
│ ├── Crawlers/
│ │ ├── DummyOne.php
│ │ ├── DummyTwo/
│ │ │ ├── DummyTwoLoader.php
│ │ │ ├── DummyTwoLogger.php
│ │ │ └── DummyTwoUserAgent.php
│ │ └── DummyTwo.php
│ ├── DummyLogger.php
│ ├── PhantasyLoader.php
│ └── RespondedRequestChild.php
└── _Temp/
├── _cachedir/
│ └── .gitkeep
└── _storagedir/
└── .gitkeep
================================================
FILE CONTENTS
================================================
================================================
FILE: .editorconfig
================================================
# EditorConfig is awesome: http://EditorConfig.org
root = true
[*]
charset = utf-8
end_of_line = lf
indent_style = space
indent_size = 4
insert_final_newline = true
trim_trailing_whitespace = true
[*.md]
trim_trailing_whitespace = false
[*.yml]
indent_size = 2
[_cachefilecontent]
insert_final_newline = false
================================================
FILE: .gitattributes
================================================
.github export-ignore
bin/add-git-hooks export-ignore
git-hooks export-ignore
tests export-ignore
.editorconfig export-ignore
.gitattributes export-ignore
.gitignore export-ignore
.php-cs-fixer.php export-ignore
phpstan.neon export-ignore
phpunit.xml export-ignore
================================================
FILE: .github/workflows/ci.yml
================================================
name: CI
on: pull_request
jobs:
tests:
name: PestPHP Tests
runs-on: ubuntu-latest
strategy:
matrix:
php-versions: ['8.1', '8.2', '8.3', '8.4', '8.5']
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install PHP
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php-versions }}
- name: Install dependencies
run: composer install --prefer-dist --no-progress
- name: Run tests
run: composer test
- name: Run integration tests
run: composer test-integration
tests84:
name: PestPHP Tests Running only on PHP >= 8.4
runs-on: ubuntu-latest
strategy:
matrix:
php-versions: ['8.4', '8.5']
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install PHP
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php-versions }}
- name: Install dependencies
run: composer install --prefer-dist --no-progress
- name: Run tests
run: composer test-php84
stanAndCs:
name: Static Analysis (phpstan) and Code Style (PHP CS Fixer)
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install PHP
uses: shivammathur/setup-php@v2
with:
php-version: '8.1'
coverage: none
- name: Install dependencies
run: composer install --prefer-dist --no-progress
- name: Run PHPStan
run: composer stan
- name: Run PHP CS Fixer
run: composer cs
================================================
FILE: .gitignore
================================================
composer.lock
vendor
.php_cs.cache
.php-cs-fixer.cache
.phpunit.result.cache
.phpunit.cache
/cachedir
/storedir
/tests/_Temp/_cachedir/*
!/tests/_Temp/_cachedir/.gitkeep
================================================
FILE: .php-cs-fixer.php
================================================
exclude(['tests/_Integration/_Server', '.github', 'bin', 'git-hooks'])
->in(__DIR__);
return (new Config())
->setFinder($finder)
->setParallelConfig(ParallelConfigFactory::detect())
->setRules([
'@PER-CS' => true,
'strict_param' => true,
'array_syntax' => ['syntax' => 'short'],
'no_unused_imports' => true,
'operator_linebreak' => ['only_booleans' => true, 'position' => 'end'],
])
->setRiskyAllowed(true)
->setUsingCache(true);
================================================
FILE: CHANGELOG.md
================================================
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [3.5.6] - 2026-01-05
### Fixed
* Potential issues found with PHPStan 2 on level 8.
## [3.5.5] - 2025-08-05
### Fixed
* Removed the overriding `validateAndSanitizeInput()` method from the `Paginate` HTTP step to ensure features like `staticUrl()` and `useInputKeyAsUrl()` work correctly.
* The `Paginate` HTTP step now also supports receiving an array of URLs, initiating pagination separately for each one.
### Deprecated
* The `Crwlr\Crawler\Steps\Loading\Http\Paginate` class. It shall be removed and its behavior implemented in the `Http` class directly, in the next major version.
## [3.5.4] - 2025-07-28
### Fixed
* An issue in the `SimpleWebsitePaginator` when used with stop rules.
## [3.5.3] - 2025-06-10
### Fixed
* Issues with passing cookies from the cookie jar to the headless browser when using the `useBrowser()` method on `Http` steps, in cases where the loader wasn’t globally configured to use the browser for all requests.
## [3.5.2] - 2025-05-16
### Fixed
* The `Result::toArray()` method now converts all objects contained in the Result array (at any level of the array) to arrays. Also, if the only element in a result array has some autogenerated key containing "unnamed", but the value also is an associative array with string keys, the method only returns that child array.
## [3.5.1] - 2025-04-23
### Fixed
* An issue that occurred when a step uses the `PreStepInvocationLogger`. As refiners also use the logger, a newer logger (replacing the `PreStepInvocationLogger`) is now also passed to all registered refiners of a step.
* Enable applying refiners to output properties with array value. E.g. if a step outputs an array of URLs (`['https://...', 'https://...']`), a `UrlRefiner` will be applied to all those URLs.
## [3.5.0] - 2025-04-10
### Added
* Dynamically building request URLs from extracted data: `Http` steps now have a new `staticUrl()` method, and you can also use variables within that static URL - as well as in request headers and the body - like `https://www.example.com/foo/[crwl:some_extracted_property]`. These variables will be replaced with the corresponding properties from input data (also works with kept data).
* New Refiners:
* `DateTimeRefiner::reformat('Y-m-d H:i:s')` to reformat a date time string to a different format. Tries to automatically recognize the input format. If this does not work, you can provide an input format to use as the second argument.
* `HtmlRefiner::remove('#foo')` to remove nodes matching the given selector from selected HTML.
* Steps that produce multiple outputs per input can now group them per input by calling the new `Step::oneOutputPerInput()` method.
## [3.4.5] - 2025-04-09
### Fixed
* When feeding an `Http` step with a string that is not a valid URL (e.g. `https://`), the exception when trying to parse it as a URL is caught, and an error is logged.
## [3.4.4] - 2025-04-04
### Fixed
* Because XML parsing errors sometimes occur due to characters that aren't valid within XML documents, the library now catches XML parsing errors, tries to find and replace invalid characters (with transliterations or HTML entities), and retries parsing the document. Works best when you additionally install the `voku/portable-ascii` composer package.
## [3.4.3] - 2025-04-03
### Fixed
* When providing an empty base selector to an `Html` step (`Html::each('')`, `Html::first('')`, `Html::last('')`), it won't fail with an error, but instead log a warning, that it most likely doesn't make sense.
* The `Step::keep()` methods now also work when applied to child steps within a group step.
## [3.4.2] - 2025-03-08
### Fixed
* Issue when using `Http::get()->useBrowser()->postBrowserNavigateHook()`. Previously, in this case, when the loader was configured to use the HTTP client, the post browser navigate hook was not actually set, because of an issue with the order in which things happened internally.
## [3.4.1] - 2025-03-08
### Fixed
* Since, when using the Chrome browser for loading, we can only execute GET requests:
* The loader now automatically switches to the HTTP client for POST, PUT, PATCH, and DELETE requests and logs a warning.
* A warning is logged when attempting to use "Post Browser Navigate Hooks" with POST, PUT, PATCH, or DELETE requests.
* Consequently, the `useBrowser()` method, introduced in v3.4.0, is also limited to GET requests.
## [3.4.0] - 2025-03-06
### Added
* Two new methods to the base class of all `Http` steps:
* `skipCache()` – Allows using the cache while skipping it for a specific loading step.
* `useBrowser()` – Switches the loader to use a (headless) Chrome browser for loading calls in a specific step and then reverts the loader to its previous setting.
* Introduced the new `BrowserAction::screenshot()` post browser navigate hook. It accepts an instance of the new `ScreenshotConfig` class, allowing you to configure various options (see the methods of `ScreenshotConfig`). If successful, the screenshot file paths are included in the `RespondedRequest` output object of the `Http` step.
## [3.3.0] - 2025-03-02
### Added
* New `BrowserAction`s to use with the `postBrowserNavigateHook()` method:
* `BrowserAction::clickInsideShadowDom()`
* `BrowserAction::moveMouseToElement()`
* `BrowserAction::moveMouseToPosition()`
* `BrowserAction::scrollDown()`
* `BrowserAction::scrollUp()`
* `BrowserAction::typeText()`
* `BrowserAction::waitForReload()`
* A new method in `HeadlessBrowserLoaderHelper` to include the HTML content of shadow DOM elements in the returned HTML. Use it like this: `$crawler->getLoader()->browser()->includeShadowElementsInHtml()`.
### Changed
* The `BrowserAction::clickElement()` action now automatically waits for an element matching the selector to be rendered before performing the click. This means you don't need to put a `BrowserAction::waitUntilDocumentContainsElement()` before it. It works the same in the new `BrowserAction::clickInsideShadowDom()` and `BrowserAction::moveMouseToElement()` actions.
### Deprecated
* `BrowserAction::clickElementAndWaitForReload()` and `BrowserAction::evaluateAndWaitForReload()`. As a replacement, please use `BrowserAction::clickElement()` or `BrowserAction::evaluate()` and `BrowserAction::waitForReload()` separately.
## [3.2.5] - 2025-02-26
### Fixed
* When a child step is nested in the `extract()` method of an `Html` or `Xml` step, and does not use `each()` as the base, the extracted value is an array with the keys defined in the `extract()` call, rather than an array of such arrays as it would be with `each()` as base.
## [3.2.4] - 2025-02-25
### Fixed
* Trying to load a relative reference URI (no scheme and host/authority, only path) via the `HttpLoader` now immediately logs (or throws when `loadOrFail()` is used) an error instead of trying to actually load it.
## [3.2.3] - 2025-01-28
### Fixed
* Fix deprecation warning triggered in the `DomQuery` class, when trying to get the value of an HTML/XML attribute that does not exist on the element.
## [3.2.2] - 2025-01-17
### Fixed
* Warnings about loader hooks being called multiple times, when using a `BotUserAgent` and therefore loading and respecting the robots.txt file, or when using the `Http::stopOnErrorResponse()` method.
## [3.2.1] - 2025-01-13
### Fixed
* Reuse previously opened page when using the (headless) Chrome browser, instead of opening a new page for each request.
## [3.2.0] - 2025-01-12
### Added
* `RespondedRequest::isServedFromCache()` to determine whether a response was served from cache or actually loaded.
## [3.1.5] - 2025-01-10
### Fixed
* Another improvement for getting XML source when using the browser, in cases where Chrome doesn't identify the response as an XML document (even though a Content-Type header is sent).
## [3.1.4] - 2025-01-10
### Fixed
* `HttpLoader::dontUseCookies()` now also works when using the Chrome browser. Cookies are cleared before every request.
## [3.1.3] - 2025-01-10
### Fixed
* Further improve getting the raw response body from non-HTML documents via Chrome browser.
## [3.1.2] - 2025-01-08
### Fixed
* When loading a non-HTML document (e.g., XML) via the Chrome browser, the library now retrieves the original source. Previously, it returned the outerHTML of the rendered document, which wrapped the content in an HTML structure.
## [3.1.1] - 2025-01-07
### Fixed
* When the `validateAndSanitize()` method of a step throws an `InvalidArgumentException`, the exception is now caught, logged and the step is not invoked with the invalid input. This improves fault tolerance. Feeding a step with one invalid input shouldn't cause the whole crawler run to fail. Exceptions other than `InvalidArgumentException` remain uncaught.
## [3.1.0] - 2025-01-03
### Added
* New method `HeadlessBrowserLoaderHelper::setPageInitScript()` (`$crawler->getLoader()->browser()->setPageInitScript()`) to provide javascript code that is executed on every new browser page before navigating anywhere.
* New method `HeadlessBrowserLoaderHelper::useNativeUserAgent()` (`$crawler->getLoader()->browser()->useNativeUserAgent()`) to allow using the native `User-Agent` that your Chrome browser sends by default.
## [3.0.4] - 2024-12-18
### Fixed
* Minor improvement for the `DomQuery` (base for `Dom::cssSelector()` and `Dom::xPath()`): enable providing an empty string as selector, to simply get the node that the selector is applied to.
## [3.0.3] - 2024-12-11
### Fixed
* Improved fix for non UTF-8 characters in HTML documents declared as UTF-8.
## [3.0.2] - 2024-12-11
### Fixed
* When the new PHP 8.4 DOM API is used, and HTML declared as UTF-8 contains non UTF-8 compatible characters, it does not replace them with a � character, but instead removes them. This behaviour is consistent with the data returned by Symfony DomCrawler.
## [3.0.1] - 2024-12-10
### Undeprecated
* Removed deprecations for all XPath functionality (`Dom::xPath()`, `XPathQuery` class and `Node::queryXPath()`), because it's still available with the new DOM API in PHP 8.4.
## [3.0.0] - 2024-12-08
The primary change in version 3.0.0 is that the library now leverages PHP 8.4’s new DOM API when used in an environment with PHP >= 8.4. To maintain compatibility with PHP < 8.4, an abstraction layer has been implemented. This layer dynamically uses either the Symfony DomCrawler component or the new DOM API, depending on the PHP version.
Since no direct interaction with an instance of the Symfony DomCrawler library was required at the step level provided by the library, it is highly likely that you won’t need to make any changes to your code to upgrade to v3. To ensure a smooth transition, please review the points under “Changed.”
### Changed
* __BREAKING__: The `DomQuery::innerText()` method (a.k.a. `Dom::cssSelector('...')->innerText()`) has been removed. `innerText` exists only in the Symfony DomCrawler component, and its usefulness is questionable. If you still require this variant of the DOM element text, please let us know or create a pull request yourself. Thank you!
* __BREAKING__: The `DomQueryInterface` was removed. As the `DomQuery` class offers a lot more functionality than the interface defines, the purpose of the interface was questionable. Please use the abstract `DomQuery` class instead. This also means that some method signatures, type hinting the interface, have changed. Look for occurrences of `DomQueryInterface` and replace them.
* __BREAKING__: The visibility of the `DomQuery::filter()` method was changed from public to protected. It is still needed in the `DomQuery` class, but outside of it, it is probably better and easier to directly use the new DOM abstraction (see the `src/Steps/Dom` directory). If you are extending the `DomQuery` class (which is not recommended), be aware that the argument now takes a `Node` (from the new DOM abstraction) instead of a Symfony `Crawler`.
* __BREAKING__: The `Step::validateAndSanitizeToDomCrawlerInstance()` method was removed. Please use the `Step::validateAndSanitizeToHtmlDocumentInstance()` and `Step::validateAndSanitizeToXmlDocumentInstance()` methods instead.
* __BREAKING__: The second argument in `Closure`s passed to the `Http::crawl()->customFilter()` has changed from an instance of Symfony `Crawler` class, to an `HtmlElement` instance from the new DOM abstraction (`Crwlr\Crawler\Steps\Dom\HtmlElement`).
* __BREAKING__: The Filter class was split into `AbstractFilter` (base class for actual filter classes) and `Filter` only hosting the static function for easy instantiation, because otherwise each filter class also has all the static methods.
* __BREAKING__: Further, the signatures of some methods that are mainly here for internal usage, have changed due to the new DOM abstraction:
* The static `GetLink::isSpecialNonHttpLink()` method now needs an instance of `HtmlElement` instead of a Symfony `Crawler`.
* `GetUrlsFromSitemap::fixUrlSetTag()` now takes an `XmlDocument` instead of a Symfony `Crawler`.
* The `DomQuery::apply()` method now takes a `Node` instead of a Symfony `Crawler`.
### Deprecated
* `Dom::xPath()` method and
* the `XPathQuery` class as well as
* the new `Node::queryXPath()` method.
### Added
* New step output filter `Filter::arrayHasElement()`. When a step produces array output with a property being a numeric array, you can now filter outputs by checking if one element of that array property, matches certain filter criteria. Example: The outputs look like `['foo' => 'bar', 'baz' => ['one', 'two', 'three']]`. You can filter all outputs where `baz` contains `two` like: `Filter::arrayHasElement()->where('baz', Filter::equal('two'))`.
## [2.1.3] - 2024-11-05
### Fixed
* Improvements for deprecations in PHP 8.4.
## [2.1.2] - 2024-10-22
### Fixed
* Issue when converting cookie objects received from the chrome-php library.
## [2.1.1] - 2024-10-21
### Fixed
* Also add cookies, set during headless browser usage, to the cookie jar. When switching back to the (guzzle) HTTP client the cookies should also be sent.
* Don't call `Loader::afterLoad()` when `Loader::beforeLoad()` was not called before. This can potentially happen when an exception is thrown before the call to the `beforeLoad` hook, but it is caught and the `afterLoad` hook method is called anyway. As this most likely won't make sense to users, the `afterLoad` hook callback functions will just not be called in this case.
* The `Throttler` class now has protected methods `_internalTrackStartFor()`, `_requestToUrlWasStarted()` and `_internalTrackEndFor()`. When extending the `Throttler` class (be careful, actually that's not really recommended) they can be used to check if a request to a URL was actually started before.
## [2.1.0] - 2024-10-19
### Added
* The new `postBrowserNavigateHook()` method in the `Http` step classes, which allows you to define callback functions that are triggered after the headless browser navigated to the specified URL. They are called with the chrome-php `Page` object as argument, so you can interact with the page. Also, there is a new class `BrowserAction` providing some simple actions (like wait for element, click element,...) as Closures via static methods. You can use it like `Http::get()->postBrowserNavigateHook(BrowserAction::clickElement('#element'))`.
## [2.0.1] - 2024-10-15
### Fixed
* Issue with the `afterLoad` hook of the `HttpLoader`, introduced in v2. Calling the hook was commented out, which slipped through because the test case was faulty.
## [2.0.0] - 2024-10-15
### Changed
* __BREAKING__: Removed methods `BaseStep::addToResult()`, `BaseStep::addLaterToResult()`, `BaseStep::addsToOrCreatesResult()`, `BaseStep::createsResult()`, and `BaseStep::keepInputData()`. These methods were deprecated in v1.8.0 and should be replaced with `Step::keep()`, `Step::keepAs()`, `Step::keepFromInput()`, and `Step::keepInputAs()`.
* __BREAKING__: Added the following keep methods to the `StepInterface`: `StepInterface::keep()`, `StepInterface::keepAs()`, `StepInterface::keepFromInput()`, `StepInterface::keepInputAs()`, as well as `StepInterface::keepsAnything()`, `StepInterface::keepsAnythingFromInputData()` and `StepInterface::keepsAnythingFromOutputData()`. If you have a class that implements this interface without extending `Step` (or `BaseStep`), you will need to implement these methods yourself. However, it is strongly recommended to extend `Step` instead.
* __BREAKING__: With the removal of the `addToResult()` method, the library no longer uses `toArrayForAddToResult()` methods on output objects. Instead, please use `toArrayForResult()`. Consequently, `RespondedRequest::toArrayForAddToResult()` has been renamed to `RespondedRequest::toArrayForResult()`.
* __BREAKING__: Removed the `result` and `addLaterToResult` properties from `Io` objects (`Input` and `Output`). These properties were part of the `addToResult` feature and are now removed. Instead, use the `keep` property where kept data is added.
* __BREAKING__: The signature of the `Crawler::addStep()` method has changed. You can no longer provide a result key as the first parameter. Previously, this key was passed to the `Step::addToResult()` method internally. Now, please handle this call yourself.
* __BREAKING__: The return type of the `Crawler::loader()` method no longer allows `array`. This means it's no longer possible to provide multiple loaders from the crawler. Instead, use the new functionality to directly provide a custom loader to a step described below. As part of this change, the `UnknownLoaderKeyException` was also removed as it is now obsolete. If you have any references to this class, please make sure to remove them.
* __BREAKING__: Refactored the abstract `LoadingStep` class to a trait and removed the `LoadingStepInterface`. Loading steps should now extend the `Step` class and use the trait. As multiple loaders are no longer supported, the `addLoader` method was renamed to `setLoader`. Similarly, the methods `useLoader()` and `usesLoader()` for selecting loaders by key are removed. Now, you can directly provide a different loader to a single step using the trait's new `withLoader()` method (e.g., `Http::get()->withLoader($loader)`). The trait now also uses phpdoc template tags for a generic loader type. You can define the loader type by putting `/** @use LoadingStep<MyLoader> */` above `use LoadingStep;` in your step class. Then your IDE and static analysis (if supported) will know what type of loader the trait methods return and accept.
* __BREAKING__: Removed the `PaginatorInterface` to allow for better extensibility. The old `Crwlr\Crawler\Steps\Loading\Http\Paginators\AbstractPaginator` class has also been removed. Please use the newer, improved version `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator`. This newer version has also changed: the first argument `UriInterface $url` is removed from the `processLoaded()` method, as the URL also is part of the request (`Psr\Http\Message\RequestInterface`) which is now the first argument. Additionally, the default implementation of the `getNextRequest()` method is removed. Child implementations must define this method themselves. If your custom paginator still has a `getNextUrl()` method, note that it is no longer needed by the library and will not be called. The `getNextRequest()` method now fulfills its original purpose.
* __BREAKING__: Removed methods from `HttpLoader`:
* `$loader->setHeadlessBrowserOptions()` => use `$loader->browser()->setOptions()` instead
* `$loader->addHeadlessBrowserOptions()` => use `$loader->browser()->addOptions()` instead
* `$loader->setChromeExecutable()` => use `$loader->browser()->setExecutable()` instead
* `$loader->browserHelper()` => use `$loader->browser()` instead
* __BREAKING__: Removed method `RespondedRequest::cacheKeyFromRequest()`. Use `RequestKey::from()` instead.
* __BREAKING__: The `HttpLoader::retryCachedErrorResponses()` method now returns an instance of the new `Crwlr\Crawler\Loader\Http\Cache\RetryManager` class. This class provides the methods `only()` and `except()` to restrict retries to specific HTTP response status codes. Previously, this method returned the `HttpLoader` itself (`$this`), so if you're using it in a chain and calling other loader methods after it, you will need to refactor your code.
* __BREAKING__: Removed the `Microseconds` class from this package. It has been moved to the `crwlr/utils` package, which you can use instead.
### Added
* New methods `FileCache::prolong()` and `FileCache::prolongAll()` to allow prolonging the time to live for cached responses.
### Fixed
* The `maxOutputs()` method is now also available and working on `Group` steps.
* Improved warning messages for step validations that are happening before running a crawler.
* A `PreRunValidationException` when the crawler finds a problem with the setup, before actually running, is not only logged as an error via the logger, but also rethrown to the user. This way the user won't get the impression, that the crawler ran successfully without looking at the log messages.
## [1.10.0] - 2024-08-05
### Added
* URL refiners: `UrlRefiner::withScheme()`, `UrlRefiner::withHost()`, `UrlRefiner::withPort()`, `UrlRefiner::withoutPort()`, `UrlRefiner::withPath()`, `UrlRefiner::withQuery()`, `UrlRefiner::withoutQuery()`, `UrlRefiner::withFragment()` and `UrlRefiner::withoutFragment()`.
* New paginator stop rules `PaginatorStopRules::contains()` and `PaginatorStopRules::notContains()`.
* Static method `UserAgent::mozilla5CompatibleBrowser()` to get a `UserAgent` instance with the user agent string `Mozilla/5.0 (compatible)` and also the new method `withMozilla5CompatibleUserAgent` in the `AnonymousHttpCrawlerBuilder` that you can use like this: `HttpCrawler::make()->withMozilla5CompatibleUserAgent()`.
## [1.9.5] - 2024-07-25
### Fixed
* Prevent PHP warnings when an HTTP response includes a `Content-Type: application/x-gzip` header, but the content is not actually compressed. This issue also occurred with cached responses, because compressed content is decoded during caching. Upon retrieval from the cache, the header indicated compression, but the content was already decoded.
## [1.9.4] - 2024-07-24
### Fixed
* When using `HttpLoader::cacheOnlyWhereUrl()` to restrict caching, the filter rule is not only applied when adding newly loaded responses to the cache, but also for using cached responses. Example: a response for `https://www.example.com/foo` is already available in the cache, but `$loader->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'))` was called, the cached response is not used.
## [1.9.3] - 2024-07-05
### Fixed
* Add `HttpLoader::browser()` as a replacement for `HttpLoader::browserHelper()` and deprecate the `browserHelper()` method. It's an alias and just because it will read a little better: `$loader->browser()->xyz()` vs. `$loader->browserHelper()->xyz()`. `HttpLoader::browserHelper()` will be removed in v2.0.
* Also deprecate `HttpLoader::setHeadlessBrowserOptions()`, `HttpLoader::addHeadlessBrowserOptions()` and `HttpLoader::setChromeExecutable()`. Use `$loader->browser()->setOptions()`, `$loader->browser()->addOptions()` and `$loader->browser()->setExecutable()` instead.
## [1.9.2] - 2024-06-18
### Fixed
* Issue with setting the headless chrome executable, introduced in 1.9.0.
## [1.9.1] - 2024-06-17
### Added
* Also add `HeadlessBrowserLoaderHelper::getTimeout()` to get the currently configured timeout value.
## [1.9.0] - 2024-06-17
### Added
* New methods `HeadlessBrowserLoaderHelper::setTimeout()` and `HeadlessBrowserLoaderHelper::waitForNavigationEvent()` to allow defining the timeout for the headless chrome in milliseconds (default 30000 = 30 seconds) and the navigation event (`load` (default), `DOMContentLoaded`, `firstMeaningfulPaint`, `networkIdle`, etc.) to wait for when loading a URL.
## [1.8.0] - 2024-06-05
### Added
* New methods `Step::keep()` and `Step::keepAs()`, as well as `Step::keepFromInput()` and `Step::keepInputAs()`, as alternatives to `Step::addToResult()` (or `Step::addLaterToResult()`). The `keep()` method can be called without any argument, to keep all from the output data. It can be called with a string, to keep a certain key or with an array to keep a list of keys. If the step yields scalar value outputs (not an associative array or object with keys) you need to use the `keepAs()` method with the key you want the output value to have in the kept data. The methods `keepFromInput()` and `keepInputAs()` work the same, but use the input (not the output) that the step receives. Most likely only needed with a first step, to keep data from initial inputs (or in a sub crawler, see below). Kept properties can also be accessed with the `Step::useInputKey()` method, so you can easily reuse properties from multiple steps ago as input.
* New method `Step::outputType()` with default implementation returning `StepOutputType::Mixed`. Please consider implementing this method yourself in all your custom steps, because it is going to be required in v2 of the library. It allows detecting (potential) problems in crawling procedures immediately when starting a run instead of failing after already running a while.
* New method `Step::subCrawlerFor()`, allowing to fill output properties from an actual full child crawling procedure. As the first argument, you give it a key from the step's output, that the child crawler uses as input(s). As the second argument you need to provide a `Closure` that receives a clone of the current `Crawler` without steps and with initial inputs, set from the current output. In the `Closure` you then define the crawling procedure by adding steps as you're used to do it, and return it. This allows to achieve nested output data, scraped from different (sub-)pages, in a more flexible and less complicated way than with the usual linear crawling procedure and `Step::addToResult()`.
### Deprecated
* The `Step::addToResult()`, `Step::addLaterToResult()` and `Step::keepInputData()` methods. Instead, please use the new keep methods. This can cause some migration work for v2, because especially the add to result methods are a pretty central functionality, but the new "keep" methodology (plus the new sub crawler feature) will make a lot of things easier, less complex and the library will most likely work more efficiently in v2.
### Fixed
* When a cache file was generated with compression, and you're trying to read it with a `FileCache` instance without compression enabled, it also works. When unserializing the file content fails it tries decoding the string first before unserializing it.
## [1.7.2] - 2024-03-19
### Fixed
* When the `useInputKey()` method is used on a step and the defined key does not exist in input, it logs a warning and does not invoke the step instead of throwing an `Exception`.
## [1.7.1] - 2024-03-11
### Fixed
* A PHP error that happened when the loader returns `null` for the initial request in the `Http::crawl()` step.
## [1.7.0] - 2024-03-04
### Added
* Allow getting the whole decoded JSON as array with the new `Json::all()` and also allow to get the whole decoded JSON, when using `Json::get()`, inside a mapping using either empty string or `*` as target. Example: `Json::get(['all' => '*'])`. `*` only works, when there is no key `*` in the decoded data.
### Fixed
* Make it work with responses loaded by a headless browser. If decoding the input string fails, it now checks if it could be HTML. If that's the case, it extracts the text content of the `<body>` and tries to decode this instead.
## [1.6.2] - 2024-02-26
### Fixed
* When using `HttpLoader::cacheOnlyWhereUrl()` and a request was redirected (maybe even multiple times), previously all URLs in the chain had to match the filter rule. As this isn't really practicable, now only one of the URLs has to match the rule.
## [1.6.1] - 2024-02-16
### Changed
* Make method `HttpLoader::addToCache()` public, so steps can update a cached response with an extended version.
## [1.6.0] - 2024-02-13
### Added
* Enable dot notation in `Step::addToResult()`, so you can get data from nested output, like: `$step->addToResult(['url' => 'response.url', 'status' => 'response.status', 'foo' => 'bar'])`.
* When a step adds output properties to the result, and the output contains objects, it tries to serialize those objects to arrays, by calling `__serialize()`. If you want an object to be serialized differently for that purpose, you can define a `toArrayForAddToResult()` method in that class. When that method exists, it's preferred to the `__serialize()` method.
* Implemented above-mentioned `toArrayForAddToResult()` method in the `RespondedRequest` class, so on every step that somehow yields a `RespondedRequest` object, you can use the keys `url`, `uri`, `status`, `headers` and `body` with the `addToResult()` method. Previously this only worked for `Http` steps, because it defines output key aliases (`HttpBase::outputKeyAliases()`). Now, in combination with the ability to use dot notation when adding data to the result, if your custom step returns nested output like `['response' => RespondedRequest, 'foo' => 'bar']`, you can add response data to the result like this `$step->addToResult(['url' => 'response.url', 'body' => 'response.body'])`.
### Fixed
* Improvement regarding the timing when a store (`Store` class instance) is called by the crawler with a final crawling result. When a crawling step initiates a crawling result (so, `addToResult()` was called on the step instance), the crawler has to wait for all child outputs (resulting from one step-input) until it calls the store, because the child outputs can all add data to the same final result object. But previously this was not only the case for all child outputs starting from a step where `addToResult()` was called, but all children of one initial crawler input. So with this change, in a lot of cases, the store will earlier be called with finished `Result` objects and memory usage will be lowered.
## [1.5.3] - 2024-02-07
### Fixed
* Merge `HttpBaseLoader` back to `HttpLoader`. It's probably not a good idea to have multiple loaders. At least not multiple loaders just for HTTP. It should be enough to publicly expose the `HeadlessBrowserLoaderHelper` via `HttpLoader::browserHelper()` for the extension steps. But keep the `HttpBase` step, to share the general HTTP functionality implemented there.
## [1.5.2] - 2024-02-07
### Fixed
* Issue in `GetUrlsFromSitemap` (`Sitemap::getUrlsFromSitemap()`) step when XML content has no line breaks.
## [1.5.1] - 2024-02-06
### Fixed
* For being more flexible to build a separate headless browser loader (in an extension package) extract the most basic HTTP loader functionality to a new `HttpBaseLoader` and important functionality for the headless browser loader to a new `HeadlessBrowserLoaderHelper`. Further, also share functionality from the `Http` steps via a new abstract `HttpBase` step. It's considered a fix, because there's no new functionality, just refactoring existing code for better extendability.
## [1.5.0] - 2024-01-29
### Added
* The `DomQuery` class (parent of `CssSelector` (`Dom::cssSelector`) and `XPathQuery` (`Dom::xPath`)) has a new method `formattedText()` that uses the new crwlr/html-2-text package to convert the HTML to formatted plain text. You can also provide a customized instance of the `Html2Text` class to the `formattedText()` method.
### Fixed
* The `Http::crawl()` step won't yield a page again if a newly found URL responds with a redirect to a previously loaded URL.
## [1.4.0] - 2024-01-14
### Added
* The `QueryParamsPaginator` can now also increase and decrease non first level query param values like `foo[bar][baz]=5` using dot notation: `QueryParamsPaginator::paramsInUrl()->increaseUsingDotNotation('foo.bar.baz', 5)`.
## [1.3.5] - 2023-12-20
### Fixed
* The `FileCache` can now also read uncompressed cache files when compression is activated.
## [1.3.4] - 2023-12-19
### Fixed
* Reset paginator state after finishing paginating for one base input, to enable paginating multiple listings of the same structure.
## [1.3.3] - 2023-12-01
### Fixed
* Add forgotten getter method to get the DOM query that is attached to an `InvalidDomQueryException` instance.
## [1.3.2] - 2023-12-01
### Fixed
* When creating a `CssSelector` or `XPathQuery` instance with invalid selector/query syntax, an `InvalidDomQueryException` is now immediately thrown. This change is considered to be not only non-breaking, but actually a fix, because the `CssSelector` would otherwise throw an exception later when the `apply()` method is called. The `XPathQuery` would silently return no result without notifying you of the invalid query and generate a PHP warning.
## [1.3.1] - 2023-11-30
### Fixed
* Support usage with the new Symfony major version v7.
## [1.3.0] - 2023-10-28
### Added
* New methods `HttpLoader::useProxy()` and `HttpLoader::useRotatingProxies([...])` to define proxies that the loader shall use. They can be used with a guzzle HTTP client instance (default) and when the loader uses the headless Chrome browser. Using them when providing some other PSR-18 implementation will throw an exception.
* New `QueryParamsPaginator` to paginate by increasing and/or decreasing one or multiple query params, either in the URL or in the body of requests. Can be created via static method `Crwlr\Crawler\Steps\Loading\Http\Paginator::queryParams()`.
* New method `stopWhen` in the new `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator` class (for more info see the deprecation below). You can pass implementations of the new `StopRule` interface or custom closures to that method and then, every time the Paginator receives a loaded response to process, those stop rules are called with the response. If any of the conditions of the stop rules is met, the Paginator stops paginating. Of course also added a few stop rules to use with that new method: `IsEmptyInHtml`, `IsEmptyInJson`, `IsEmptyInXml` and `IsEmptyResponse`, also available via static methods: `PaginatorStopRules::isEmptyInHtml()`, `PaginatorStopRules::isEmptyInJson()`, `PaginatorStopRules::isEmptyInXml()` and `PaginatorStopRules::isEmptyResponse()`.
### Deprecated
* Deprecated the `Crwlr\Crawler\Steps\Loading\Http\PaginatorInterface` and the `Crwlr\Crawler\Steps\Loading\Http\Paginators\AbstractPaginator`. Instead, added a new version of the `AbstractPaginator` as `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator` that can be used. Usually there shouldn't be a problem switching from the old to the new version. If you want to make your custom paginator implementation ready for v2 of the library, extend the new `AbstractPaginator` class, implement your own `getNextRequest` method (new requirement, with a default implementation in the abstract class, which will be removed in v2) and check if properties and methods of your existing class don't collide with the new properties and methods in the abstract class.
### Fixed
* The `HttpLoader::load()` implementation won't throw any exception, because it shouldn't kill a crawler run. When you want any loading error to end the whole crawler execution `HttpLoader::loadOrFail()` should be used. Also adapted the phpdoc in the `LoaderInterface`.
## [1.2.2] - 2023-09-19
### Fixed
* Fix in `HttpCrawl` (`Http::crawl()`) step: when a page contains a broken link, that can't be resolved and throws an `Exception` from the URL library, ignore the link and log a warning message.
* Minor fix for merging HTTP headers when an `Http` step gets both, statically defined headers and headers to use from array input.
## [1.2.1] - 2023-08-21
### Fixed
* When a URL redirects, the `trackRequestEndFor()` method of the `HttpLoader`'s `Throttler` instance is called only once at the end and with the original request URL.
## [1.2.0] - 2023-08-18
### Added
* New `onCacheHit` hook in the `Loader` class (in addition to `beforeLoad`, `onSuccess`, `onError` and `afterLoad`) that is called in the `HttpLoader` class when a response for a request was found in the cache.
### Deprecated
* Moved the `Microseconds` value object class to the crwlr/utils package, as it is a very useful and universal tool. The class in this package still exists, but just extends the class from the utils package and will be removed in v2. So, if you're using this class, please change to use the version from the utils package.
## [1.1.6] - 2023-07-20
### Fixed
* Throttling now also works when using the headless browser.
## [1.1.5] - 2023-07-14
### Fixed
* The `Http::crawl()` step, as well as the `Html::getLink()` and `Html::getLinks()` steps now ignore links, when the `href` attribute starts with `mailto:`, `tel:` or `javascript:`. For the crawl step it obviously makes no sense, but it's also considered a bugfix for the getLink(s) steps, because they are meant to deliver absolute HTTP URLs. If you want to get the values of such links, use the HTML data extraction step.
## [1.1.4] - 2023-07-14
### Fixed
* The `Http::crawl()` step now also works with sitemaps as input URL, where the `<urlset>` tag contains attributes that would cause the symfony DomCrawler to not find any elements.
## [1.1.3] - 2023-06-29
### Fixed
* Improved `Json` step: if the target of the "each" (like `Json::each('target', [...])`) does not exist in the input JSON data, the step yields nothing and logs a warning.
## [1.1.2] - 2023-05-28
### Fixed
* Using the `only()` method of the `MetaData` (`Html::metaData()`) step class, the `title` property was always contained in the output, even if not listed in the `only` properties. This is fixed now.
## [1.1.1] - 2023-05-28
### Fixed
* There was an issue when adding multiple associative arrays with the same key to a `Result` object: let's say you're having a step producing array output like: `['bar' => 'something', 'baz' => 'something else']` and it (the whole array) shall be added to the result property `foo`. When the step produced multiple such array outputs, that led to a result like `['bar' => '...', 'baz' => '...', ['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...']]`. Now it's fixed to result in `[['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...']]`.
## [1.1.0] - 2023-05-21
### Added
* `Http` steps can now receive body and headers from input data (instead of statically defining them via argument like `Http::method(headers: ...)`) using the new methods `useInputKeyAsBody()` and `useInputKeyAsHeader(<key>, <asHeader>)` or `useInputKeyAsHeaders()`. Further, when invoked with associative array input data, the step will by default use the value from `url` or `uri` for the request URL. If the input array contains the URL in a key with a different name, you can use the new `useInputKeyAsUrl()` method. That was basically already possible with the existing `useInputKey()` method, because the URL is the main input argument for the step. But if you want to use it in combination with the other new `useInputKeyAsXyz()` methods, you have to use `useInputKeyAsUrl()`, because using `useInputKey()` would invoke the whole step with that key only.
* `Crawler::runAndDump()` as a simple way to just run a crawler and dump all results, each as an array.
* `addToResult()` now also works with serializable objects.
* If you know certain keys that the output of a step will contain, you can now also define aliases for those keys, to be used with `addToResult()`. The output of an `Http` step (`RespondedRequest`) contains the keys `requestUri` and `effectiveUri`. The aliases `url` and `uri` refer to `effectiveUri`, so `addToResult(['url'])` will add the `effectiveUri` as `url` to the result object.
* The `GetLink` (`Html::getLink()`) and `GetLinks` (`Html::getLinks()`) steps, as well as the abstract `DomQuery` (parent of `CssSelector` (/`Dom::cssSelector`) and `XPathQuery` (/`Dom::xPath`)) now have a method `withoutFragment()` to get links respectively URLs without their fragment part.
* The `HttpCrawl` step (`Http::crawl()`) has a new method `useCanonicalLinks()`. If you call it, the step will not yield responses if its canonical link URL was already yielded. And if it discovers a link, and some document pointing to that URL via canonical link was already loaded, it treats it as if it was already loaded. Further this feature also sets the canonical link URL as the `effectiveUri` of the response.
* All filters can now be negated by calling the `negate()` method, so the `evaluate()` method will return the opposite bool value when called. The `negate()` method returns an instance of `NegatedFilter` that wraps the original filter.
* New method `cacheOnlyWhereUrl()` in the `HttpLoader` class, that takes an instance of the `FilterInterface` as argument. If you define one or multiple filters using this method, the loader will cache only responses for URLs that match all the filters.
### Fixed
* The `HttpCrawl` step (`Http::crawl()`) by default now removes the fragment part of URLs to not load the same page multiple times, because in almost any case, servers won't respond with different content based on the fragment. That's why this change is considered non-breaking. For the rare cases when servers respond with different content based on the fragment, you can call the new `keepUrlFragment()` method of the step.
* Although the `HttpCrawl` step (`Http::crawl()`) already respected the limit of outputs defined via the `maxOutputs()` method, it actually didn't stop loading pages. The limit had no effect on loading, only on passing on outputs (responses) to the next step. This is fixed in this version.
* A so-called byte order mark at the beginning of a file (/string) can cause issues. So just remove it, when a step's input string starts with a UTF-8 BOM.
* There seems to be an issue in guzzle when it gets a PSR-7 request object with a header with multiple string values (as array, like: `['accept-encoding' => ['gzip', 'deflate', 'br']]`). When testing it happened that it only sent the last part (in this case `br`). Therefore, the `HttpLoader` now prepares headers before sending (in this case to: `['accept-encoding' => ['gzip, deflate, br']]`).
* You can now also use the output key aliases when filtering step outputs. You can even use keys that are only present in the serialized version of an output object.
## [1.0.2] - 2023-03-20
### Fixed
* JSON step: another fix for JSON strings having keys without quotes with empty string value.
## [1.0.1] - 2023-03-17
### Fixed
* JSON step: improve attempt to fix JSON string having keys without quotes.
## [1.0.0] - 2023-02-08
### Added
* New method `Step::refineOutput()` to manually refine step output values. It takes either a `Closure` or an instance of the new `RefinerInterface` as argument. If the step produces array output, you can provide a key from the array output, to refine, as first argument and the refiner as second argument. You can call the method multiple times and all the refiners will be applied to the outputs in the order you add them. If you want to refine multiple output array keys with a `Closure`, you can skip providing a key and the `Closure` will receive the full output array for refinement. As mentioned you can provide an instance of the `RefinerInterface`. There are already a few implementations: `StringRefiner::afterFirst()`, `StringRefiner::afterLast()`, `StringRefiner::beforeFirst()`, `StringRefiner::beforeLast()`, `StringRefiner::betweenFirst()`, `StringRefiner::betweenLast()` and `StringRefiner::replace()`.
* New method `Step::excludeFromGroupOutput()` to exclude a normal step's output from the combined output of a group that it's part of.
* New method `HttpLoader::setMaxRedirects()` to customize the limit of redirects to follow. Works only when using the HTTP client.
* New filters to filter by string length, with the same options as the comparison filters (equal, not equal, greater than,...).
* New `Filter::custom()` that you can use with a Closure, so you're not limited to the available filters only.
* New method `DomQuery::link()` as a shortcut for `DomQuery::attribute('href')->toAbsoluteUrl()`.
* New static method `HttpCrawler::make()` returning an instance of the new class `AnonymousHttpCrawlerBuilder`. This makes it possible to create your own Crawler instance with a one-liner like: `HttpCrawler::make()->withBotUserAgent('MyCrawler')`. There's also a `withUserAgent()` method to create an instance with a normal (non bot) user agent.
### Changed
* __BREAKING__: The `FileCache` now also respects the `ttl` (time to live) argument and by default it is one hour (3600 seconds). If you're using the cache and expect the items to live (basically) forever, please provide a high enough value for the default time to live. When you try to get a cache item that is already expired, it (the file) is immediately deleted.
* __BREAKING__: The `TooManyRequestsHandler` (and with that also the constructor argument in the `HttpLoader`) was renamed to `RetryErrorResponseHandler`. It now reacts the same to 503 (Service Unavailable) responses as to the 429 (Too Many Requests) responses. If you're actively passing your own instance to the `HttpLoader`, you need to update it.
* You can now have multiple different loaders in a `Crawler`. To use this, return an array containing your loaders from the protected `Crawler::loader()` method with keys to name them. You can then selectively use them by calling the `Step::useLoader()` method on a loading step with the key of the loader it should use.
### Removed
* __BREAKING__: The loop feature. The only real world use case should be paginating listings and this should be solved with the Paginator feature.
* __BREAKING__: `Step::dontCascade()` and `Step::cascades()` because with the change in v0.7, that groups can only produce combined output, there should be no use case for this anymore. If you want to exclude one step's output from the combined group output, you can use the new `Step::excludeFromGroupOutput()` method.
## [0.7.0] - 2023-01-13
### Added
* New functionality to paginate: There is the new `Paginate` child class of the `Http` step class (easy access via `Http::get()->paginate()`). It takes an instance of the `PaginatorInterface` and uses it to iterate through pagination links. There is one implementation of that interface, the `SimpleWebsitePaginator`. The `Http::get()->paginate()` method uses it by default, when called just with a CSS selector to get pagination links. Paginators receive all loaded pages and implement the logic to find pagination links. The paginator class is also called before sending a request, with the request object that is about to be sent as an argument (`prepareRequest()`). This way, it should even be doable to implement more complex pagination functionality. For example when pagination is built using POST request with query strings in the request body.
* New methods `stopOnErrorResponse()` and `yieldErrorResponses()` that can be used with `Http` steps. By calling `stopOnErrorResponse()` the step will throw a `LoadingException` when a response has a 4xx or 5xx status code. By calling `yieldErrorResponses()`, even error responses will be yielded and passed on to the next steps (this was the default behaviour until this version; see the breaking change below).
* The body of HTTP responses with a `Content-Type` header containing `application/x-gzip` are automatically decoded when `Http::getBodyString()` is used. Therefore, added `ext-zlib` to suggested in `composer.json`.
* New methods `addToResult()` and `addLaterToResult()`. `addToResult()` is a single replacement for `setResultKey()` and `addKeysToResult()` (they are removed, see `Changed` below) that can be used for array and non array output. `addLaterToResult()` is a new method that does not create a Result object immediately, but instead adds the output of the current step to all the Results that will later be created originating from the current output.
* New methods `outputKey()` and `keepInputData()` that can be used with any step. Using the `outputKey()` method, the step will convert non array output to an array and use the key provided as an argument to this method as array key for the output value. The `keepInputData()` method allows you to forward data from the step's input to the output. If the input is non array you can define a key using the method's argument. This is useful e.g. if you're having data in the initial inputs that you also want to add to the final crawling results.
* New method `createsResult()` that can be used with any step, so you can differentiate if a step creates a Result object, or just keeps data to add to results later (new `addLaterToResult()` method). But primarily relevant for library internal use.
* The `FileCache` class can compress the cache data now to save disk space. Use the `useCompression()` method to do so.
* New method `retryCachedErrorResponses()` in `HttpLoader`. When called, the loader will only use successful responses (status code < 400) from the cache and therefore retry already cached error responses.
* New method `writeOnlyCache()` in `HttpLoader` to only write to, but don't read from the response cache. Can be used to renew cached responses.
* `Filter::urlPathMatches()` to filter URL paths using a regex.
* Option to provide a chrome executable name to the `chrome-php/chrome` library via `HttpLoader::setChromeExecutable()`.
### Changed
* __BREAKING__: Group steps can now only produce combined outputs, as previously done when `combineToSingleOutput()` method was called. The method is removed.
* __BREAKING__: `setResultKey()` and `addKeysToResult()` are removed. Calls to those methods can both be replaced with calls to the new `addToResult()` method.
* __BREAKING__: `getResultKey()` is also removed with `setResultKey()`. It's removed without replacement, as it doesn't really make sense any longer.
* __BREAKING__: Error responses (4xx as well as 5xx), by default, won't produce any step outputs any longer. If you want to receive error responses, use the new `yieldErrorResponses()` method.
* __BREAKING__: Removed the `httpClient()` method in the `HttpCrawler` class. If you want to provide your own HTTP client, implement a custom `loader` method passing your client to the `HttpLoader` instead.
* __Deprecated__ the loop feature (class `Loop` and `Crawler::loop()` method). Probably the only use case is iterating over paginated list pages, which can be done using the new Paginator functionality. It will be removed in v1.0.
* In case of a 429 (Too Many Requests) response, the `HttpLoader` now automatically waits and retries. By default, it retries twice and waits 10 seconds for the first retry and a minute for the second one. In case the response also contains a `Retry-After` header with a value in seconds, it complies to that. Exception: by default it waits at max `60` seconds (you can set your own limit if you want), if the `Retry-After` value is higher, it will stop crawling. If all the retries also receive a `429` it also throws an Exception.
* Removed logger from `Throttler` as it doesn't log anything.
* Fail silently when `robots.txt` can't be parsed.
* Default timeout configuration for the default guzzle HTTP client: `connect_timeout` is `10` seconds and `timeout` is `60` seconds.
* The `validateAndSanitize...()` methods in the abstract `Step` class, when called with an array with one single element, automatically try to use that array element as input value.
* With the `Html` and `Xml` data extraction steps you can now add layers to the data that is being extracted, by just adding further `Html`/`Xml` data extraction steps as values in the mapping array that you pass as argument to the `extract()` method.
* The base `Http` step can now also be called with an array of URLs as a single input. Crawl and Paginate steps still require a single URL input.
### Fixed
* The `CookieJar` now also works with `localhost` or other hosts without a registered domain name.
* Improve the `Sitemap::getUrlsFromSitemap()` step to also work when the `<urlset>` tag contains attributes that would cause the symfony DomCrawler to not find any elements.
* Fixed possibility of infinite redirects in `HttpLoader` by adding a redirects limit of 10.
## [0.6.0] - 2022-10-03
### Added
* New step `Http::crawl()` (class `HttpCrawl` extending the normal `Http` step class) for conventional crawling. It loads all pages of a website (same host or domain) by following links. There's also a lot of options like depth, filtering by paths, and so on.
* New steps `Sitemap::getSitemapsFromRobotsTxt()` (`GetSitemapsFromRobotsTxt`) and `Sitemap::getUrlsFromSitemap()` (`GetUrlsFromSitemap`) to get sitemap (URLs) from a robots.txt file and to get all the URLs from those sitemaps.
* New step `Html::metaData()` to get data from meta tags (and title tag) in HTML documents.
* New step `Html::schemaOrg()` (`SchemaOrg`) to get schema.org structured data in JSON-LD format from HTML documents.
* The abstract `DomQuery` class (parent of the `CssSelector` and `XPathQuery` classes) now has some methods to narrow the selected matches further: `first()`, `last()`, `nth(n)`, `even()`, `odd()`.
### Changed
* __BREAKING__: Removed `PoliteHttpLoader` and traits `WaitPolitely` and `CheckRobotsTxt`. Converted the traits to classes `Throttler` and `RobotsTxtHandler` which are dependencies of the `HttpLoader`. The `HttpLoader` internally gets default instances of those classes. The `RobotsTxtHandler` will respect robots.txt rules by default if you use a `BotUserAgent` and it won't if you use a normal `UserAgent`. You can access the loader's `RobotsTxtHandler` via `HttpLoader::robotsTxt()`. You can pass your own instance of the `Throttler` to the loader and also access it via `HttpLoader::throttle()` to change settings.
### Fixed
* Getting absolute links via the `GetLink` and `GetLinks` steps and the `toAbsoluteUrl()` method of the `CssSelector` and `XPathQuery` classes, now also looks for `<base>` tags in HTML when resolving the URLs.
* The `SimpleCsvFileStore` can now also save results with nested data (but only second level). It just concatenates the values separated with a ` | `.
## [0.5.0] - 2022-09-03
### Added
* You can now call the new `useHeadlessBrowser` method on the `HttpLoader` class to use a headless Chrome browser to load pages. This is enough to get HTML after executing javascript in the browser. For more sophisticated tasks a separate Loader and/or Steps should better be created.
* With the `maxOutputs()` method of the abstract `Step` class you can now limit how many outputs a certain step should yield at max. That's for example helpful during development, when you want to run the crawler only with a small subset of the data/requests it will actually have to process when you eventually remove the limits. When a step has reached its limit, it won't even call the `invoke()` method any longer until the step is reset after a run.
* With the new `outputHook()` method of the abstract `Crawler` class you can set a closure that'll receive all the outputs from all the steps. Should be only for debugging reasons.
* The `extract()` method of the `Html` and `Xml` (children of `Dom`) steps now also works with a single selector instead of an array with a mapping. Sometimes you'll want to just get a simple string output e.g. for a next step, instead of an array with mapped extracted data.
* In addition to `uniqueOutputs()` there is now also `uniqueInputs()`. It works exactly the same as `uniqueOutputs()`, filtering duplicate input values instead. Optionally also by a key when expected input is an array or an object.
* In order to be able to also get absolute links when using the `extract()` method of Dom steps, the abstract `DomQuery` class now has a method `toAbsoluteUrl()`. The Dom step will automatically provide the `DomQuery` instance with the base url, presumed that the input was an instance of the `RespondedRequest` class and resolve the selected value against that base url.
### Changed
* Remove some not so important log messages.
* Improve behavior of group step's `combineToSingleOutput()`. When steps yield multiple outputs, don't combine all yielded outputs to one. Instead, combine the first output from the first step with the first output from the second step, and so on.
* When results are not explicitly composed, but the outputs of the last step are arrays with string keys, it sets those keys on the Result object instead of setting a key `unnamed` with the whole array as value.
### Fixed
* The static methods `Html::getLink()` and `Html::getLinks()` now also work without argument, like the `GetLink` and `GetLinks` classes.
* When a `DomQuery` (CSS selector or XPath query) doesn't match anything, its `apply()` method now returns `null` (instead of an empty string). When the `Html(/Xml)::extract()` method is used with a single, not matching selector/query, nothing is yielded. When it's used with an array with a mapping, it yields an array with null values. If the selector for one of the methods `Html(/Xml)::each()`, `Html(/Xml)::first()` or `Html(/Xml)::last()` doesn't match anything, that's not causing an error any longer, it just won't yield anything.
* Removed the (unnecessary) second argument from the `Loop::withInput()` method because when `keepLoopingWithoutOutput()` is called and `withInput()` is called after that call, it resets the behavior.
* Issue when date format for expires date in cookie doesn't have dashes in `d-M-Y` (so `d M Y`).
## [0.4.1] - 2022-05-10
### Fixed
* The `Json` step now also works with Http responses as input.
## [0.4.0] - 2022-05-06
### Added
* The `BaseStep` class now has `where()` and `orWhere()` methods to filter step outputs. You can set multiple filters that will be applied to all outputs. When setting a filter using `orWhere` it's linked to the previously added Filter with "OR". Outputs not matching one of the filters, are not yielded. The available filters can be accessed through static methods on the new `Filter` class. Currently available filters are comparison filters (equal, greater/less than,...), a few string filters (contains, starts/ends with) and url filters (scheme, domain, host,...).
* The `GetLink` and `GetLinks` steps now have methods `onSameDomain()`, `notOnSameDomain()`, `onDomain()`, `onSameHost()`, `notOnSameHost()`, `onHost()` to restrict which links to find.
* Automatically add the crawler's logger to the `Store` so you can also log messages from there. This can be breaking as the `StoreInterface` now also requires the `addLogger` method. The new abstract `Store` class already implements it, so you can just extend it.
### Changed
* The `Csv` step can now also be used without defining a column mapping. In that case it will use the values from the first line (so this makes sense when there are column headlines) as output array keys.
## [0.3.0] - 2022-04-27
### Added
* By calling `monitorMemoryUsage()` you can tell the Crawler to add log messages with the current memory usage after every step invocation. You can also set a limit in bytes when to start monitoring and below the limit it won't log memory usage.
### Fixed
* Previously the __use of Generators__ actually didn't make a lot of sense, because the outputs of one step were only iterated and passed on to the next step, after the current step was invoked with all its inputs. That makes steps with a lot of inputs bottlenecks and causes bigger memory consumption. So, changed the crawler to immediately pass on outputs of one step to the next step if there is one.
## [0.2.0] - 2022-04-25
### Added
* `uniqueOutputs()` method to Steps to get only unique output values. If outputs are array or object, you can provide a key that will be used as identifier to check for uniqueness. Otherwise, the arrays or objects will be serialized for comparison which will probably be slower.
* `runAndTraverse()` method to Crawler, so you don't need to manually traverse the Generator, if you don't need the results where you're calling the crawler.
* Implement the behaviour for when a `Group` step should add something to the Result using `setResultKey()` or `addKeysToResult()`, which was still missing. For groups this will only work when using `combineToSingleOutput`.
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to this Package
That you're reading this must mean you consider contributing to
this package. So first off: Awesome! 👍🤘
## Bugs
In case you encounter any bugs please
[file an issue](https://github.com/crwlrsoft/crawler/issues/new).
Describe the issue as well as you can and provide an example to
reproduce it.
Maybe you're not 100 percent sure whether what you've discovered
is a bug or the intended behavior. You can still file an issue
and tell us which results you'd expect.
If you know how to fix the issue you're welcome to send a pull
request. 💪
## New Features
If you have ideas for new features you can tell us about it on
[Twitter](https://twitter.com/crwlrsoft) or via
[crwlr.software](https://www.crwlr.software/contact) or just
send a pull request. Please keep in mind that there is no
guarantee that your feature will be merged.
## Conventions
### Coding Style
This package follows the
[PSR-12](https://www.php-fig.org/psr/psr-12/) coding standard.
You can run PHP CS Fixer via `composer cs` for a dry run or
`composer cs-fix` to automatically fix code style issues.
### Code quality tools
When you're making changes to this package please always run
tests and linting. Commands:
`composer test`
`composer test-integration`
`composer cs`
`composer stan`
Ideally you add the pre-commit git hook that is shipped with
this repo that will run tests and linting. Add it to your local
clone by running:
`composer add-git-hooks`
The integration tests start a simple PHP web server for the
testing purpose on port 8000. If you have anything else running
on that port, the integration tests won't work.
Also, please don't forget to add new test cases if necessary.
### Documentation
For any code change that changes/adds something for users of
the package, please don't forget to add an entry to the
`CHANGELOG.md` file.
## Appreciation
When your pull request is merged I will show some love and tweet
about it. Also, if you meet me in person I will be glad to buy you
a beer.
================================================
FILE: LICENSE
================================================
Copyright (c) 2026 Christian Olear
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject
to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: README.md
================================================
# Library for Rapid (Web) Crawler and Scraper Development
This library provides kind of a framework and a lot of ready to use, so-called __steps__, that you can use as building blocks, to build your own crawlers and scrapers with.
To give you an overview, here's a list of things that it helps you with:
* [Crawler __Politeness__](https://www.crwlr.software/packages/crawler/the-crawler/politeness) 😇 (respecting robots.txt, throttling,...)
* Load URLs using
* [a __(PSR-18) HTTP client__](https://www.crwlr.software/packages/crawler/the-crawler/loaders) (default is of course Guzzle)
* or a [__headless browser__](https://www.crwlr.software/packages/crawler/the-crawler/loaders#using-a-headless-browser) (chrome) to get source after Javascript execution
* [Get __absolute links__ from HTML documents](https://www.crwlr.software/packages/crawler/included-steps/html#html-get-link) 🔗
* [Get __sitemaps__ from robots.txt and get all URLs from those sitemaps](https://www.crwlr.software/packages/crawler/included-steps/sitemap)
* [__Crawl__ (load) all pages of a website](https://www.crwlr.software/packages/crawler/included-steps/http#crawling) 🕷
* [Use __cookies__ (or don't)](https://www.crwlr.software/packages/crawler/the-crawler/loaders#http-loader) 🍪
* [Use any __HTTP methods__ (GET, POST,...) and send any headers or body](https://www.crwlr.software/packages/crawler/included-steps/http#http-requests)
* [Easily iterate over __paginated__ list pages](https://www.crwlr.software/packages/crawler/included-steps/http#paginating) 🔁
* Extract data from:
* [__HTML__](https://www.crwlr.software/packages/crawler/included-steps/html#extracting-data) and also [__XML__](https://www.crwlr.software/packages/crawler/included-steps/xml) (using CSS selectors or XPath queries)
* [__JSON__](https://www.crwlr.software/packages/crawler/included-steps/json) (using dot notation)
* [__CSV__](https://www.crwlr.software/packages/crawler/included-steps/csv) (map columns)
* [Extract __schema.org__ structured data](https://www.crwlr.software/packages/crawler/included-steps/html#schema-org) in __JSON-LD__ format from HTML documents
* [Keep memory usage low](https://www.crwlr.software/packages/crawler/crawling-procedure#memory-usage) by using PHP __Generators__ 💪
* [__Cache__ HTTP responses](https://www.crwlr.software/packages/crawler/response-cache) during development, so you don't have to load pages again and again after every code change
* [Get __logs__](https://www.crwlr.software/packages/crawler/the-crawler#loggers) about what your crawler is doing (accepts any PSR-3 LoggerInterface)
* And a lot more...
## Documentation
You can find the documentation at [crwlr.software](https://www.crwlr.software/packages/crawler/getting-started).
## Contributing
If you consider contributing something to this package, read the [contribution guide (CONTRIBUTING.md)](CONTRIBUTING.md).
================================================
FILE: bin/add-git-hooks
================================================
#!/usr/bin/env php
0) {
$lastLine = array_pop($output);
if (trim($lastLine) !== '') {
printLine($lastLine);
return;
}
}
}
/**
 * Prints a single line to stdout, terminated with the platform's line ending.
 */
function printLine(string $string): void
{
    echo $string . PHP_EOL;
}
/**
 * Prints several lines to stdout, joined and terminated with the platform's line ending.
 *
 * @param string[] $lines
 */
function printLines(array $lines): void
{
    echo implode(PHP_EOL, $lines) . PHP_EOL;
}
/**
 * Prints an empty line to stdout.
 */
function printBlankLine(): void
{
    printLine('');
}
/**
 * Wraps the given text in the ANSI escape sequence for red ("0;31").
 */
function red(string $string): string
{
    $redCode = '0;31';

    return color($redCode, $string);
}
/**
 * Wraps the given text in the ANSI escape sequence for green ("0;32").
 */
function green(string $string): string
{
    $greenCode = '0;32';

    return color($greenCode, $string);
}
/**
 * Wraps the given text in the ANSI escape sequence for blue ("0;34").
 */
function blue(string $string): string
{
    $blueCode = '0;34';

    return color($blueCode, $string);
}
/**
 * Surrounds a string with an ANSI color escape sequence and the reset sequence.
 *
 * @param string $colorCode the SGR code to use, e.g. "0;31"
 * @param string $string    the text to colorize
 */
function color(string $colorCode, string $string): string
{
    return sprintf("\e[%sm%s\e[0m", $colorCode, $string);
}
================================================
FILE: phpstan.neon
================================================
parameters:
level: 8
paths:
- src
- tests
excludePaths:
analyse:
- tests/_Integration/_Server
reportUnmatchedIgnoredErrors: false
ignoreErrors:
- "#^Call to an undefined method Pest\\\\PendingCalls\\\\TestCall\\|Pest\\\\Support\\\\HigherOrderTapProxy\\:\\:(with|throws)\\(\\).$#"
- "#^Access to an undefined property Spatie\\\\Invade\\\\Invader#"
- "#^Call to an undefined method Spatie\\\\Invade\\\\Invader#"
- "#^Call to protected method [a-zA-Z]{5,30}\\(\\) of class PHPUnit\\\\Framework\\\\TestCase.#"
- "#^(?:Parameter|Method) .+ has invalid (return )?type Dom\\\\.+\\.#"
- "#^Call to .+ on an unknown class Dom\\\\.+\\.#"
- "#^Property .+ has unknown class Dom\\\\.+ as its type\\.#"
- "#^Class Dom\\\\.+ not found.#"
- "#^Access to property .+ on an unknown class Dom\\\\.+\\.#"
- "#^PHPDoc tag .+ contains unknown class Dom\\\\.+\\.#"
- "#^Call to an undefined (static )?method Dom\\\\.+::.+\\(\\)\\.#"
- "#^Access to an undefined property Dom\\\\.+::\\$.+\\.#"
- "#^Function .+ has invalid return type Dom\\\\.+\\.#"
- "#^(?:Used )?(?:C|c)onstant DOM\\\\.+ not found\\.#"
- "#^Instantiated class Dom\\\\.+ not found.#"
================================================
FILE: phpunit.xml
================================================
./tests./app./src
================================================
FILE: src/Cache/CacheItem.php
================================================
value) && method_exists($this->value, 'cacheKey')) {
$this->key = $this->value->cacheKey();
} else {
$this->key = md5(serialize($this->value));
}
} else {
$this->key = $key;
}
}
/**
 * Returns the key this item is stored under.
 */
public function key(): string
{
return $this->key;
}
/**
 * Returns the cached value wrapped by this item.
 */
public function value(): mixed
{
return $this->value;
}
/**
 * Tells whether the item's time to live, counted from its creation time, has elapsed.
 *
 * An integer ttl is interpreted as a number of seconds.
 *
 * @throws Exception
 */
public function isExpired(): bool
{
    if ($this->ttl instanceof DateInterval) {
        $lifetime = $this->ttl;
    } else {
        $lifetime = new DateInterval('PT' . $this->ttl . 'S');
    }

    $now = time();

    $expiresAt = $this->createdAt->add($lifetime)->getTimestamp();

    return $now > $expiresAt;
}
/**
 * Creates a copy of this item (same value, key and creation time) with another time to live.
 */
public function withTtl(DateInterval|int $ttl): CacheItem
{
    $copy = new self($this->value, $this->key, $ttl, $this->createdAt);

    return $copy;
}
/**
 * Exports the item's state (value, key, ttl and creation time) for serialization.
 *
 * @return mixed[]
 */
public function __serialize(): array
{
    $state = [];

    $state['value'] = $this->value;
    $state['key'] = $this->key;
    $state['ttl'] = $this->ttl;
    $state['createdAt'] = $this->createdAt;

    return $state;
}
/**
 * Restores the item's state from the array produced by __serialize().
 *
 * @param mixed[] $data
 */
public function __unserialize(array $data): void
{
    // Same properties, same order as the explicit assignments they replace.
    foreach (['value', 'key', 'ttl', 'createdAt'] as $property) {
        $this->{$property} = $data[$property];
    }
}
}
================================================
FILE: src/Cache/Exceptions/MissingZlibExtensionException.php
================================================
useCompression = true;
return $this;
}
/**
 * Sets the time to live that set() falls back to when no ttl is passed for a plain value.
 *
 * @return $this
 */
public function ttl(DateInterval|int $ttl): static
{
$this->ttl = $ttl;
return $this;
}
/**
 * Checks whether a non-expired item exists for the key.
 *
 * An expired item that is found on disk is deleted as a side effect.
 *
 * @throws MissingZlibExtensionException|ReadingCacheFailedException|Exception|InvalidArgumentException
 */
public function has(string $key): bool
{
    if (!file_exists($this->basePath . '/' . $key)) {
        return false;
    }

    if ($this->getCacheItem($key)->isExpired()) {
        $this->delete($key);

        return false;
    }

    return true;
}
/**
 * Returns the cached value for the key, or $default when there is no file or the item is expired.
 *
 * An expired item that is found on disk is deleted as a side effect.
 *
 * @throws ReadingCacheFailedException|MissingZlibExtensionException|Exception|InvalidArgumentException
 */
public function get(string $key, mixed $default = null): mixed
{
    if (!file_exists($this->basePath . '/' . $key)) {
        return $default;
    }

    $item = $this->getCacheItem($key);

    if ($item->isExpired()) {
        $this->delete($key);

        return $default;
    }

    return $item->value();
}
/**
 * Stores a value under the given key.
 *
 * Plain values are wrapped in a CacheItem using $ttl or the cache's default ttl.
 * A CacheItem whose key differs from $key is re-wrapped under $key (using $ttl or the item's own ttl);
 * a CacheItem that already has the matching key is saved as it is.
 *
 * @throws MissingZlibExtensionException
 */
public function set(string $key, mixed $value, DateInterval|int|null $ttl = null): bool
{
    if ($value instanceof CacheItem) {
        if ($value->key() !== $key) {
            $value = new CacheItem($value->value(), $key, $ttl ?? $value->ttl);
        }
    } else {
        $value = new CacheItem($value, $key, $ttl ?? $this->ttl);
    }

    return $this->saveCacheItem($value);
}
/**
 * Removes the cache file for the key and returns the result of unlink().
 */
public function delete(string $key): bool
{
    $path = $this->basePath . '/' . $key;

    return unlink($path);
}
/**
 * Re-saves the item stored under the key with a new time to live.
 *
 * Returns false when reading or saving the item fails for any reason.
 */
public function prolong(string $key, DateInterval|int $ttl): bool
{
    try {
        $prolonged = $this->getCacheItem($key)->withTtl($ttl);

        return $this->saveCacheItem($prolonged);
    } catch (Throwable) {
        return false;
    }
}
/**
 * Deletes every entry in the cache directory except ".", ".." and ".gitkeep".
 *
 * Stops and returns false at the first entry that can't be deleted. When the directory
 * listing can't be obtained, nothing is deleted and true is returned.
 *
 * @throws InvalidArgumentException
 */
public function clear(): bool
{
    $entries = scandir($this->basePath);

    if (!is_array($entries)) {
        return true;
    }

    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..' || $entry === '.gitkeep') {
            continue;
        }

        if (!$this->delete($entry)) {
            return false;
        }
    }

    return true;
}
/**
 * Applies prolong() with the given ttl to every entry in the cache directory
 * except ".", ".." and ".gitkeep".
 *
 * Stops and returns false at the first entry that can't be prolonged. When the directory
 * listing can't be obtained, true is returned.
 */
public function prolongAll(DateInterval|int $ttl): bool
{
    $entries = scandir($this->basePath);

    if (!is_array($entries)) {
        return true;
    }

    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..' || $entry === '.gitkeep') {
            continue;
        }

        if (!$this->prolong($entry, $ttl)) {
            return false;
        }
    }

    return true;
}
/**
 * Fetches several keys at once via get().
 *
 * @return iterable<string, mixed> array indexed by key; missing or expired entries hold $default
 * @throws MissingZlibExtensionException|ReadingCacheFailedException|InvalidArgumentException
 */
public function getMultiple(iterable $keys, mixed $default = null): iterable
{
    $valuesByKey = [];

    foreach ($keys as $cacheKey) {
        $valuesByKey[$cacheKey] = $this->get($cacheKey, $default);
    }

    return $valuesByKey;
}
/**
 * Stores several key => value pairs via set(), stopping at the first failure.
 *
 * @param iterable<string, mixed> $values
 * @throws MissingZlibExtensionException
 */
public function setMultiple(iterable $values, DateInterval|int|null $ttl = null): bool
{
    foreach ($values as $cacheKey => $cacheValue) {
        $stored = $this->set($cacheKey, $cacheValue, $ttl);

        if ($stored === false) {
            return false;
        }
    }

    return true;
}
/**
 * Deletes several keys via delete(), stopping at the first failure.
 */
public function deleteMultiple(iterable $keys): bool
{
    foreach ($keys as $cacheKey) {
        $deleted = $this->delete($cacheKey);

        if ($deleted === false) {
            return false;
        }
    }

    return true;
}
/**
 * Reads the cache file for the key and turns its content into a CacheItem.
 *
 * Content that doesn't unserialize to a CacheItem is wrapped in a new CacheItem for the key.
 *
 * @throws MissingZlibExtensionException
 * @throws ReadingCacheFailedException
 */
protected function getCacheItem(string $key): CacheItem
{
    $raw = $this->getFileContents($key);

    $payload = $this->useCompression ? $this->decode($raw) : $raw;

    $item = $this->unserialize($payload);

    return $item instanceof CacheItem ? $item : new CacheItem($item, $key);
}
/**
 * Serializes the item (compressing it when compression is enabled) and writes it
 * to the file named after the item's key.
 *
 * @throws MissingZlibExtensionException
 */
protected function saveCacheItem(CacheItem $item): bool
{
    $serialized = serialize($item);

    $payload = $this->useCompression ? $this->encode($serialized) : $serialized;

    $bytesWritten = file_put_contents($this->basePath . '/' . $item->key(), $payload);

    return $bytesWritten !== false;
}
/**
 * Unserializes cache file content, transparently falling back to decoding it first.
 *
 * When plain unserializing yields false, the content is assumed to be compressed: it is decoded
 * and unserialized again. Any failure of that second attempt is ignored, so false is returned.
 *
 * @param string $content raw (possibly compressed) serialized data
 * @return mixed the unserialized value, or false if it couldn't be unserialized
 */
protected function unserialize(string $content): mixed
{
    // Temporarily set a new error handler, so unserializing a compressed string does not result in a PHP warning.
    set_error_handler(function ($errno, $errstr) {
        return $errno === E_WARNING && str_starts_with($errstr, 'unserialize(): Error at offset 0 of ');
    });

    try {
        $unserialized = unserialize($content);

        if ($unserialized === false) { // if unserializing fails, try if the string is compressed.
            try {
                $content = $this->decode($content);

                $unserialized = unserialize($content);
            } catch (Throwable) {
                // Deliberately best-effort: content that can't be decoded either just yields false.
            }
        }

        return $unserialized;
    } finally {
        // Always restore the previous handler, even if the first unserialize() call throws
        // (e.g. from an object's __unserialize()), so the temporary handler never leaks.
        restore_error_handler();
    }
}
/**
 * Reads the raw content of the cache file for the key.
 *
 * @throws ReadingCacheFailedException when the file can't be read
 */
protected function getFileContents(string $key): string
{
    $contents = file_get_contents($this->basePath . '/' . $key);

    if (is_string($contents)) {
        return $contents;
    }

    throw new ReadingCacheFailedException('Failed to read cache file.');
}
/**
 * Compresses content via Gzip::encode().
 *
 * A MissingZlibExtensionException from Gzip is replaced by one with a cache specific message.
 *
 * @throws MissingZlibExtensionException
 */
protected function encode(string $content): string
{
    try {
        $compressed = Gzip::encode($content, true);
    } catch (MissingZlibExtensionException) {
        $message = 'Can\'t compress response cache data. Compression needs PHP ext-zlib installed.';

        throw new MissingZlibExtensionException($message);
    }

    return $compressed;
}
/**
 * Decompresses content via Gzip::decode().
 *
 * A MissingZlibExtensionException from Gzip is replaced by one with a cache specific message.
 *
 * @throws MissingZlibExtensionException
 */
protected function decode(string $content): string
{
    try {
        $decompressed = Gzip::decode($content, true);
    } catch (MissingZlibExtensionException) {
        $message = 'FileCache compression needs PHP ext-zlib installed.';

        throw new MissingZlibExtensionException($message);
    }

    return $decompressed;
}
}
================================================
FILE: src/Crawler.php
================================================
*/
protected array $steps = [];
// Store registered via setStore(); null when none is set.
protected ?StoreInterface $store = null;
// false by default; monitorMemoryUsage() sets it to true or to a byte threshold.
protected bool|int $monitorMemoryUsage = false;
// Optional closure registered via outputHook(); null when none is set.
protected ?Closure $outputHook = null;
/**
 * Initializes user agent, logger and loader from the corresponding factory methods.
 */
public function __construct()
{
$this->userAgent = $this->userAgent();
$this->logger = $this->logger();
// The loader is created last, because it receives the user agent and logger created above.
$this->loader = $this->loader($this->userAgent, $this->logger);
}
/**
 * A cloned crawler starts without inputs, steps, store and output hook.
 */
public function __clone(): void
{
    $this->store = null;
    $this->outputHook = null;

    $this->steps = [];
    $this->inputs = [];
}
/**
 * Provides the user agent of the crawler. Called once from the constructor.
 */
abstract protected function userAgent(): UserAgentInterface;
/**
 * Provides the loader of the crawler.
 *
 * Called from the constructor and again from setUserAgent(), each time with the
 * crawler's current user agent and logger.
 *
 * @param UserAgentInterface $userAgent
 * @param LoggerInterface $logger
 * @return LoaderInterface
 */
abstract protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface;
/**
 * Creates a new, empty Group instance.
 */
public static function group(): Group
{
return new Group();
}
/**
 * Sets PHP's `memory_limit` ini setting.
 *
 * @return false|string the result of ini_set(): the old value on success, false on failure
 */
public static function setMemoryLimit(string $memoryLimit): false|string
{
return ini_set('memory_limit', $memoryLimit);
}
/**
 * Reads PHP's `memory_limit` ini setting.
 *
 * @return false|string the result of ini_get()
 */
public static function getMemoryLimit(): false|string
{
return ini_get('memory_limit');
}
/**
 * Returns a clone of this crawler. __clone() resets inputs, steps, store and output hook on the copy.
 */
public function getSubCrawler(): Crawler
{
return clone $this;
}
/**
 * Returns the crawler's current user agent.
 */
public function getUserAgent(): UserAgentInterface
{
return $this->userAgent;
}
/**
 * Replaces the crawler's user agent and creates a new loader for it.
 *
 * @return $this
 */
public function setUserAgent(UserAgentInterface $userAgent): static
{
    $this->userAgent = $userAgent;

    // The loader depends on the user agent, so it has to be created anew.
    $newLoader = $this->loader($userAgent, $this->logger);

    $this->loader = $newLoader;

    return $this;
}
/**
 * Returns the crawler's logger.
 */
public function getLogger(): LoggerInterface
{
return $this->logger;
}
/**
 * Returns what is currently stored in the crawler's loader property.
 *
 * @return LoaderInterface|array
 */
public function getLoader(): LoaderInterface|array
{
return $this->loader;
}
/**
 * Registers the store and hands it the crawler's logger.
 *
 * @return $this
 */
public function setStore(StoreInterface $store): static
{
// Pass the logger on, so the store can log messages as well.
$store->addLogger($this->logger);
$this->store = $store;
return $this;
}
/**
 * Appends a single value to the crawler's initial inputs.
 *
 * @return $this
 */
public function input(mixed $input): static
{
    array_push($this->inputs, $input);

    return $this;
}
/**
 * Merges multiple values into the crawler's initial inputs (using array_merge()).
 *
 * @param mixed[] $inputs
 * @return $this
 */
public function inputs(array $inputs): static
{
    $merged = array_merge($this->inputs, $inputs);

    $this->inputs = $merged;

    return $this;
}
/**
 * Appends a step to the crawler, after providing it with the crawler's dependencies.
 *
 * Every step gets the logger. Steps with a `setLoader()` method also get the loader and
 * `BaseStep` instances additionally get a reference to this crawler.
 *
 * @param StepInterface $step
 * @return $this
 * @throws InvalidArgumentException
 */
public function addStep(StepInterface $step): static
{
$step->addLogger($this->logger);
// Not every step needs a loader, so it's only passed to steps that can receive one.
if (method_exists($step, 'setLoader')) {
$step->setLoader($this->loader);
}
if ($step instanceof BaseStep) {
$step->setParentCrawler($this);
}
$this->steps[] = $step;
return $this;
}
/**
 * Run the crawler and consume all its results
 *
 * Use this method when you don't need the results at the place where you're calling the crawler,
 * e.g. because you've set a store, or you're using the crawler for cache warming.
 *
 * @throws Exception
 */
public function runAndTraverse(): void
{
    $results = $this->run();

    // The results are discarded, the Generator only has to be advanced until it's exhausted.
    while ($results->valid()) {
        $results->next();
    }
}
/**
 * Run the crawler and var_dump() every result as an array
 *
 * @throws Exception
 */
public function runAndDump(): void
{
    foreach ($this->run() as $crawlingResult) {
        $resultData = $crawlingResult->toArray();

        var_dump($resultData);
    }
}
/**
 * Run the Crawler
 *
 * Validates the steps, prepares the inputs and then, for every input, invokes the steps starting with
 * the first one, cascading the data from step to step. After all inputs are processed, the crawler is reset.
 *
 * As this method returns a Generator, nothing happens until that Generator is traversed.
 * Alternatively, you can use runAndTraverse().
 *
 * @return Generator
 * @throws Exception|PreRunValidationException
 */
public function run(): Generator
{
    $this->validateSteps();

    $inputs = $this->prepareInput();

    if ($this->firstStep()) {
        foreach ($inputs as $input) {
            yield from $this->invokeStepsRecursive($input, $this->firstStep(), 0);
        }
    }

    $this->reset();
}
/**
 * Tell the crawler to log the current memory usage after every step invocation.
 *
 * @param int|null $ifAboveXBytes Optional limit in bytes. When provided, it is stored as the threshold
 *                                above which the usage should be logged; otherwise monitoring is just
 *                                switched on.
 * @return $this
 */
public function monitorMemoryUsage(?int $ifAboveXBytes = null): static
{
    $this->monitorMemoryUsage = $ifAboveXBytes === null ? true : $ifAboveXBytes;

    return $this;
}
/**
 * Registers a closure as the crawler's output hook.
 *
 * @return $this
 */
public function outputHook(Closure $callback): static
{
$this->outputHook = $callback;
return $this;
}
/**
 * Creates the logger instance for this crawler: a fresh CliLogger.
 *
 * Protected, so presumably meant to be overridden by child classes that want a different logger.
 */
protected function logger(): LoggerInterface
{
    $cliLogger = new CliLogger();

    return $cliLogger;
}
/**
* @return Generator
HTML;
$output = helper_invokeStepWithInput(
helper_getDomStepInstance()::root()->extract([
'one' => Dom::cssSelector('#one')->link(),
'two' => Dom::xPath('//a[@id=\'two\']')->link(),
'three' => Dom::cssSelector('#three')->link()->withoutFragment(),
'four' => Dom::xPath('//a[@id=\'four\']')->link()->withoutFragment(),
]),
new RespondedRequest(
new Request('GET', 'https://www.example.com/home'),
new Response(body: $body),
),
);
expect($output)->toHaveCount(1)
->and($output[0]->get())->toBe([
'one' => 'https://www.example.com/foo#foo',
'two' => 'https://www.example.com/bar#bar',
'three' => 'https://www.example.com/baz',
'four' => 'https://www.example.com/quz',
]);
});
================================================
FILE: tests/Steps/Filters/ArrayFilterTest.php
================================================
where(Filter::equal('foo'));
expect($filter->evaluate($values))->toBe($evaluationResult);
})->with([
[['foo', 'bar', 'baz'], true],
[['bar', 'baz', 'quz'], false],
]);
it('filters a multi-level array by a key of the array elements (which are also arrays)', function () {
    $rows = [
        ['foo' => 'one', 'bar' => 'two'],
        ['foo' => 'two', 'bar' => 'three'],
        ['foo' => 'three', 'bar' => 'four'],
    ];

    // 'four' only occurs under the 'bar' key, never under 'foo'.
    $fooIsFour = Filter::arrayHasElement()->where('foo', Filter::equal('four'));

    expect($fooIsFour->evaluate($rows))->toBeFalse();

    // The second row has 'foo' => 'two'.
    $fooIsTwo = Filter::arrayHasElement()->where('foo', Filter::equal('two'));

    expect($fooIsTwo->evaluate($rows))->toBeTrue();
});
it('applies multiple complex filters on a multi-level array', function () {
    $items = [
        [
            'id' => '123',
            'name' => 'abc',
            'tags' => [
                ['type' => 'companyId', 'value' => '123'],
                ['type' => 'type', 'value' => 'job-ad'],
                ['type' => 'companyId', 'value' => '125'],
            ],
        ],
        [
            'id' => '124',
            'name' => 'abd',
            'tags' => [
                ['type' => 'companyId', 'value' => '123'],
                ['type' => 'type', 'value' => 'blog-post'],
                ['type' => 'author', 'value' => 'John Doe'],
            ],
        ],
        [
            'id' => '125',
            'name' => 'abf',
            'tags' => [
                ['type' => 'companyId', 'value' => '123'],
                ['type' => 'companyId', 'value' => '124'],
                ['type' => 'type', 'value' => 'job-ad'],
                ['type' => 'companyId', 'value' => '125'],
            ],
        ],
    ];

    // Builds the nested filter "the tags array has an element with this type and this value".
    $hasTag = function (string $type, string $value) {
        return Filter::arrayHasElement()
            ->where('type', Filter::equal($type))
            ->where('value', Filter::equal($value));
    };

    // Has companyId 123, does NOT have companyId 124, is a job-ad.
    // Presumably item '123' is the one satisfying all three conditions.
    $withoutCompany124 = Filter::arrayHasElement()
        ->where('tags', $hasTag('companyId', '123'))
        ->where('tags', $hasTag('companyId', '124')->negate())
        ->where('tags', $hasTag('type', 'job-ad'));

    expect($withoutCompany124->evaluate($items))->toBeTrue();

    // Same, but excluding companyId 125: both job-ad items carry that tag, so nothing is expected to match.
    $withoutCompany125 = Filter::arrayHasElement()
        ->where('tags', $hasTag('companyId', '123'))
        ->where('tags', $hasTag('companyId', '125')->negate())
        ->where('tags', $hasTag('type', 'job-ad'));

    expect($withoutCompany125->evaluate($items))->toBeFalse();
});
================================================
FILE: tests/Steps/Filters/ClosureFilterTest.php
================================================
evaluate('one'))->toBeTrue();
expect($closure->evaluate('four'))->toBeFalse();
});
it('evaluates with a value from an array by key', function () {
    // The closure accepts only 'one', 'two' or 'three' (strict comparison).
    $filter = new ClosureFilter(fn (mixed $value) => in_array($value, ['one', 'two', 'three'], true));

    // Evaluate the 'bar' element instead of the whole array.
    $filter->useKey('bar');

    expect($filter->evaluate(['foo' => 'one', 'bar' => 'two']))->toBeTrue()
        ->and($filter->evaluate(['foo' => 'three', 'bar' => 'four']))->toBeFalse();
});
it('compares a value from an object by key', function () {
    // The closure accepts only 'one', 'two' or 'three' (strict comparison).
    $filter = new ClosureFilter(fn (mixed $value) => in_array($value, ['one', 'two', 'three'], true));

    // Evaluate the 'bar' property instead of the whole object.
    $filter->useKey('bar');

    expect($filter->evaluate(helper_getStdClassWithData(['foo' => 'one', 'bar' => 'two'])))->toBeTrue()
        ->and($filter->evaluate(helper_getStdClassWithData(['foo' => 'three', 'bar' => 'four'])))->toBeFalse();
});
================================================
FILE: tests/Steps/Filters/ComparisonFilterTest.php
================================================
evaluate(4))->toBeTrue()
->and($comparison->evaluate(2))->toBeFalse();
});
it('compares a value from an array by key', function () {
    $filter = new ComparisonFilter(ComparisonFilterRule::NotEqual, 'barValue');
    $filter->useKey('bar');

    $sameBar = ['foo' => 'fooValue', 'bar' => 'barValue'];
    $differentBar = ['foo' => 'fooValue', 'bar' => 'barzValue'];

    // NotEqual rule: equal 'bar' values fail, different ones pass.
    expect($filter->evaluate($sameBar))->toBeFalse();
    expect($filter->evaluate($differentBar))->toBeTrue();
});
it('compares a value from an object by key', function () {
    $filter = new ComparisonFilter(ComparisonFilterRule::NotEqual, 'barValue');
    $filter->useKey('bar');

    $sameBar = helper_getStdClassWithData(['foo' => 'fooValue', 'bar' => 'barValue']);
    $differentBar = helper_getStdClassWithData(['foo' => 'fooValue', 'bar' => 'barzValue']);

    // NotEqual rule: equal 'bar' values fail, different ones pass.
    expect($filter->evaluate($sameBar))->toBeFalse();
    expect($filter->evaluate($differentBar))->toBeTrue();
});
================================================
FILE: tests/Steps/Filters/Enums/ComparisonFilterRuleTest.php
================================================
evaluate($value1, $value2))->toBe($expectedResult);
})->with([
[true, 1, 1],
[true, 'one', 'one'],
[true, 1.12, 1.12],
[false, 1, 2],
[false, 1, '1'],
[false, 'one', 'two'],
[false, 1.12, 1.122],
]);
// Dataset rows: [expected result, first value, second value].
it('correctly applies not equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actual = ComparisonFilterRule::NotEqual->evaluate($value1, $value2);

    expect($actual)->toBe($expectedResult);
})->with([
    [false, 1, 1],
    [false, 'one', 'one'],
    [false, 1.12, 1.12],
    [true, 1, 2],
    [true, 1, '1'],
    [true, 'one', 'two'],
    [true, 1.12, 1.122],
]);
// Dataset rows: [expected result, first value, second value]; covers ints, floats and numeric strings.
it('correctly applies greater than operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actual = ComparisonFilterRule::GreaterThan->evaluate($value1, $value2);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 1, 0],
    [true, 12, 3],
    [true, 1.12, 1.11],
    [false, 11, 11],
    [false, 0, 1],
    [false, 3.59, 3.591],
    [true, '123', '122'],
    [true, '123', 122],
    [true, 123, '122'],
    [false, '123', '124'],
    [false, '123', 124],
    [false, 123, '124'],
    [true, '123.45', '123.44'],
    [true, '123.45', 123.44],
    [true, 123.45, '123.44'],
    [false, '123.45', '123.46'],
    [false, '123.45', 123.46],
    [false, 123.45, '123.46'],
]);
// Dataset rows: [expected result, first value, second value]; covers ints, floats and numeric strings.
it('correctly applies greater than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actual = ComparisonFilterRule::GreaterThanOrEqual->evaluate($value1, $value2);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 1, 0],
    [true, 12, 3],
    [true, 1.12, 1.11],
    [true, 11, 11],
    [false, 0, 1],
    [false, 3.59, 3.591],
    [true, '123', '122'],
    [true, '123', 122],
    [true, 123, '123'],
    [false, '123', '124'],
    [false, '123', 124],
    [false, 123, '124'],
    [true, '123.45', '123.44'],
    [true, '123.44', 123.44],
    [true, 123.45, '123.44'],
    [false, '123.45', '123.46'],
    [false, '123.45', 123.46],
    [false, 123.45, '123.46'],
]);
// Dataset rows: [expected result, first value, second value]; covers ints, floats and numeric strings.
it('correctly applies less than operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actual = ComparisonFilterRule::LessThan->evaluate($value1, $value2);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 0, 1],
    [true, 4, 5],
    [true, 5.79, 5.7901],
    [false, 11, 11],
    [false, 1, 0],
    [false, 9.2901, 9.29],
    [true, '123', '124'],
    [true, '123', 124],
    [true, 123, '124'],
    [false, '123', '122'],
    [false, '123', 122],
    [false, 123, '122'],
    [true, '123.45', '123.46'],
    [true, '123.45', 123.46],
    [true, 123.45, '123.46'],
    [false, '123.45', '123.44'],
    [false, '123.45', 123.44],
    [false, 123.45, '123.44'],
]);
// Dataset rows: [expected result, first value, second value]; covers ints, floats and numeric strings.
it('correctly applies less than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actual = ComparisonFilterRule::LessThanOrEqual->evaluate($value1, $value2);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 0, 1],
    [true, 4, 5],
    [true, 5.79, 5.7901],
    [true, 11, 11],
    [false, 1, 0],
    [false, 9.2901, 9.29],
    [true, '123', '124'],
    [true, '123', 124],
    [true, 123, '123'],
    [false, '123', '122'],
    [false, '123', 122],
    [false, 123, '122'],
    [true, '123.45', '123.46'],
    [true, '123.45', 123.45],
    [true, 123.45, '123.46'],
    [false, '123.45', '123.44'],
    [false, '123.45', 123.44],
    [false, 123.45, '123.44'],
]);
================================================
FILE: tests/Steps/Filters/Enums/StringFilterRuleTest.php
================================================
evaluate($haystack, $needle))->toBe($expectedResult);
})->with([
[true, 'foobarbaz', 'foo'],
[true, 'foo bar baz', 'foo'],
[true, 'foo bar baz', 'bar'],
[true, 'foo bar baz', 'baz'],
[false, 'foo bar baz', 'Foo'],
]);
// Dataset rows: [expected result, haystack, needle]; the last row shows the check is case-sensitive.
it('checks if a string starts with another string', function (
    bool $expectedResult,
    mixed $haystack,
    mixed $needle,
) {
    $actual = StringFilterRule::StartsWith->evaluate($haystack, $needle);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'foobarbaz', 'foo'],
    [true, 'foo bar baz', 'foo'],
    [true, 'foo bar baz', 'foo bar'],
    [false, 'foo bar baz', 'bar'],
    [false, 'foo bar baz', 'baz'],
    [false, 'foo bar baz', 'Foo'],
]);
// Dataset rows: [expected result, haystack, needle]; the last row shows the check is case-sensitive.
it('checks if a string ends with another string', function (
    bool $expectedResult,
    mixed $haystack,
    mixed $needle,
) {
    $actual = StringFilterRule::EndsWith->evaluate($haystack, $needle);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'foobarbaz', 'baz'],
    [true, 'foo bar baz', 'baz'],
    [true, 'foo bar baz', 'bar baz'],
    [false, 'foo bar baz', 'bar'],
    [false, 'foo bar baz', 'foo'],
    [false, 'foo bar baz', 'Baz'],
]);
================================================
FILE: tests/Steps/Filters/Enums/StringLengthFilterRuleTest.php
================================================
evaluate($value1, $value2))->toBe($expectedResult);
})->with([
[true, 'foo', 3],
[true, 'lorem', 5],
[true, 'foo bar', 7],
[false, 'bar', 4],
[false, 'baz quz', 6],
]);
// Dataset rows: [expected result, string, length to compare the string's length with].
it('correctly applies not equal rule', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actual = StringLengthFilterRule::NotEqual->evaluate($value1, $value2);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'foo', 2],
    [true, 'foo bar', 8],
    [false, 'foo', 3],
    [false, 'lorem ipsum', 11],
]);
// Dataset rows: [expected result, string, length to compare the string's length with].
it('correctly applies greater than rule', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actual = StringLengthFilterRule::GreaterThan->evaluate($value1, $value2);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'foo', 2],
    [true, 'foo bar', 6],
    [false, 'foo', 3],
    [false, 'foo bar', 7],
]);
// Dataset rows: [expected result, string, length to compare the string's length with].
it('correctly applies greater than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actual = StringLengthFilterRule::GreaterThanOrEqual->evaluate($value1, $value2);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'foo', 2],
    [true, 'foo', 3],
    [true, 'foo bar', 6],
    [true, 'foo bar', 7],
    [false, 'foo', 4],
    [false, 'foo bar', 8],
]);
// Dataset rows: [expected result, string, length to compare the string's length with].
it('correctly applies less than operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actual = StringLengthFilterRule::LessThan->evaluate($value1, $value2);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'foo', 4],
    [true, 'foo bar', 8],
    [false, 'foo', 3],
    [false, 'foo bar', 7],
]);
// Dataset rows: [expected result, string, length to compare the string's length with].
it('correctly applies less than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actual = StringLengthFilterRule::LessThanOrEqual->evaluate($value1, $value2);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'foo', 4],
    [true, 'foo', 3],
    [true, 'foo bar', 8],
    [true, 'foo bar', 7],
    [false, 'foo', 2],
    [false, 'foo bar', 6],
]);
================================================
FILE: tests/Steps/Filters/Enums/UrlFilterRuleTest.php
================================================
evaluate($haystack, $needle))->toBe($expectedResult);
})->with([
[true, 'https://www.example.com', 'https'],
[true, 'http://www.example.com', 'http'],
[true, 'ftp://user:password@example.com:21/path', 'ftp'],
[false, 'https://www.example.com', 'http'],
]);
// Dataset rows: [expected result, URL, host to compare with].
it('checks if a URL has a certain host', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $actual = UrlFilterRule::Host->evaluate($haystack, $needle);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com', 'www.example.com'],
    [true, 'https://jobs.example.com', 'jobs.example.com'],
    [true, 'https://pew.pew.pew.example.com:8080/pew', 'pew.pew.pew.example.com'],
    [false, 'https://jobs.example.com', 'www.example.com'],
]);
// Dataset rows: [expected result, URL, domain to compare with]; a full host name is not a matching domain.
it('checks if a URL has a certain domain', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $actual = UrlFilterRule::Domain->evaluate($haystack, $needle);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com', 'example.com'],
    [true, 'https://jobs.example.com', 'example.com'],
    [true, 'https://pew.pew.pew.example.com:8080/pew', 'example.com'],
    [false, 'https://www.example.com', 'yolo.com'],
    [false, 'https://www.example.com', 'www.example.com'],
]);
// Dataset rows: [expected result, URL, path to compare with]; a longer path with the same prefix does not match.
it('checks if a URL has a certain path', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $actual = UrlFilterRule::Path->evaluate($haystack, $needle);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com/foo/bar', '/foo/bar'],
    [false, 'https://www.example.com/foo/bar/baz', '/foo/bar'],
]);
// Dataset rows: [expected result, URL, path prefix].
it('checks if a URL path starts with a certain path', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $actual = UrlFilterRule::PathStartsWith->evaluate($haystack, $needle);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com/foo/bar', '/foo/bar'],
    [true, 'https://www.example.com/foo/bar', '/foo'],
    [false, 'https://www.example.com/foo/bar', '/bar'],
]);
// Dataset rows: [expected result, URL, regex pattern (given without delimiters)].
it('checks if a URL path matches a regex pattern', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $actual = UrlFilterRule::PathMatches->evaluate($haystack, $needle);

    expect($actual)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com/foo/bar', '^/foo/'],
    [true, 'https://www.example.com/56/something/foo', '^/\d{1,5}/[a-z]{1,20}'],
    [false, 'https://www.example.com/56/some-thing/foo', '^/\d{1,5}/[a-z]{1,20}/'],
]);
================================================
FILE: tests/Steps/Filters/FilterTest.php
================================================
value = $this->getKey($valueInQuestion);
return true;
}
}
it('gets a key from an array', function () {
    $testFilter = new TestFilter();
    $testFilter->useKey('foo');

    $input = ['foo' => 'fooValue', 'bar' => 'barValue'];
    $testFilter->evaluate($input);

    // The value the filter read under the configured key is exposed via its public $value property.
    expect($testFilter->value)->toBe('fooValue');
});
it('gets a key from an object', function () {
    $testFilter = new TestFilter();
    $testFilter->useKey('foo');

    $input = helper_getStdClassWithData(['foo' => 'fooValue', 'bar' => 'barValue']);
    $testFilter->evaluate($input);

    // The value the filter read under the configured key is exposed via its public $value property.
    expect($testFilter->value)->toBe('fooValue');
});
it('throws an exception when the value in question is not array or object when a key to use was defined', function () {
    $testFilter = new TestFilter();
    $testFilter->useKey('foo');

    // A plain string can't provide a 'foo' key.
    $scalarInput = 'foo';
    $testFilter->evaluate($scalarInput);
})->throws(InvalidArgumentException::class);
it('throws an exception when the key to use is not contained in an array', function () {
    $testFilter = new TestFilter();
    $testFilter->useKey('foo');

    // The array has no 'foo' key.
    $arrayWithoutKey = ['bar' => 'barValue', 'baz' => 'bazValue'];
    $testFilter->evaluate($arrayWithoutKey);
})->throws(Exception::class);
it('throws an exception when the key to use is not contained in an object', function () {
    $testFilter = new TestFilter();
    $testFilter->useKey('foo');

    // The object has no 'foo' property.
    $objectWithoutKey = helper_getStdClassWithData(['bar' => 'barValue', 'baz' => 'bazValue']);
    $testFilter->evaluate($objectWithoutKey);
})->throws(Exception::class);
================================================
FILE: tests/Steps/Filters/NegatedFilterTest.php
================================================
evaluate('foo'))->toBeTrue();
expect($negatedFilter->evaluate('foo'))->toBeFalse();
expect($filter->evaluate('bar'))->toBeFalse();
expect($negatedFilter->evaluate('bar'))->toBeTrue();
});
================================================
FILE: tests/Steps/Filters/StringFilterTest.php
================================================
evaluate('foo bar baz'))->toBeTrue();
expect($stringCheck->evaluate('lorem ipsum'))->toBeFalse();
});
it('checks a string from an array using a key', function () {
    $filter = new StringFilter(StringFilterRule::StartsWith, 'waldo');
    $filter->useKey('bar');

    $startsWithWaldo = ['foo' => 'something', 'bar' => 'waldo check', 'baz' => 'test'];
    $endsWithWaldo = ['foo' => 'something', 'bar' => 'check waldo', 'baz' => 'test'];

    expect($filter->evaluate($startsWithWaldo))->toBeTrue()
        ->and($filter->evaluate($endsWithWaldo))->toBeFalse();
});
it('checks a string from an object using a key', function () {
    $filter = new StringFilter(StringFilterRule::EndsWith, 'waldo');
    $filter->useKey('bar');

    // 'bar' ends with 'waldo'.
    $matching = helper_getStdClassWithData(['foo' => 'something', 'bar' => 'check waldo', 'baz' => 'test']);

    expect($filter->evaluate($matching))->toBeTrue();

    // 'bar' only starts with 'waldo'.
    $notMatching = helper_getStdClassWithData(['foo' => 'something', 'bar' => 'waldo check', 'baz' => 'test']);

    expect($filter->evaluate($notMatching))->toBeFalse();
});
================================================
FILE: tests/Steps/Filters/StringLengthFilterTest.php
================================================
evaluate('foo'))->toBeFalse();
expect($stringCheck->evaluate('lorem ipsum'))->toBeTrue();
});
it('checks a string from an array using a key', function () {
    $filter = new StringLengthFilter(StringLengthFilterRule::GreaterThan, 10);
    $filter->useKey('bar');

    // 'two' has 3 characters, 'lorem ipsum' has 11.
    $shortBar = ['foo' => 'one', 'bar' => 'two', 'baz' => 'three'];
    $longBar = ['foo' => 'one', 'bar' => 'lorem ipsum', 'baz' => 'three'];

    expect($filter->evaluate($shortBar))->toBeFalse()
        ->and($filter->evaluate($longBar))->toBeTrue();
});
it('checks a string from an object using a key', function () {
    $filter = new StringLengthFilter(StringLengthFilterRule::GreaterThan, 10);
    $filter->useKey('bar');

    // 'two' has only 3 characters.
    $shortBar = helper_getStdClassWithData(['foo' => 'one', 'bar' => 'two', 'baz' => 'three']);

    expect($filter->evaluate($shortBar))->toBeFalse();

    // 'lorem ipsum' has 11 characters.
    $longBar = helper_getStdClassWithData(['foo' => 'one', 'bar' => 'lorem ipsum', 'baz' => 'three']);

    expect($filter->evaluate($longBar))->toBeTrue();
});
================================================
FILE: tests/Steps/Filters/UrlFilterTest.php
================================================
evaluate('https://www.crwlr.software/packages'))->toBeTrue();
expect($urlFilter->evaluate('https://www.example.com/something'))->toBeFalse();
});
it('evaluates an url from an array using a key', function () {
    $schemeFilter = (new UrlFilter(UrlFilterRule::Scheme, 'https'))->useKey('bar');

    $httpsUrl = ['foo' => 'yo', 'bar' => 'https://www.example.com'];
    $httpUrl = ['foo' => 'yo', 'bar' => 'http://www.example.com'];

    expect($schemeFilter->evaluate($httpsUrl))->toBeTrue()
        ->and($schemeFilter->evaluate($httpUrl))->toBeFalse();
});
it('evaluates a string from an object using a key', function () {
    $pathFilter = (new UrlFilter(UrlFilterRule::PathStartsWith, '/foo'))->useKey('bar');

    $underFoo = helper_getStdClassWithData(['foo' => 'yo', 'bar' => 'https://www.example.com/foo/bar/baz']);

    expect($pathFilter->evaluate($underFoo))->toBeTrue();

    $underArticles = helper_getStdClassWithData(['foo' => 'yo', 'bar' => 'https://www.example.com/articles/1']);

    expect($pathFilter->evaluate($underArticles))->toBeFalse();
});
it('doesnt throw an exception when value is not a valid url', function () {
    $hostFilter = new UrlFilter(UrlFilterRule::Host, 'invalid');

    // 'https*' is not a valid scheme; the filter is expected to answer false instead of throwing.
    $result = $hostFilter->evaluate('https*://invalid');

    expect($result)->toBeFalse();
});
================================================
FILE: tests/Steps/GroupTest.php
================================================
addStep($step);
}
return $group;
}
/**
 * Registers the same updateInputUsingOutput() callback on each of the given steps.
 */
function helper_addUpdateInputUsingOutputCallbackToSteps(Closure $callback, Step ...$steps): void
{
    foreach ($steps as $currentStep) {
        $currentStep->updateInputUsingOutput($callback);
    }
}
/**
 * Builds an anonymous step whose public $called flag flips to true once it is invoked; it yields 'test'.
 */
function helper_getStepThatRemembersIfItWasCalled(): Step
{
    $rememberingStep = new class extends Step {
        public bool $called = false;

        protected function invoke(mixed $input): Generator
        {
            $this->called = true;

            yield 'test';
        }
    };

    return $rememberingStep;
}
test('You can add a step and it passes on the logger', function () {
    $group = new Group();
    $group->addLogger(new CliLogger());

    // The plain StepInterface mock must get the logger, but never a loader.
    $mockedStep = Mockery::mock(StepInterface::class);
    $mockedStep->shouldReceive('addLogger')->once();
    $mockedStep->shouldNotReceive('setLoader');

    $group->addStep($mockedStep);
});
it('also passes on a new logger to all steps when the logger is added after the steps', function () {
    $firstStep = Mockery::mock(StepInterface::class);
    $firstStep->shouldReceive('addLogger')->once();

    $secondStep = Mockery::mock(StepInterface::class);
    $secondStep->shouldReceive('addLogger')->once();

    $group = new Group();
    $group->addStep($firstStep);
    $group->addStep($secondStep);

    // The logger arrives only now, after both steps were already added.
    $group->addLogger(new CliLogger());
});
it('also passes on the loader to the step when setLoader method exists in step', function () {
    $group = new Group();
    $group->addLogger(new CliLogger());
    $group->setLoader(new HttpLoader(new BotUserAgent('MyBot')));

    // Mock built from a loading step instance: both the logger and the loader must be handed over.
    $step = Mockery::mock(helper_getLoadingStep());
    $step->shouldReceive('addLogger')->once();
    $step->shouldReceive('setLoader')->once();

    /** @var Step $step */
    $group->addStep($step);
});
it('also passes on a new loader to all steps when it is added after the steps', function () {
    $firstStep = Mockery::mock(helper_getLoadingStep());
    $firstStep->shouldReceive('setLoader')->once();

    $secondStep = Mockery::mock(helper_getLoadingStep());
    $secondStep->shouldReceive('setLoader')->once();

    $group = new Group();

    /** @var Step $firstStep */
    $group->addStep($firstStep);

    /** @var Step $secondStep */
    $group->addStep($secondStep);

    // The loader arrives only now, after both steps were already added.
    $group->setLoader(new HttpLoader(new BotUserAgent('MyBot')));
});
test('The factory method returns a Group object instance', function () {
    $createdByFactory = Crawler::group();

    expect($createdByFactory)->toBeInstanceOf(Group::class);
});
test('You can add multiple steps and invokeStep calls all of them', function () {
    $steps = [
        helper_getStepThatRemembersIfItWasCalled(),
        helper_getStepThatRemembersIfItWasCalled(),
        helper_getStepThatRemembersIfItWasCalled(),
    ];

    $group = new Group();

    foreach ($steps as $step) {
        $group->addStep($step);
    }

    helper_invokeStepWithInput($group);

    // Each step flips its $called flag when invoked.
    foreach ($steps as $step) {
        expect($step->called)->toBeTrue(); // @phpstan-ignore-line
    }
});
it('combines the outputs of all it\'s steps into one output containing an array', function () {
    $group = new Group();

    $group
        ->addStep(helper_getValueReturningStep('lorem'))
        ->addStep(helper_getValueReturningStep('ipsum'))
        ->addStep(helper_getValueReturningStep('dolor'));

    $outputs = helper_invokeStepWithInput($group, 'gogogo');

    // One combined output whose value lists the step values in the order the steps were added.
    expect($outputs)->toHaveCount(1)
        ->and($outputs[0])->toBeInstanceOf(Output::class)
        ->and($outputs[0]->get())->toBe(['lorem', 'ipsum', 'dolor']);
});
test(
    'When defining keys for the steps via $step->outputKey(), the combined output array has those keys',
    function () {
        $group = (new Group())
            ->addStep(helper_getValueReturningStep('ich')->outputKey('foo'))
            ->addStep(helper_getValueReturningStep('bin')->outputKey('bar'))
            ->addStep(helper_getValueReturningStep('ein berliner')->outputKey('baz'));

        $outputs = helper_invokeStepWithInput($group, 'https://www.gogo.go');

        // The combined array is keyed by the output keys defined on the steps.
        expect($outputs)->toHaveCount(1)
            ->and($outputs[0])->toBeInstanceOf(Output::class)
            ->and($outputs[0]->get())->toBe(['foo' => 'ich', 'bar' => 'bin', 'baz' => 'ein berliner']);
    },
);
it('merges array outputs with string keys to one array', function () {
    $group = new Group();

    $group
        ->addStep(helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']))
        ->addStep(helper_getValueReturningStep(['baz' => 'bazValue', 'yo' => 'lo']));

    $outputs = helper_invokeStepWithInput($group);

    $expectedMerged = [
        'foo' => 'fooValue',
        'bar' => 'barValue',
        'baz' => 'bazValue',
        'yo' => 'lo',
    ];

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe($expectedMerged);
});
it('doesn\'t invoke twice with duplicate inputs when uniqueInput was called', function () {
    $group = helper_addStepsToGroup(
        new Group(),
        helper_getValueReturningStep('one'),
        helper_getValueReturningStep('two'),
    );

    // Without uniqueInputs() the same input produces an output every time.
    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1);

    $group->resetAfterRun();
    $group->uniqueInputs();

    // With uniqueInputs() the repeated input no longer yields output.
    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(0);
});
it(
    'doesn\'t invoke twice with array inputs with duplicate keys when uniqueInput was called with that key',
    function () {
        $group = helper_addStepsToGroup(
            new Group(),
            helper_getValueReturningStep('one'),
            helper_getValueReturningStep('two'),
        );

        $martyInput = ['foo' => 'bar', 'bttfc' => 'marty'];
        $docInput = ['foo' => 'bar', 'bttfc' => 'doc'];

        // Without a key the two arrays differ as a whole, so both produce output.
        $group->uniqueInputs();

        expect(helper_invokeStepWithInput($group, $martyInput))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, $docInput))->toHaveCount(1);

        $group->resetAfterRun();

        // Keyed by 'foo' both arrays carry the same value, so the second one is skipped.
        $group->uniqueInputs('foo');

        expect(helper_invokeStepWithInput($group, $martyInput))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, $docInput))->toHaveCount(0);
    },
);
it(
    'doesn\'t invoke twice with object inputs with duplicate keys when uniqueInput was called with that key',
    function () {
        $group = helper_addStepsToGroup(
            new Group(),
            helper_getValueReturningStep('one'),
            helper_getValueReturningStep('two'),
        );

        $martyData = ['foo' => 'bar', 'bttfc' => 'marty'];
        $docData = ['foo' => 'bar', 'bttfc' => 'doc'];

        // Without a key the two objects differ as a whole, so both produce output.
        $group->uniqueInputs();

        expect(helper_invokeStepWithInput($group, helper_getStdClassWithData($martyData)))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, helper_getStdClassWithData($docData)))->toHaveCount(1);

        $group->resetAfterRun();

        // Keyed by 'foo' both objects carry the same value, so the second one is skipped.
        $group->uniqueInputs('foo');

        expect(helper_invokeStepWithInput($group, helper_getStdClassWithData($martyData)))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, helper_getStdClassWithData($docData)))->toHaveCount(0);
    },
);
it('returns only unique outputs when uniqueOutput was called', function () {
    // The first step echoes its input, so the combined output varies with the input.
    $group = helper_addStepsToGroup(
        new Group(),
        helper_getInputReturningStep(),
        helper_getValueReturningStep('test'),
    )->uniqueOutputs();

    // 'foo' and 'bar' are new; the second 'foo' repeats an already seen output.
    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, 'bar'))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(0);
});
it('returns only unique outputs when outputs are arrays and uniqueOutput was called', function () {
    // The first step echoes its (array) input, the second adds a constant array.
    $group = helper_addStepsToGroup(
        new Group(),
        helper_getInputReturningStep(),
        helper_getValueReturningStep(['lorem' => 'ipsum']),
    )->uniqueOutputs();

    // The third call repeats the first input and therefore an already seen output.
    expect(helper_invokeStepWithInput($group, ['foo' => 'bar']))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, ['baz' => 'quz']))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, ['foo' => 'bar']))->toHaveCount(0);
});
it(
    'returns only unique outputs when outputs are arrays and uniqueOutput was called with a key from the output arrays',
    function () {
        // Uniqueness is decided by the 'foo' element of the combined output only.
        $group = helper_addStepsToGroup(
            new Group(),
            helper_getInputReturningStep(),
            helper_getValueReturningStep(['lorem' => 'ipsum']),
        )->uniqueOutputs('foo');

        // The third input has extra data, but its 'foo' value 'bar' was already seen.
        expect(helper_invokeStepWithInput($group, ['foo' => 'bar']))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, ['foo' => 'baz']))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, ['foo' => 'bar', 'something' => 'else']))->toHaveCount(0);
    },
);
it('returns only unique outputs when outputs are objects and uniqueOutput was called', function () {
    $firstStep = helper_getStepYieldingObjectWithNumber(10);
    $secondStep = helper_getStepYieldingObjectWithNumber(11);

    $group = helper_addStepsToGroup(new Group(), $firstStep, $secondStep);

    expect(helper_invokeStepWithInput($group))->toHaveCount(1);

    $group->uniqueOutputs();

    // The identical second invocation yields nothing any more.
    expect(helper_invokeStepWithInput($group))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group))->toHaveCount(0);

    // Once the steps update the input using their output, the group yields an output again.
    $increment = fn (mixed $input) => $input + 1;

    helper_addUpdateInputUsingOutputCallbackToSteps($increment, $firstStep, $secondStep);

    expect(helper_invokeStepWithInput($group, new Input(1)))->toHaveCount(1);
});
it(
    'returns only unique outputs when outputs are objects and uniqueOutput was called with a property name from the ' .
    'output objects',
    function () {
        $firstStep = helper_getStepYieldingObjectWithNumber(21);
        $secondStep = helper_getStepYieldingObjectWithNumber(23);

        $group = helper_addStepsToGroup(new Group(), $firstStep, $secondStep);

        expect(helper_invokeStepWithInput($group))->toHaveCount(1);

        $group->resetAfterRun();

        // Uniqueness is decided by the 'number' property of the output objects.
        $group->uniqueOutputs('number');

        expect(helper_invokeStepWithInput($group))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group))->toHaveCount(0);

        $group->resetAfterRun();

        $increment = fn (mixed $input) => $input + 1;

        helper_addUpdateInputUsingOutputCallbackToSteps($increment, $firstStep, $secondStep);

        expect(helper_invokeStepWithInput($group, new Input(1)))->toHaveCount(1);
    },
);
it(
    'excludes the output of a step from the combined group output, when the excludeFromGroupOutput() method was called',
    function () {
        // Only the middle step is excluded from the combined output.
        $kept1 = helper_getValueReturningStep(['foo' => 'one']);
        $excluded = helper_getValueReturningStep(['bar' => 'two'])->excludeFromGroupOutput();
        $kept2 = helper_getValueReturningStep(['baz' => 'three']);

        $group = helper_addStepsToGroup(new Group(), $kept1, $excluded, $kept2);

        $result = helper_invokeStepWithInput($group);

        expect($result)->toHaveCount(1)
            ->and($result[0]->get())->toBe(['foo' => 'one', 'baz' => 'three']);
    },
);
test('You can update the input for further steps with the output of a step that is before those steps', function () {
    // The first step appends its own output to the input seen by later steps.
    $producingStep = helper_getValueReturningStep(' rocks')
        ->updateInputUsingOutput(fn(mixed $input, mixed $output) => $input . $output['foo']);

    $inputEchoStep = helper_getInputReturningStep();

    $group = (new Group())
        ->addStep($producingStep->outputKey('foo'))
        ->addStep($inputEchoStep->outputKey('bar'));

    $results = helper_invokeStepWithInput($group, 'crwlr.software');

    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['foo' => ' rocks', 'bar' => 'crwlr.software rocks']);
});
it('uses a key from array input when defined', function () {
    $inputEchoStep = helper_getInputReturningStep();

    $group = (new Group())
        ->addStep($inputEchoStep->outputKey('test'))
        ->useInputKey('bar');

    $arrayInput = new Input(['foo' => 'fooValue', 'bar' => 'barValue', 'baz' => 'bazValue']);

    $results = helper_invokeStepWithInput($group, $arrayInput);

    // Only the value under the chosen input key reaches the child step.
    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['test' => 'barValue']);
});
it('keeps the combined output with a certain key when keepAs() is used', function () {
    $fooStep = helper_getValueReturningStep('foo');
    $barStep = helper_getValueReturningStep('bar');

    $group = (new Group())
        ->addStep($fooStep->outputKey('key1'))
        ->addStep($barStep->outputKey('key2'))
        ->keepAs('test');

    $results = helper_invokeStepWithInput($group);

    // The whole combined array is kept under the key handed to keepAs().
    expect($results)->toHaveCount(1);
    expect($results[0]->keep)->toBe(['test' => ['key1' => 'foo', 'key2' => 'bar']]);
});
it('keeps all keys from a combined array output when keep() was called without argument', function () {
    $fooBarStep = helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']);
    $bazYoStep = helper_getValueReturningStep(['baz' => 'bazValue', 'yo' => 'lo']);

    $group = (new Group())
        ->addStep($fooBarStep)
        ->addStep($bazYoStep)
        ->keep();

    $results = helper_invokeStepWithInput($group);

    $expectedKeep = [
        'foo' => 'fooValue',
        'bar' => 'barValue',
        'baz' => 'bazValue',
        'yo' => 'lo',
    ];

    expect($results)->toHaveCount(1);
    expect($results[0]->keep)->toBe($expectedKeep);
});
it('keeps all defined keys from a combined array output when keep() was called with keys', function () {
    $fooBarStep = helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']);
    $bazYoStep = helper_getValueReturningStep(['baz' => 'bazValue', 'yo' => 'lo']);

    $group = (new Group())
        ->addStep($fooBarStep)
        ->addStep($bazYoStep)
        ->keep(['foo', 'baz', 'yo']);

    $results = helper_invokeStepWithInput($group);

    // "bar" was not requested, so it must not be kept.
    $expectedKeep = [
        'foo' => 'fooValue',
        'baz' => 'bazValue',
        'yo' => 'lo',
    ];

    expect($results)->toHaveCount(1);
    expect($results[0]->keep)->toBe($expectedKeep);
});
it('keeps data, when keep() is called on child steps', function () {
    $fooBarStep = helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']);
    $bazQuzStep = helper_getValueReturningStep(['baz' => 'bazValue', 'quz' => 'quzValue']);

    // keep() is configured on the child steps, not on the group itself.
    $group = (new Group())
        ->addStep($fooBarStep->keep('foo'))
        ->addStep($bazQuzStep->keep(['baz', 'quz']));

    $results = helper_invokeStepWithInput($group);

    $expectedKeep = [
        'foo' => 'fooValue',
        'baz' => 'bazValue',
        'quz' => 'quzValue',
    ];

    expect($results)->toHaveCount(1);
    expect($results[0]->keep)->toBe($expectedKeep);
});
it('keeps data, when keepAs() is called on child steps', function () {
    $scalarStep = helper_getValueReturningStep('fooValue');
    $arrayStep = helper_getValueReturningStep(['bar' => 'barValue', 'baz' => 'bazValue']);

    // keepAs() is configured on the child steps, not on the group itself.
    $group = (new Group())
        ->addStep($scalarStep->keepAs('foo'))
        ->addStep($arrayStep->keepAs('quz'));

    $results = helper_invokeStepWithInput($group);

    $expectedKeep = [
        'foo' => 'fooValue',
        'quz' => [
            'bar' => 'barValue',
            'baz' => 'bazValue',
        ],
    ];

    expect($results)->toHaveCount(1);
    expect($results[0]->keep)->toBe($expectedKeep);
});
test(
    'when steps yield multiple outputs it combines the first output from first step with first output from second ' .
    'step and so on.',
    function () {
        // Each step yields two outputs per invocation.
        $oneTwoStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield ['one' => 'foo'];

                yield ['two' => 'bar'];
            }
        };

        $threeFourStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield ['three' => 'baz'];

                yield ['four' => 'quz'];
            }
        };

        $group = (new Group())
            ->addStep($oneTwoStep)
            ->addStep($threeFourStep);

        $results = helper_invokeStepWithInput($group);

        // The n-th outputs of both steps end up merged into the n-th group output.
        expect($results)->toHaveCount(2);
        expect($results[0]->get())->toBe(['one' => 'foo', 'three' => 'baz']);
        expect($results[1]->get())->toBe(['two' => 'bar', 'four' => 'quz']);
    },
);
it('ignores the key set via outputKey because group step output is always an array', function () {
    $oneStep = helper_getValueReturningStep(['one' => 'foo']);
    $twoStep = helper_getValueReturningStep(['two' => 'bar']);

    $group = (new Group())
        ->addStep($oneStep)
        ->addStep($twoStep)
        ->outputKey('baz');

    $results = helper_invokeStepWithInput($group);

    // The output is not wrapped in a "baz" key.
    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['one' => 'foo', 'two' => 'bar']);
});
it(
    'keeps input data when keepFromInput() was called when outputs are combined',
    function () {
        $fooStep = helper_getValueReturningStep(['foo' => 'one']);
        $barStep = helper_getValueReturningStep(['bar' => 'two']);

        $group = (new Group())
            ->addStep($fooStep)
            ->addStep($barStep)
            ->keepFromInput();

        $results = helper_invokeStepWithInput($group, new Input(['baz' => 'three']));

        // Output data and kept input data are checked separately.
        expect($results)->toHaveCount(1);
        expect($results[0]->get())->toBe(['foo' => 'one', 'bar' => 'two']);
        expect($results[0]->keep)->toBe(['baz' => 'three']);
    },
);
it('keeps non array input data in array output with key', function () {
    $fooStep = helper_getValueReturningStep(['foo' => 'one']);
    $barStep = helper_getValueReturningStep(['bar' => 'two']);

    $group = (new Group())
        ->addStep($fooStep)
        ->addStep($barStep)
        ->keepInputAs('baz');

    $results = helper_invokeStepWithInput($group, new Input('three'));

    // The scalar input is kept under the key given to keepInputAs().
    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['foo' => 'one', 'bar' => 'two']);
    expect($results[0]->keep)->toBe(['baz' => 'three']);
});
it('keeps a value with unnamed key, when non array input should be kept but no key is defined', function () {
    $fooStep = helper_getValueReturningStep(['foo' => 'one']);
    $barStep = helper_getValueReturningStep(['bar' => 'two']);

    $group = (new Group())
        ->addStep($fooStep)
        ->addStep($barStep)
        ->keepFromInput();

    $results = helper_invokeStepWithInput($group, new Input('three'));

    // Without a key for the scalar input, it is expected under "unnamed1".
    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['foo' => 'one', 'bar' => 'two']);
    expect($results[0]->keep)->toBe(['unnamed1' => 'three']);
});
it('contains an element with a numeric key when it contains a step that yields non array output', function () {
    $scalarStep = helper_getValueReturningStep('one');
    $arrayStep = helper_getValueReturningStep(['bar' => 'two']);

    $group = (new Group())
        ->addStep($scalarStep)
        ->addStep($arrayStep);

    $results = helper_invokeStepWithInput($group);

    // The scalar output has no key of its own, so it appears under index 0.
    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe([0 => 'one', 'bar' => 'two']);
});
it('keeps array input data when some output is non array but converted to array using outputKey()', function () {
    $scalarStep = helper_getValueReturningStep('one');
    $arrayStep = helper_getValueReturningStep(['bar' => 'two']);

    $group = (new Group())
        ->addStep($scalarStep->outputKey('foo'))
        ->addStep($arrayStep)
        ->keepFromInput();

    $results = helper_invokeStepWithInput($group, new Input(['baz' => 'three']));

    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['foo' => 'one', 'bar' => 'two']);
    expect($results[0]->keep)->toBe(['baz' => 'three']);
});
it(
    'keeps an input value with an unnamed key, when it is a non array value and no key is defined (via keepInputAs())',
    function () {
        $scalarStep = helper_getValueReturningStep('one');
        $arrayStep = helper_getValueReturningStep(['bar' => 'two']);

        // keepFromInput() is used on purpose: no key is provided for the scalar input.
        $group = (new Group())
            ->addStep($scalarStep)
            ->addStep($arrayStep)
            ->keepFromInput();

        $results = helper_invokeStepWithInput($group, new Input('three'));

        expect($results)->toHaveCount(1);
        expect($results[0]->get())->toBe([0 => 'one', 'bar' => 'two']);
        expect($results[0]->keep)->toBe(['unnamed1' => 'three']);
    },
);
it('keeps the original input data when useInputKey() is used', function () {
    $fooStep = helper_getValueReturningStep(['foo' => 'one']);
    $barStep = helper_getValueReturningStep(['bar' => 'two']);

    $group = (new Group())
        ->addStep($fooStep)
        ->addStep($barStep)
        ->useInputKey('baz')
        ->keepFromInput();

    $originalInput = new Input(['baz' => 'three', 'quz' => 'four']);

    $results = helper_invokeStepWithInput($group, $originalInput);

    // The whole original input array is kept, not just the value under "baz".
    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['foo' => 'one', 'bar' => 'two']);
    expect($results[0]->keep)->toBe(['baz' => 'three', 'quz' => 'four']);
});
it('applies a Closure refiner to the steps output', function () {
    $fooStep = helper_getValueReturningStep(['foo' => 'one']);
    $barStep = helper_getValueReturningStep(['bar' => 'two']);

    // Modifies an existing key in place and appends a new one at the end.
    $refiner = function (mixed $outputValue) {
        $outputValue['bar'] .= ' refined';

        $outputValue['baz'] = 'three';

        return $outputValue;
    };

    $group = (new Group())
        ->addStep($fooStep)
        ->addStep($barStep)
        ->refineOutput($refiner);

    $results = helper_invokeStepWithInput($group);

    expect($results[0]->get())->toBe(['foo' => 'one', 'bar' => 'two refined', 'baz' => 'three']);
});
it('applies an instance of the RefinerInterface to the steps output', function () {
    $loremStep = helper_getValueReturningStep(['foo' => 'lorem ipsum dolor']);
    $barStep = helper_getValueReturningStep(['bar' => 'two']);

    $group = (new Group())
        ->addStep($loremStep)
        ->addStep($barStep);

    // The refiner targets only the "foo" key of the combined output.
    $group = $group->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'));

    $results = helper_invokeStepWithInput($group);

    expect($results[0]->get())->toBe(['foo' => 'ipsum', 'bar' => 'two']);
});
it('applies multiple refiners to the steps output in the order they\'re added', function () {
    $loremStep = helper_getValueReturningStep(['foo' => 'lorem ipsum dolor']);
    $barStep = helper_getValueReturningStep(['bar' => 'two']);

    $appendRefined = function (mixed $outputValue) {
        return $outputValue . ' refined';
    };

    // One refiner per output key.
    $group = (new Group())
        ->addStep($loremStep)
        ->addStep($barStep)
        ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'))
        ->refineOutput('bar', $appendRefined);

    $results = helper_invokeStepWithInput($group);

    expect($results[0]->get())->toBe(['foo' => 'ipsum', 'bar' => 'two refined']);
});
test('you can apply multiple refiners to the same output array key', function () {
    $loremStep = helper_getValueReturningStep(['foo' => 'lorem ipsum dolor']);
    $barStep = helper_getValueReturningStep(['bar' => 'two']);

    $appendRefined = function (mixed $outputValue) {
        return $outputValue . ' refined';
    };

    // Both refiners target "foo"; the second one works on the first one's result.
    $group = (new Group())
        ->addStep($loremStep)
        ->addStep($barStep)
        ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'))
        ->refineOutput('foo', $appendRefined);

    $results = helper_invokeStepWithInput($group);

    expect($results[0]->get())->toBe(['foo' => 'ipsum refined', 'bar' => 'two']);
});
it(
    'uses the original input value when applying a refiner, not only the value of an input array key chosen via ' .
    'useInputKey()',
    function () {
        $step1 = helper_getValueReturningStep(['foo' => 'one']);

        $step2 = helper_getValueReturningStep(['bar' => 'two']);

        // useInputKey() has to be called here, otherwise the test does not exercise what its title
        // claims: without it, the value handed to the steps and the original input are the same thing.
        $group = (new Group())
            ->addStep($step1)
            ->addStep($step2)
            ->useInputKey('yo')
            ->refineOutput(fn(mixed $outputValue, mixed $originalInputValue) => $originalInputValue);

        $outputs = helper_invokeStepWithInput($group, ['yo' => 'lo']);

        // The refiner returns its second argument, so the output equals the full original input
        // array and not just the 'lo' value selected via useInputKey().
        expect($outputs[0]->get())->toBe(['yo' => 'lo']);
    },
);
it('stops calling its steps and producing outputs when maxOutputs is reached', function () {
    // Both steps count how often their generator ran past its single yield.
    $fooStep = new class extends Step {
        public int $called = 0;

        protected function invoke(mixed $input): Generator
        {
            yield ['foo' => 'one'];

            $this->called++;
        }
    };

    $barStep = new class extends Step {
        public int $called = 0;

        protected function invoke(mixed $input): Generator
        {
            yield ['bar' => 'two'];

            $this->called++;
        }
    };

    $group = (new Group())
        ->addStep($fooStep)
        ->addStep($barStep)
        ->maxOutputs(2);

    $firstRun = helper_invokeStepWithInput($group, 'hey');
    expect($firstRun)->toHaveCount(1);

    $secondRun = helper_invokeStepWithInput($group, 'ho');
    expect($secondRun)->toHaveCount(1);

    // The limit of two outputs is reached now.
    $thirdRun = helper_invokeStepWithInput($group, 'hey');
    expect($thirdRun)->toHaveCount(0);

    expect($fooStep->called)->toBe(2);
    expect($barStep->called)->toBe(2);
});
it(
    'also stops creating outputs when maxOutputs is reached, when maxOutputs() was called before addStep()',
    function () {
        // Both steps count how often their generator ran past its single yield.
        $fooStep = new class extends Step {
            public int $called = 0;

            protected function invoke(mixed $input): Generator
            {
                yield ['foo' => 'one'];

                $this->called++;
            }
        };

        $barStep = new class extends Step {
            public int $called = 0;

            protected function invoke(mixed $input): Generator
            {
                yield ['bar' => 'two'];

                $this->called++;
            }
        };

        // Here the limit is configured before any step is added.
        $group = (new Group())
            ->maxOutputs(2)
            ->addStep($fooStep)
            ->addStep($barStep);

        $firstRun = helper_invokeStepWithInput($group, 'hey');
        expect($firstRun)->toHaveCount(1);

        $secondRun = helper_invokeStepWithInput($group, 'ho');
        expect($secondRun)->toHaveCount(1);

        $thirdRun = helper_invokeStepWithInput($group, 'hey');
        expect($thirdRun)->toHaveCount(0);

        expect($fooStep->called)->toBe(2);
        expect($barStep->called)->toBe(2);
    },
);
================================================
FILE: tests/Steps/Html/CssSelectorTest.php
================================================
throws(InvalidDomQueryException::class)->with(['.foo;', '.foo:before']);
test('The apply method returns a string for a single match', function () {
    // Fixture reconstructed from the assertion: the markup had been stripped, leaving only the
    // text "test", so the '.item' selector had no element to match.
    $html = '
        <div class="item">test</div>
    ';

    expect((new CssSelector('.item'))->apply(new HtmlDocument($html)))->toBe('test');
});
// NOTE(review): this block was two tests fused into one (the title talked about multiple matches
// while the body asserted a null result for a selector that matches nothing). Both tests are
// restored with minimal reconstructed fixtures; the second test's title is reconstructed too.
test('The apply method returns an array of strings for multiple matches', function () {
    $html = '
        <div class="item">test</div>
        <div class="item">test 2</div>
        <div class="item">test 3</div>
    ';

    expect((new CssSelector('.item'))->apply(new HtmlDocument($html)))->toBe(['test', 'test 2', 'test 3']);
});

test('The apply method returns null if nothing matches', function () {
    $html = '
        <div class="item">test</div>
    ';

    expect((new CssSelector('.aitem'))->apply(new HtmlDocument($html)))->toBeNull();
});
it('trims whitespace', function () {
    // Heredoc opener and markup reconstructed: "<<" followed by bare text was a parse error.
    // The text node is surrounded by whitespace on purpose, that is what gets trimmed.
    $html = <<<HTML
    <div class="item">
        test
    </div>
    HTML;

    expect((new CssSelector('.item'))->apply(new HtmlDocument($html)))->toBe('test');
});
it('contains inner tags when the html method is called', function () {
    // Fixture and expectation reconstructed: with all tags stripped the test could no longer
    // show that html() keeps inner tags (it compared against plain text "test sub").
    $html = '
        <div class="item">test <span>sub</span></div>
    ';

    expect((new CssSelector('.item'))->html()->apply(new HtmlDocument($html)))->toBe('test <span>sub</span>');
});
// NOTE(review): this block looks like three tests fused together by a markup-stripping step:
// the outerHtml() test named in the title, a formattedText() test using a custom Html2Text
// converter, and an attribute('data-attr') test. The HTML fixtures, the heredoc with the expected
// formatted text and the test headers in between are missing, so this does not parse as-is.
// TODO: restore the three original tests from version control; the expected Html2Text output
// cannot be reconstructed from what is left here.
it('contains also the outer tag when the outerHtml method is called', function () {
$html = '
';
$converter = new Html2Text();
$converter->removeConverter('ul');
expect((new CssSelector('#a'))->formattedText($converter)->apply(new HtmlDocument($html)))
->toBe(<<test';
expect((new CssSelector('.item'))->attribute('data-attr')->apply(new HtmlDocument($html)))->toBe('content');
});
test('getting an attribute value returns an empty string when the attribute does not exist', function () {
    // Fixture reconstructed: the element matched by '.item' exists but has no "foo" attribute.
    // Without any markup the selector would match nothing at all.
    $html = '
        <div class="item">test</div>
    ';

    expect((new CssSelector('.item'))->attribute('foo')->apply(new HtmlDocument($html)))->toBe('');
});
it('turns the value into an absolute url when toAbsoluteUrl() is called', function () {
    // Anchor markup reconstructed from the assertions below (the href is the relative path the
    // first expectation checks for); only the link text had survived.
    $html = '<a href="/packages/crawler/v0.4/getting-started">getting started</a>';

    $document = new HtmlDocument($html);

    $selector = new CssSelector('a');

    $selector->setBaseUrl('https://www.crwlr.software/')
        ->attribute('href');

    // Before toAbsoluteUrl() the raw attribute value is returned.
    expect($selector->apply($document))->toBe('/packages/crawler/v0.4/getting-started');

    $selector->toAbsoluteUrl();

    expect($selector->apply($document))->toBe('https://www.crwlr.software/packages/crawler/v0.4/getting-started');
});
it(
    'turns the value into the correct absolute url when toAbsoluteUrl() is called and the HTML contains a base tag',
    function () {
        // Heredoc and markup reconstructed from the assertions: resolving "e" against the base
        // href "/c/d" (itself resolved against the document URL) yields ".../c/e".
        $html = <<<HTML
        <!DOCTYPE html>
        <html>
        <head>
        <base href="/c/d" />
        </head>
        <body>
        <a href="e">link</a>
        </body>
        </html>
        HTML;

        $document = new HtmlDocument($html);

        $selector = new CssSelector('a');

        $selector->setBaseUrl('https://www.example.com/a/b')
            ->attribute('href');

        expect($selector->apply($document))->toBe('e');

        $selector->toAbsoluteUrl();

        expect($selector->apply($document))->toBe('https://www.example.com/c/e');
    },
);
it('gets an absolute link from the href attribute of a link element, when the link() method is called', function () {
    // Fixture reconstructed from the selector and the two expectations (text "Foo", href
    // "/foo/bar"); the original markup had been stripped completely.
    $html = '
        <div id="foo">
            <a class="bar" href="/foo/bar">Foo</a>
        </div>
    ';

    $document = new HtmlDocument($html);

    $selector = new CssSelector('#foo .bar');

    $selector->setBaseUrl('https://www.example.com/');

    // Default behavior: the text of the element.
    expect($selector->apply($document))->toBe('Foo');

    $selector->link();

    expect($selector->apply($document))->toBe('https://www.example.com/foo/bar');
});
it('gets only the first matching element when the first() method is called', function () {
    $firstItemSelector = (new CssSelector('#list .item'))->first();
    $document = new HtmlDocument(helper_getSimpleListHtml());

    // Expecting a single string, not an array.
    $selected = $firstItemSelector->apply($document);

    expect($selected)->toBe('one');
});
it('gets only the last matching element when the last() method is called', function () {
    $lastItemSelector = (new CssSelector('#list .item'))->last();
    $document = new HtmlDocument(helper_getSimpleListHtml());

    // Expecting a single string, not an array.
    $selected = $lastItemSelector->apply($document);

    expect($selected)->toBe('four');
});
it('gets only the nth matching element when the nth() method is called', function () {
    $thirdItemSelector = (new CssSelector('#list .item'))->nth(3);
    $document = new HtmlDocument(helper_getSimpleListHtml());

    // nth(3) is expected to select the item with text "three".
    $selected = $thirdItemSelector->apply($document);

    expect($selected)->toBe('three');
});
it('returns null when no nth matching element exists', function () {
    $fifthItemSelector = (new CssSelector('#list .item'))->nth(5);
    $document = new HtmlDocument(helper_getSimpleListHtml());

    // No fifth item is expected in the simple list fixture.
    $selected = $fifthItemSelector->apply($document);

    expect($selected)->toBeNull();
});
it('gets only even matching elements when the even() method is called', function () {
    $evenItemsSelector = (new CssSelector('#list .item'))->even();
    $document = new HtmlDocument(helper_getSimpleListHtml());

    // Expecting the second and fourth list items.
    $selected = $evenItemsSelector->apply($document);

    expect($selected)->toBe(['two', 'four']);
});
it('gets only odd matching elements when the odd() method is called', function () {
    $oddItemsSelector = (new CssSelector('#list .item'))->odd();
    $document = new HtmlDocument(helper_getSimpleListHtml());

    // Expecting the first and third list items.
    $selected = $oddItemsSelector->apply($document);

    expect($selected)->toBe(['one', 'three']);
});
================================================
FILE: tests/Steps/Html/Exceptions/InvalidDomQueryExceptionTest.php
================================================
getDomQuery())
->toBe('.foo:before')
->and($exception->getMessage())
->toBe('error');
});
it('can be created from a symfony SyntaxErrorException', function () {
    $symfonyException = new SyntaxErrorException('error message');

    $exception = InvalidDomQueryException::fromSymfonyException('.foo;', $symfonyException);

    // Query and message are checked independently.
    expect($exception->getDomQuery())->toBe('.foo;');
    expect($exception->getMessage())->toBe('error message');
});
it('can be created from a message and a query', function () {
    $domQuery = '.foo > .bar;';

    $exception = InvalidDomQueryException::make('message', $domQuery);

    // Query and message are checked independently.
    expect($exception->getDomQuery())->toBe('.foo > .bar;');
    expect($exception->getMessage())->toBe('message');
});
================================================
FILE: tests/Steps/Html/GetLinkTest.php
================================================
link'),
));
expect($link)->toHaveCount(1)
->and($link[0]->get())->toBe('https://www.crwl.io/blog');
});
it('logs an error message when fed with invalid input', function () {
    $dummyLogger = new DummyLogger();

    $step = (new GetLink())->addLogger($dummyLogger);

    // A bare Response is not a RespondedRequest, so the step has to reject it.
    $invalidInput = new Input(new Response());

    helper_traverseIterable($step->invokeStep($invalidInput));

    expect($dummyLogger->messages)->not->toBeEmpty();

    expect($dummyLogger->messages[0]['message'])->toBe(
        'The Crwlr\Crawler\Steps\Html\GetLink step was called with input that it can not work with: Input must ' .
        'be an instance of RespondedRequest.',
    );
});
test('When called without selector it just returns the first link', function () {
    $step = (new GetLink());

    // Response body reconstructed (the markup had been stripped to an empty string, so there was
    // no link to find): several relative links, of which only the first one must be returned.
    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.crwlr.software/packages/url/'),
        new Response(
            200,
            [],
            '<div>
                <a href="v0.1">v0.1</a>
                <a href="v1.0">v1.0</a>
                <a href="v1.1">v1.1</a>
            </div>',
        ),
    ));

    expect($link[0]->get())->toBe('https://www.crwlr.software/packages/url/v0.1');
});
test('When passing a CSS selector it selects the first matching link', function () {
    $step = (new GetLink('.matchingLink'));

    // Heredoc and markup reconstructed (only the link texts had survived and "<<" alone is a
    // parse error). More than one link matches the selector; the first match is expected.
    $responseHtml = <<<HTML
    <div>
    <a class="matchingLink" href="jobs">Jobs</a>
    <a class="matchingLink" href="numbers">Numbers</a>
    <a class="notMatchingLink" href="products">Products</a>
    </div>
    HTML;

    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.foo.bar/company/about'),
        new Response(200, [], $responseHtml),
    ));

    expect($link[0]->get())->toBe('https://www.foo.bar/company/jobs');
});
test('When selector matches on a non-link element it\'s ignored', function () {
    $step = (new GetLink('.link'));

    // Markup reconstructed from the remaining texts: a non-link element and a real link share
    // the "link" class; only the anchor may be returned.
    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], '<span class="link">not a link</span><a class="link" href="/foo">link</a>'),
    ));

    expect($link)->toHaveCount(1)
        ->and($link[0]->get())->toBe('https://www.otsch.codes/foo');
});
it('finds only links on the same domain when onSameDomain() was called', function () {
    // Heredoc and link targets reconstructed (markup was stripped): the first link points to a
    // foreign domain, the second to a subdomain of the request's domain.
    $html = <<<HTML
    <a href="https://www.crwlr.software/packages">link1</a>
    <a href="https://blog.otsch.codes/articles">link2</a>
    HTML;

    $step = (new GetLink())->onSameDomain();

    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($link)->toHaveCount(1)
        ->and($link[0]->get())->toBe('https://blog.otsch.codes/articles');
});
it('doesn\'t find a link on the same domain when notOnSameDomain() was called', function () {
    // Heredoc and link targets reconstructed (markup was stripped): the first link is on the
    // request's domain and must be skipped, the second one is on a foreign domain.
    $html = <<<HTML
    <a href="https://blog.otsch.codes/articles">link1</a>
    <a href="https://www.crwlr.software/packages">link2</a>
    HTML;

    $step = (new GetLink())->notOnSameDomain();

    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($link)->toHaveCount(1)
        ->and($link[0]->get())->toBe('https://www.crwlr.software/packages');
});
it('finds only links from domains the onDomain() method was called with', function () {
    // Heredoc and link targets reconstructed (markup was stripped): only the third link is on
    // the allowed domain.
    $html = <<<HTML
    <a href="https://www.otsch.codes/contact">link1</a>
    <a href="https://www.crwlr.software/packages">link2</a>
    <a href="https://www.example.com">link3</a>
    <a href="https://www.crwl.io">link4</a>
    HTML;

    $step = (new GetLink())->onDomain('example.com');

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://www.example.com');
});
test('onDomain() also takes an array of domains', function () {
    // Heredocs and link targets reconstructed (markup was stripped). Each document contains one
    // link on a foreign domain followed by one link on one of the two allowed domains.
    $html = <<<HTML
    <a href="https://www.crwlr.software/packages">link1</a>
    <a href="https://www.otsch.codes/contact">link2</a>
    HTML;

    $step = (new GetLink())->onDomain(['otsch.codes', 'example.com']);

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://www.otsch.codes/contact');

    $html = <<<HTML
    <a href="https://www.crwl.io">link1</a>
    <a href="https://www.example.com/foo">link2</a>
    HTML;

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://www.example.com/foo');
});
test('onDomain() can be called multiple times and merges all domains it was called with', function () {
    // Heredocs and link targets reconstructed (markup was stripped).
    // First document: a single link that is not on the only allowed domain (crwl.io).
    $html = <<<HTML
    <a href="https://www.otsch.codes/contact">link1</a>
    HTML;

    $step = (new GetLink())->onDomain('crwl.io');

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(0);

    $step->onDomain(['otsch.codes', 'crwlr.software']);

    // The domain from the first call must still be allowed after the second call.
    $html = <<<HTML
    <a href="https://www.example.com/foo">link1</a>
    <a href="https://www.crwl.io">link2</a>
    HTML;

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://www.crwl.io');

    // And a domain added by the second call is allowed as well.
    $html = <<<HTML
    <a href="https://www.example.com/foo">link1</a>
    <a href="https://www.otsch.codes/contact">link2</a>
    HTML;

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://www.otsch.codes/contact');
});
it('finds only links on the same host when onSameHost() was called', function () {
    // Heredoc and link targets reconstructed (markup was stripped): a different subdomain and a
    // foreign domain come first, only the third link is on the request's host.
    $html = <<<HTML
    <a href="https://blog.otsch.codes/articles">link1</a>
    <a href="https://www.crwlr.software/packages">link2</a>
    <a href="https://www.otsch.codes/contact">link3</a>
    HTML;

    $step = (new GetLink())->onSameHost();

    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($link)->toHaveCount(1)
        ->and($link[0]->get())->toBe('https://www.otsch.codes/contact');
});
it('doesn\'t find a link on the same host when notOnSameHost() was called', function () {
    // Heredoc and link targets reconstructed (markup was stripped): the first link is on the
    // request's host and must be skipped; the second is on another subdomain.
    $html = <<<HTML
    <a href="https://www.otsch.codes/contact">link1</a>
    <a href="https://jobs.otsch.codes">link2</a>
    HTML;

    $step = (new GetLink())->notOnSameHost();

    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($link)->toHaveCount(1)
        ->and($link[0]->get())->toBe('https://jobs.otsch.codes');
});
it('finds only links from hosts the onHost() method was called with', function () {
    // Heredoc and link targets reconstructed (markup was stripped): the second link is on the
    // same domain but a different host, only the third link is on the allowed host.
    $html = <<<HTML
    <a href="https://www.otsch.codes/contact">link1</a>
    <a href="https://blog.example.com/articles">link2</a>
    <a href="https://www.example.com">link3</a>
    <a href="https://www.crwlr.software/packages">link4</a>
    HTML;

    $step = (new GetLink())->onHost('www.example.com');

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://www.example.com');
});
test('onHost() also takes an array of hosts', function () {
    // Heredocs and link targets reconstructed (markup was stripped). In each document only the
    // last link is on one of the two allowed hosts.
    $html = <<<HTML
    <a href="https://blog.otsch.codes/articles">link1</a>
    <a href="https://www.otsch.codes/contact">link2</a>
    HTML;

    $step = (new GetLink())->onHost(['www.otsch.codes', 'blog.example.com']);

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://www.otsch.codes/contact');

    $html = <<<HTML
    <a href="https://www.example.com/foo">link1</a>
    <a href="https://www.crwlr.software/packages">link2</a>
    <a href="https://blog.example.com/articles/1">link3</a>
    HTML;

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://blog.example.com/articles/1');
});
test('onHost() can be called multiple times and merges all hosts it was called with', function () {
    // Heredocs and link targets reconstructed (markup was stripped).
    // First document: a single link that is not on the only allowed host (www.crwl.io).
    $html = <<<HTML
    <a href="https://www.otsch.codes/contact">link1</a>
    HTML;

    $step = (new GetLink())->onHost('www.crwl.io');

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(0);

    $step->onHost(['www.otsch.codes', 'www.crwlr.software']);

    // The host from the first call must still be allowed after the second call.
    $html = <<<HTML
    <a href="https://www.crwl.io">link1</a>
    HTML;

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://www.crwl.io');

    // And a host added by the second call is allowed as well (a different subdomain is not).
    $html = <<<HTML
    <a href="https://jobs.otsch.codes">link1</a>
    <a href="https://www.otsch.codes/blog">link2</a>
    HTML;

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://www.otsch.codes/blog');
});
it('works correctly when HTML contains a base tag', function () {
    // Heredoc and markup reconstructed from the assertion: the relative href "e" resolved
    // against a base href of "/c/d" yields ".../c/e" (instead of ".../a/e" without base tag).
    $html = <<<HTML
    <!DOCTYPE html>
    <html>
    <head>
    <base href="/c/d" />
    </head>
    <body>
    <a href="e">link</a>
    </body>
    </html>
    HTML;

    $step = (new GetLink());

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.example.com/a/b'),
        new Response(200, [], $html),
    ));

    expect($links[0]->get())->toBe('https://www.example.com/c/e');
});
it('throws away the URL fragment part when withoutFragment() was called', function () {
    // Heredoc and markup reconstructed from the assertions: one link whose href carries a
    // fragment.
    $html = <<<HTML
    <!DOCTYPE html>
    <html>
    <head></head>
    <body>
    <a href="/foo/bar#fragment">link</a>
    </body>
    </html>
    HTML;

    $step = (new GetLink());

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/foo/baz'),
        new Response(200, [], $html),
    );

    // By default the fragment is kept.
    $links = helper_invokeStepWithInput($step, $respondedRequest);

    expect($links[0]->get())->toBe('https://www.example.com/foo/bar#fragment');

    $step->withoutFragment();

    $links = helper_invokeStepWithInput($step, $respondedRequest);

    expect($links[0]->get())->toBe('https://www.example.com/foo/bar');
});
it('ignores special non HTTP links', function () {
    // Heredoc and markup reconstructed from the remaining link texts: mailto, javascript and
    // phone links come first and have to be skipped in favor of the regular link.
    $html = <<<HTML
    <!DOCTYPE html>
    <html>
    <head></head>
    <body>
    <a href="mailto:somebody@example.com">mailto link</a>
    <a href="javascript:alert('hello');">javascript link</a>
    <a href="tel:+499123456789">phone link</a>
    <a href="/foo/bar">link</a>
    </body>
    </html>
    HTML;

    $step = (new GetLink());

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/home'),
        new Response(200, [], $html),
    );

    $links = helper_invokeStepWithInput($step, $respondedRequest);

    expect($links[0]->get())->toBe('https://www.example.com/foo/bar');
});
================================================
FILE: tests/Steps/Html/GetLinksTest.php
================================================
link'),
));
expect($links)->toHaveCount(1)
->and($links[0]->get())->toBe('https://www.example.com/blog');
});
it('logs an error message when fed with invalid input', function () {
    $dummyLogger = new DummyLogger();

    $step = (new GetLinks())->addLogger($dummyLogger);

    // A plain object is not a RespondedRequest, so the step has to reject it.
    $invalidInput = new Input(new stdClass());

    helper_traverseIterable($step->invokeStep($invalidInput));

    expect($dummyLogger->messages)->not->toBeEmpty();

    expect($dummyLogger->messages[0]['message'])->toBe(
        'The Crwlr\Crawler\Steps\Html\GetLinks step was called with input that it can not work with: Input must ' .
        'be an instance of RespondedRequest.',
    );
});
test('When called without selector it just gets all links', function () {
    $step = (new GetLinks());

    // Response body reconstructed from the assertions (the markup had been stripped to an empty
    // string): three relative links, resolved against the request URL.
    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.crwlr.software/packages/url/'),
        new Response(
            200,
            [],
            '<div>
                <a href="v0.1">v0.1</a>
                <a href="v1.0">v1.0</a>
                <a href="v1.1">v1.1</a>
            </div>',
        ),
    ));

    expect($links[0]->get())->toBe('https://www.crwlr.software/packages/url/v0.1')
        ->and($links[1]->get())->toBe('https://www.crwlr.software/packages/url/v1.0')
        ->and($links[2]->get())->toBe('https://www.crwlr.software/packages/url/v1.1');
});
test('When passing a CSS selector it only selects matching links', function () {
    $step = (new GetLinks('.matchingLink'));

    // Heredoc and markup reconstructed from the remaining link texts and the assertions: the
    // "Products" link does not carry the matching class and must be left out.
    $responseHtml = <<<HTML
    <div>
    <a class="matchingLink" href="jobs">Jobs</a>
    <a class="matchingLink" href="numbers">Numbers</a>
    <a class="notMatchingLink" href="products">Products</a>
    <a class="matchingLink" href="/team">Team</a>
    </div>
    HTML;

    $outputs = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.example.com/company/about'),
        new Response(200, [], $responseHtml),
    ));

    expect($outputs)->toHaveCount(3)
        ->and($outputs[0]->get())->toBe('https://www.example.com/company/jobs')
        ->and($outputs[1]->get())->toBe('https://www.example.com/company/numbers')
        ->and($outputs[2]->get())->toBe('https://www.example.com/team');
});
test('When selector matches on a non-link element it\'s ignored', function () {
    $step = (new GetLinks('.link'));

    // Markup reconstructed from the remaining texts: a real link and a non-link element share
    // the "link" class; only the anchor may be returned.
    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], '<a class="link" href="/foo">Foo</a><span class="link">Bar</span>'),
    ));

    expect($links)->toHaveCount(1)
        ->and($links[0]->get())->toBe('https://www.otsch.codes/foo');
});
it('finds only links on the same domain when onSameDomain() was called', function () {
    // Heredoc and link targets reconstructed (markup was stripped): the second link is on a
    // foreign domain, the other two are on the request's domain (different hosts).
    $html = <<<HTML
    <a href="https://blog.otsch.codes/articles">link1</a>
    <a href="https://www.crwlr.software/packages">link2</a>
    <a href="https://www.otsch.codes/blog">link3</a>
    HTML;

    $step = (new GetLinks())->onSameDomain();

    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($link)->toHaveCount(2)
        ->and($link[0]->get())->toBe('https://blog.otsch.codes/articles')
        ->and($link[1]->get())->toBe('https://www.otsch.codes/blog');
});
it('doesn\'t find links on the same domain when notOnSameDomain() was called', function () {
    // Heredoc and link targets reconstructed (markup was stripped): the second link is on the
    // request's domain and must be left out.
    $html = <<<HTML
    <a href="https://www.crwlr.software/packages">link1</a>
    <a href="https://www.otsch.codes/blog">link2</a>
    <a href="https://www.example.com/foo">link3</a>
    HTML;

    $step = (new GetLinks())->notOnSameDomain();

    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($link)->toHaveCount(2)
        ->and($link[0]->get())->toBe('https://www.crwlr.software/packages')
        ->and($link[1]->get())->toBe('https://www.example.com/foo');
});
it('finds only links from domains the onDomain() method was called with', function () {
    // Heredoc and link targets reconstructed (markup was stripped): the second and the fourth
    // link are on the allowed domain.
    $html = <<<HTML
    <a href="https://www.otsch.codes/contact">link1</a>
    <a href="https://www.crwlr.software/packages">link2</a>
    <a href="https://www.example.com">link3</a>
    <a href="https://www.crwlr.software/blog">link4</a>
    HTML;

    $step = (new GetLinks())->onDomain('crwlr.software');

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(2)
        ->and($links[0]->get())->toBe('https://www.crwlr.software/packages')
        ->and($links[1]->get())->toBe('https://www.crwlr.software/blog');
});
test('onDomain() also takes an array of domains', function () {
    // Heredoc and link targets reconstructed (markup was stripped): the first two links are on
    // the allowed domains, the third one is not.
    $html = <<<HTML
    <a href="https://www.otsch.codes/contact">link1</a>
    <a href="https://www.crwlr.software/packages">link2</a>
    <a href="https://www.example.com">link3</a>
    HTML;

    $step = (new GetLinks())->onDomain(['otsch.codes', 'crwlr.software']);

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(2)
        ->and($links[0]->get())->toBe('https://www.otsch.codes/contact')
        ->and($links[1]->get())->toBe('https://www.crwlr.software/packages');
});
test('onDomain() can be called multiple times and merges all domains it was called with', function () {
    // Heredoc and link targets reconstructed (markup was stripped): three links on three
    // different domains, none of them on crwl.io.
    $html = <<<HTML
    <a href="https://www.otsch.codes/contact">link1</a>
    <a href="https://www.crwlr.software/packages">link2</a>
    <a href="https://www.example.com/foo">link3</a>
    HTML;

    $step = (new GetLinks())->onDomain('crwl.io');

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(0);

    // Every further call adds domains instead of replacing the previous ones.
    $step->onDomain(['otsch.codes', 'crwlr.software']);

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(2);

    $step->onDomain('example.com');

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($links)->toHaveCount(3);
});
it('finds only links on the same host when onSameHost() was called', function () {
    // Heredoc and link targets reconstructed (markup was stripped): the first and the fourth
    // link are on the request's host; the others are another subdomain and a foreign domain.
    $html = <<<HTML
    <a href="https://www.otsch.codes/contact">link1</a>
    <a href="https://jobs.otsch.codes">link2</a>
    <a href="https://www.crwlr.software/packages">link3</a>
    <a href="https://www.otsch.codes/blog">link4</a>
    HTML;

    $step = (new GetLinks())->onSameHost();

    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    ));

    expect($link)->toHaveCount(2)
        ->and($link[0]->get())->toBe('https://www.otsch.codes/contact')
        ->and($link[1]->get())->toBe('https://www.otsch.codes/blog');
});
// With notOnSameHost() and a request to https://www.otsch.codes, the two expected outputs are on
// other hosts (jobs.otsch.codes and www.crwlr.software).
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('doesn\'t find links on the same host when notOnSameHost() was called', function () {
$html = <<link1
link2link3
HTML;
$step = (new GetLinks())->notOnSameHost();
$link = helper_invokeStepWithInput($step, new RespondedRequest(
new Request('GET', 'https://www.otsch.codes'),
new Response(200, [], $html),
));
expect($link)->toHaveCount(2)
->and($link[0]->get())->toBe('https://jobs.otsch.codes')
->and($link[1]->get())->toBe('https://www.crwlr.software/packages');
});
// onHost('www.crwlr.software') is set; both expected outputs are URLs on that host.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('finds only links from hosts the onHost() method was called with', function () {
$html = <<link1
link2link3link4
HTML;
$step = (new GetLinks())->onHost('www.crwlr.software');
$links = helper_invokeStepWithInput($step, new RespondedRequest(
new Request('GET', 'https://www.otsch.codes'),
new Response(200, [], $html),
));
expect($links)->toHaveCount(2)
->and($links[0]->get())->toBe('https://www.crwlr.software/packages')
->and($links[1]->get())->toBe('https://www.crwlr.software/packages/crawler/v0.4/getting-started');
});
// onHost() is called once with two hosts; the same step is then run against two different
// fixtures, expecting one link for the first and two links for the second.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
test('onHost() also takes an array of hosts', function () {
$html = <<link1
link2
HTML;
$step = (new GetLinks())->onHost(['www.otsch.codes', 'blog.example.com']);
$links = helper_invokeStepWithInput($step, new RespondedRequest(
new Request('GET', 'https://www.otsch.codes'),
new Response(200, [], $html),
));
expect($links)->toHaveCount(1)
->and($links[0]->get())->toBe('https://www.otsch.codes/contact');
// Second fixture: the second expected output is on blog.example.com.
$html = <<link1
link2link3
HTML;
$links = helper_invokeStepWithInput($step, new RespondedRequest(
new Request('GET', 'https://www.otsch.codes'),
new Response(200, [], $html),
));
expect($links)->toHaveCount(2)
->and($links[1]->get())->toBe('https://blog.example.com/articles/1');
});
// Hosts passed in separate onHost() calls are expected to accumulate: after the second call the
// step still outputs a www.crwl.io link, the host given in the first call.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
test('onHost() can be called multiple times and merges all hosts it was called with', function () {
$html = <<link1
HTML;
// Only 'www.crwl.io' allowed; this fixture is expected to yield no links.
$step = (new GetLinks())->onHost('www.crwl.io');
$links = helper_invokeStepWithInput($step, new RespondedRequest(
new Request('GET', 'https://www.otsch.codes'),
new Response(200, [], $html),
));
expect($links)->toHaveCount(0);
$step->onHost(['www.otsch.codes', 'www.crwlr.software']);
$html = <<link1
HTML;
$links = helper_invokeStepWithInput($step, new RespondedRequest(
new Request('GET', 'https://www.otsch.codes'),
new Response(200, [], $html),
));
expect($links)->toHaveCount(1)
->and($links[0]->get())->toBe('https://www.crwl.io');
// Third fixture: links on two of the allowed hosts are expected.
$html = <<link1
link2
HTML;
$links = helper_invokeStepWithInput($step, new RespondedRequest(
new Request('GET', 'https://www.otsch.codes'),
new Response(200, [], $html),
));
expect($links)->toHaveCount(2)
->and($links[0]->get())->toBe('https://www.otsch.codes/blog')
->and($links[1]->get())->toBe('https://www.crwl.io');
});
// The request URL is https://www.example.com/a/b, but the expected absolute URLs are under /c/ and
// /f/ — per the test name, the fixture's base tag influences how links are resolved.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('works correctly when HTML contains a base tag', function () {
$html = <<
linklink2link3
HTML;
$step = (new GetLinks());
$links = helper_invokeStepWithInput($step, new RespondedRequest(
new Request('GET', 'https://www.example.com/a/b'),
new Response(200, [], $html),
));
expect($links[0]->get())->toBe('https://www.example.com/c/e')
->and($links[1]->get())->toBe('https://www.example.com/f/g')
->and($links[2]->get())->toBe('https://www.example.com/c/h');
});
// The step is run twice on the same responded request: first with defaults (fragments are kept in
// the expected URLs), then after withoutFragment() (fragments are gone).
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('throws away the URL fragment part when withoutFragment() was called', function () {
$html = <<
link another link
HTML;
$step = (new GetLinks());
$respondedRequest = new RespondedRequest(
new Request('GET', 'https://www.example.com/foo/baz'),
new Response(200, [], $html),
);
$links = helper_invokeStepWithInput($step, $respondedRequest);
expect($links[0]->get())->toBe('https://www.example.com/foo/bar#fragment')
->and($links[1]->get())->toBe('https://www.example.com/baz#quz-fragment');
// Same step instance, now configured to strip fragments.
$step->withoutFragment();
$links = helper_invokeStepWithInput($step, $respondedRequest);
expect($links[0]->get())->toBe('https://www.example.com/foo/bar')
->and($links[1]->get())->toBe('https://www.example.com/baz');
});
// Only three links are expected in the output, all of them https URLs on www.example.com.
// NOTE(review): the heredoc fixture markup appears stripped in this dump (the link texts suggest mailto,
// javascript and phone links in the original) — confirm against the original file.
it('ignores special non HTTP links', function () {
$html = <<
mailto linklink onejavascript linklink twophone linklink three
HTML;
$step = (new GetLinks());
$respondedRequest = new RespondedRequest(
new Request('GET', 'https://www.example.com/home'),
new Response(200, [], $html),
);
$links = helper_invokeStepWithInput($step, $respondedRequest);
expect($links)->toHaveCount(3)
->and($links[0]->get())->toBe('https://www.example.com/one')
->and($links[1]->get())->toBe('https://www.example.com/two')
->and($links[2]->get())->toBe('https://www.example.com/three');
});
================================================
FILE: tests/Steps/Html/MetaDataTest.php
================================================
Hello World!
HTML;
$outputs = helper_invokeStepWithInput(new MetaData(), $html);
expect($outputs[0]->get())->toBe(['title' => '']);
});
// The MetaData step is given the HTML as a plain string; its single output is expected to be an
// array with 'title' first, followed by the meta entries keyed by their name/property.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('returns an array with the title and all meta tags having a name or property attribute', function () {
$html = <<
Hello World!
Hello World!
HTML;
$outputs = helper_invokeStepWithInput(new MetaData(), $html);
expect($outputs[0]->get())->toBe([
'title' => 'Hello World!',
'description' => 'This is a page saying: Hello World!',
'keywords' => 'lorem, ipsum, hello, world',
'og:title' => 'Hello World!',
'og:type' => 'website',
]);
});
// only(['description', 'og:title']) is applied; the expected output contains just those two keys
// (no 'title' entry).
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('returns only the meta tags defined via the only() method', function () {
$html = <<
Hello World!
Hello World!
HTML;
$outputs = helper_invokeStepWithInput(Html::metaData()->only(['description', 'og:title']), $html);
expect($outputs[0]->get())->toBe([
'description' => 'This is a page saying: Hello World!',
'og:title' => 'Hello World!',
]);
});
================================================
FILE: tests/Steps/Html/SchemaOrgTest.php
================================================
Foo Bar
Baz
Other content
HTML;
}
/**
 * Returns the HTML fixture shared by the tests in this file that expect several schema.org
 * objects in one document.
 *
 * NOTE(review): the heredoc markup (including any JSON-LD script blocks) appears stripped in this
 * dump — confirm against the original file.
 */
function helper_schemaOrgExampleMultipleObjects(): string
{
return <<
Foo Bar
Some Article
This is some article about something.
HTML;
}
// The single output of the schemaOrg step is expected to be a JobPosting instance.
it('extracts schema.org data in JSON-LD format from an HTML document', function () {
    $document = helper_schemaOrgExampleOneJobPostingInBody();

    $results = helper_invokeStepWithInput(Html::schemaOrg(), $document);

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBeInstanceOf(JobPosting::class);
});
// With toArray(), the output and its nested 'hiringOrganization' entry are expected to be arrays.
it('converts the spatie schema.org objects to arrays when calling the toArray() method', function () {
    $document = helper_schemaOrgExampleOneJobPostingInBody();

    $results = helper_invokeStepWithInput(Html::schemaOrg()->toArray(), $document);

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBeArray()
        ->and($results[0]->get()['hiringOrganization'])->toBeArray()
        ->and($results[0]->get()['hiringOrganization'])->toHaveKey('name')
        ->and($results[0]->get()['hiringOrganization']['name'])->toBe('Foo Ltd.');
});
// Without any type filter, three outputs are expected for the multi-object fixture.
it('gets all the schema.org objects contained in a document', function () {
    $document = helper_schemaOrgExampleMultipleObjects();

    $results = helper_invokeStepWithInput(Html::schemaOrg(), $document);

    expect($results)->toHaveCount(3);
});
// onlyType('Article') is expected to reduce the outputs to a single Article instance.
it('gets only schema.org objects of a certain type if you use the onlyType method', function () {
    $document = helper_schemaOrgExampleMultipleObjects();
    $articleStep = Html::schemaOrg()->onlyType('Article');

    $results = helper_invokeStepWithInput($articleStep, $document);

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBeInstanceOf(Article::class);
});
// Filtering for 'Organization' is expected to yield two objects, identified here by their names.
it('also finds schema.org objects of a certain type in children of another schema.org object', function () {
    $document = helper_schemaOrgExampleMultipleObjects();
    $organizationStep = Html::schemaOrg()->onlyType('Organization');

    $results = helper_invokeStepWithInput($organizationStep, $document);

    expect($results)->toHaveCount(2)
        ->and($results[0]->get()->getProperty('name'))->toBe('Example Company')
        ->and($results[1]->get()->getProperty('name'))->toBe('Some Organization, Inc.');
});
// extract() takes plain property names as well as an output key mapped to a dot-notation path
// ('publisher' => 'publisher.name'); the output is expected to be a flat array.
it('extracts certain data from schema.org objects when using the extract() method', function () {
    $document = helper_schemaOrgExampleMultipleObjects();
    $step = Html::schemaOrg()
        ->onlyType('Article')
        ->extract(['url', 'headline', 'publisher' => 'publisher.name']);

    $results = helper_invokeStepWithInput($step, $document);

    $expectedArticle = [
        'url' => 'https://de.example.org/articles/some',
        'headline' => 'This is some article about something.',
        'publisher' => 'Some Organization, Inc.',
    ];

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBe($expectedArticle);
});
// 'alternativeHeadline' is requested via extract() and is expected to come back as null.
test('If an object doesn\'t contain a property from the extract mapping, it\'s just null in the output', function () {
    $document = helper_schemaOrgExampleMultipleObjects();
    $step = Html::schemaOrg()
        ->onlyType('Article')
        ->extract(['url', 'headline', 'alternativeHeadline']);

    $results = helper_invokeStepWithInput($step, $document);

    $expectedArticle = [
        'url' => 'https://de.example.org/articles/some',
        'headline' => 'This is some article about something.',
        'alternativeHeadline' => null,
    ];

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBe($expectedArticle);
});
================================================
FILE: tests/Steps/Html/XPathQueryTest.php
================================================
throws(InvalidDomQueryException::class);
// apply() on an XmlDocument is expected to return the plain string 'test' here.
test('The apply method returns a string for a single match', function () {
    $query = new XPathQuery('//item');
    $document = new XmlDocument('test');

    expect($query->apply($document))->toBe('test');
});
// apply() on an HtmlDocument is expected to return a list of three strings here.
test('The apply method returns an array of strings for multiple matches', function () {
    $query = new XPathQuery('//item');
    $document = new HtmlDocument('testtest 2 subtest 3');

    $expectedTexts = ['test', 'test 2 sub', 'test 3'];

    expect($query->apply($document))->toBe($expectedTexts);
});
// The query targets '//aitem' and apply() is expected to return null.
test('The apply method returns null if nothing matches', function () {
    $query = new XPathQuery('//aitem');
    $document = new XmlDocument('test');

    expect($query->apply($document))->toBeNull();
});
// The value returned for '//item' is expected to be exactly 'test', without surrounding whitespace.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('trims whitespace', function () {
$xml = <<
test
XML;
expect((new XPathQuery('//item'))->apply(new XmlDocument($xml)))->toBe('test');
});
// html() is enabled on the query before it is applied to the XML document.
it('contains inner tags when the html method is called', function () {
    $innerHtmlQuery = (new XPathQuery('//item'))->html();
    $document = new XmlDocument('test sub');

    expect($innerHtmlQuery->apply($document))->toBe('test sub');
});
// outerHtml() is enabled on the query before it is applied to the XML document.
it('contains also the outer tag when the outerHtml method is called', function () {
    $outerHtmlQuery = (new XPathQuery('//item'))->outerHtml();
    $document = new XmlDocument('test sub');

    expect($outerHtmlQuery->apply($document))->toBe('test sub');
});
// With attribute('attr') the query result is expected to be the string 'content'.
it('gets the contents of an attribute using the attribute method', function () {
    $attributeQuery = (new XPathQuery('//item'))->attribute('attr');
    $document = new XmlDocument('test');

    expect($attributeQuery->apply($document))->toBe('content');
});
// With attribute('attr') the query result is expected to be an empty string, not null.
test('getting an attribute value returns an empty string when the attribute does not exist', function () {
    $attributeQuery = (new XPathQuery('//item'))->attribute('attr');
    $document = new XmlDocument('test');

    expect($attributeQuery->apply($document))->toBe('');
});
// The same query is applied twice: a base URL alone leaves the value unchanged; only after
// toAbsoluteUrl() is the absolute URL expected.
it('turns the value into an absolute url when toAbsoluteUrl() is called', function () {
    $doc = new XmlDocument('/foo/bar');

    $itemQuery = (new XPathQuery('//item'))->setBaseUrl('https://www.example.com');

    $valueBefore = $itemQuery->apply($doc);

    expect($valueBefore)->toBe('/foo/bar');

    $itemQuery->toAbsoluteUrl();

    $valueAfter = $itemQuery->apply($doc);

    expect($valueAfter)->toBe('https://www.example.com/foo/bar');
});
// The same query is applied twice: before link() the expected result is the text 'Foo',
// afterwards the absolute URL built from the configured base URL.
it('gets an absolute link from the href attribute of a link element, when the link() method is called', function () {
    $markup = '
';

    $doc = new HtmlDocument($markup);

    $linkQuery = (new XPathQuery('//*[@id=\'foo\']/a[@class=\'bar\']'))->setBaseUrl('https://www.example.com/');

    $valueBefore = $linkQuery->apply($doc);

    expect($valueBefore)->toBe('Foo');

    $linkQuery->link();

    $valueAfter = $linkQuery->apply($doc);

    expect($valueAfter)->toBe('https://www.example.com/foo/bar');
});
// first() is expected to reduce the result to the single string 'one'.
it('gets only the first matching element when the first() method is called', function () {
    $firstItemQuery = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->first();
    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($firstItemQuery->apply($document))->toBe('one');
});
// last() is expected to reduce the result to the single string 'four'.
it('gets only the last matching element when the last() method is called', function () {
    $lastItemQuery = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->last();
    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($lastItemQuery->apply($document))->toBe('four');
});
// nth(3) is expected to yield 'three', i.e. the position is counted from 1.
it('gets only the nth matching element when the nth() method is called', function () {
    $thirdItemQuery = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->nth(3);
    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($thirdItemQuery->apply($document))->toBe('three');
});
// nth(5) is expected to produce null for the simple list fixture.
it('returns null when no nth matching element exists', function () {
    $fifthItemQuery = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->nth(5);
    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($fifthItemQuery->apply($document))->toBeNull();
});
// even() is expected to keep the second and fourth items.
it('gets only even matching elements when the even() method is called', function () {
    $evenItemsQuery = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->even();
    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($evenItemsQuery->apply($document))->toBe(['two', 'four']);
});
// odd() is expected to keep the first and third items.
it('gets only odd matching elements when the odd() method is called', function () {
    $oddItemsQuery = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->odd();
    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($oddItemsQuery->apply($document))->toBe(['one', 'three']);
});
================================================
FILE: tests/Steps/HtmlTest.php
================================================
extract('.title'),
helper_getHtmlContent('bookstore.html'),
);
expect($output)->toHaveCount(4)
->and($output[0]->get())->toBe('Everyday Italian')
->and($output[3]->get())->toBe('Learning XML');
});
// Plain strings are passed to each() and extract(); one output per book is expected, and a book
// with several authors gets an array under 'author'.
it('extracts data from an HTML document with CSS selectors by default', function () {
    $step = Html::each('#bookstore .book')->extract([
        'title' => '.title',
        'author' => '.author',
        'year' => '.year',
    ]);

    $books = helper_invokeStepWithInput($step, helper_getHtmlContent('bookstore.html'));

    expect($books)->toHaveCount(4)
        ->and($books[0]->get())->toBe([
            'title' => 'Everyday Italian',
            'author' => 'Giada De Laurentiis',
            'year' => '2005',
        ])
        ->and($books[1]->get())->toBe([
            'title' => 'Harry Potter',
            'author' => 'J K. Rowling',
            'year' => '2005',
        ])
        ->and($books[2]->get())->toBe([
            'title' => 'XQuery Kick Start',
            'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'],
            'year' => '2003',
        ])
        ->and($books[3]->get())->toBe([
            'title' => 'Learning XML',
            'author' => 'Erik T. Ray',
            'year' => '2003',
        ]);
});
// Both the each() base selector and every extract() mapping entry are Dom::xPath() queries here.
it('can also extract data using XPath queries', function () {
    $step = Html::each(Dom::xPath('//div[@id=\'bookstore\']/div[@class=\'book\']'))->extract([
        'title' => Dom::xPath('//h3[@class=\'title\']'),
        'author' => Dom::xPath('//*[@class=\'author\']'),
        'year' => Dom::xPath('//span[@class=\'year\']'),
    ]);

    $books = helper_invokeStepWithInput($step, helper_getHtmlContent('bookstore.html'));

    $expectedThirdBook = [
        'title' => 'XQuery Kick Start',
        'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'],
        'year' => '2003',
    ];

    expect($books)->toHaveCount(4)
        ->and($books[2]->get())->toBe($expectedThirdBook);
});
// With root() as base, a single output is expected whose 'title' entry lists all four titles.
it('returns only one (compound) output when the root method is used', function () {
    $step = Html::root()->extract(['title' => '.title', 'author' => '.author', 'year' => '.year']);

    $results = helper_invokeStepWithInput($step, helper_getHtmlContent('bookstore.html'));

    $allTitles = ['Everyday Italian', 'Harry Potter', 'XQuery Kick Start', 'Learning XML'];

    expect($results)->toHaveCount(1)
        ->and($results[0]->get()['title'])->toBe($allTitles);
});
// Html::first() is expected to yield exactly one output: the first book's data.
it('extracts the data of the first matching element when the first method is used', function () {
    $step = Html::first('#bookstore .book')->extract([
        'title' => '.title',
        'author' => '.author',
        'year' => '.year',
    ]);

    $results = helper_invokeStepWithInput($step, helper_getHtmlContent('bookstore.html'));

    $expectedBook = ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'];

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBe($expectedBook);
});
// Html::last() is expected to yield exactly one output: the last book's data.
it('extracts the data of the last matching element when the last method is used', function () {
    $step = Html::last('#bookstore .book')->extract([
        'title' => '.title',
        'author' => '.author',
        'year' => '.year',
    ]);

    $results = helper_invokeStepWithInput($step, helper_getHtmlContent('bookstore.html'));

    $expectedBook = ['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003'];

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBe($expectedBook);
});
// An Html::each() step is used as the value of the 'talks' mapping key; the expected output
// holds a list of per-talk arrays under that key.
test(
    'you can extract data in a second level to the output array using another Html step as an element in the mapping ' .
    'array',
    function () {
        $meetupPage = new RespondedRequest(
            new Request('GET', 'https://www.example.com/meetups/some-meetup/'),
            new Response(body: helper_getHtmlContent('event.html')),
        );

        // The slides href is converted with toAbsoluteUrl(); the expected values below are
        // absolute URLs under the request URL.
        $talksStep = Html::each('#event .talks .talk')->extract([
            'title' => '.title',
            'speaker' => '.speaker',
            'slides' => Dom::cssSelector('.slidesLink')->attribute('href')->toAbsoluteUrl(),
        ]);

        $eventStep = Html::root()->extract([
            'title' => '#event h1',
            'location' => '#event .location',
            'date' => '#event .date',
            'talks' => $talksStep,
        ]);

        $results = helper_invokeStepWithInput($eventStep, $meetupPage);

        $expectedTalks = [
            [
                'title' => 'Sophisticated talk title',
                'speaker' => 'Super Mario',
                'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk1.pdf',
            ],
            [
                'title' => 'Simple beginner talk',
                'speaker' => 'Luigi',
                'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk2.pdf',
            ],
            [
                'title' => 'Fun talk',
                'speaker' => 'Princess Peach',
                'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk3.pdf',
            ],
        ];

        expect($results)->toHaveCount(1)
            ->and($results[0]->get())->toBe([
                'title' => 'Some Meetup',
                'location' => 'Somewhere',
                'date' => '2023-01-14 21:00',
                'talks' => $expectedTalks,
            ]);
    },
);
// Runs the same outer Html::each('.company') extraction three times, each with a different nested
// child step for 'location' (root(), first(), last()), and expects identical outputs each time:
// 'location' is a single ['country' => ..., 'city' => ...] array, not a list of such arrays.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
test(
'When a child step is nested in the extraction and does not use each(), the extracted value is an array with ' .
'the keys defined in extract(), rather than an array of such arrays as it would be with each().',
function () {
$xml = <<
something
ABCDEFGmbH
1984
Germany, Frankfurt
Saubär GmbH
2014
Austria, Klagenfurt
HTML;
// Expected outputs, shared by all three variants below.
$expectedCompany1 = [
'name' => 'ABCDEFGmbH',
'founded' => '1984',
'location' => ['country' => 'Germany', 'city' => 'Frankfurt'],
];
$expectedCompany2 = [
'name' => 'Saubär GmbH',
'founded' => '2014',
'location' => ['country' => 'Austria', 'city' => 'Klagenfurt'],
];
// With base root()
$step = Html::each('.company')->extract([
'name' => '.name',
'founded' => '.founded',
'location' => Html::root()->extract(['country' => '.location .country', 'city' => '.location .city']),
]);
$outputs = helper_invokeStepWithInput($step, $xml);
expect($outputs)->toHaveCount(2)
->and($outputs[0]->get())->toBe($expectedCompany1)
->and($outputs[1]->get())->toBe($expectedCompany2);
// With base first()
$step = Html::each('.company')->extract([
'name' => '.name',
'founded' => '.founded',
'location' => Html::first('.location')->extract(['country' => '.country', 'city' => '.city']),
]);
$outputs = helper_invokeStepWithInput($step, $xml);
expect($outputs)->toHaveCount(2)
->and($outputs[0]->get())->toBe($expectedCompany1)
->and($outputs[1]->get())->toBe($expectedCompany2);
// With base last()
$step = Html::each('.company')->extract([
'name' => '.name',
'founded' => '.founded',
'location' => Html::last('.location')->extract(['country' => '.country', 'city' => '.city']),
]);
$outputs = helper_invokeStepWithInput($step, $xml);
expect($outputs)->toHaveCount(2)
->and($outputs[0]->get())->toBe($expectedCompany1)
->and($outputs[1]->get())->toBe($expectedCompany2);
},
);
test(
'when selecting elements with each(), you can reference the element already selected within the each() selector ' .
'itself, in sub selectors',
function () {
$html = <<
Bookstore Example in HTML :)
';
}
/** @var TestCase $this */
// A freshly constructed paginator that has not processed any response reports hasFinished() === true.
it('says it has finished when no initial response was provided yet', function () {
    $freshPaginator = new SimpleWebsitePaginator('.pagination');

    $finished = $freshPaginator->hasFinished();

    expect($finished)->toBeTrue();
});
// After processing one response (built here with an empty body string), the paginator is
// expected to be finished.
it('says it has finished when a response is provided, but it has no pagination links', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    $loaded = helper_getRespondedRequestWithResponseBody('/listing', '');

    $paginator->processLoaded($loaded->request, $loaded);

    expect($paginator->hasFinished())->toBeTrue();
});
// After processing a first response containing pagination links, hasFinished() is expected to be false.
it('says it has not finished when an initial response with pagination links is provided', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    $paginationLinks = [
        '/listing?page=1' => 'First page',
        '/listing?page=2' => 'Next page',
        '/listing?page12' => 'Last page',
    ];

    $body = helper_createResponseBodyWithPaginationLinks($paginationLinks);

    $loaded = helper_getRespondedRequestWithResponseBody('/listing', $body);

    $paginator->processLoaded($loaded->request, $loaded);

    expect($paginator->hasFinished())->toBeFalse();
});
// The paginator is created with a max pages value of 3; it is expected to be unfinished after the
// first and second processed responses and finished after the third.
it('has finished when the loaded pages count exceeds the max pages limit', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    $firstBody = helper_createResponseBodyWithPaginationLinks([
        '/listing?page=1' => 'First page',
        '/listing?page=2' => 'Next page',
        '/listing?page12' => 'Last page',
    ]);

    // The first two loaded pages share the same response body.
    foreach (['/listing', '/listing?page=1'] as $path) {
        $loaded = helper_getRespondedRequestWithResponseBody($path, $firstBody);

        $paginator->processLoaded($loaded->request, $loaded);

        expect($paginator->hasFinished())->toBeFalse();
    }

    $secondBody = helper_createResponseBodyWithPaginationLinks([
        '/listing?page=1' => 'First page',
        '/listing?page=3' => 'Next page',
        '/listing?page12' => 'Last page',
    ]);

    $loaded = helper_getRespondedRequestWithResponseBody('/listing?page=2', $secondBody);

    $paginator->processLoaded($loaded->request, $loaded);

    expect($paginator->hasFinished())->toBeTrue();
});
// Both responses link only to page 2. After page 2 itself has been processed, the paginator is
// expected to be finished although the max pages value (3) has not been reached.
it('says it has finished when there are no more found pagination links, that haven\'t been loaded yet', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    $onlyLinkToPageTwo = ['/listing?page=2' => 'Page Two'];

    $firstLoaded = helper_getRespondedRequestWithResponseBody(
        '/listing?page=1',
        helper_createResponseBodyWithPaginationLinks($onlyLinkToPageTwo),
    );

    $paginator->processLoaded($firstLoaded->request, $firstLoaded);

    expect($paginator->hasFinished())->toBeFalse();

    $paginator->getNextRequest();

    $secondLoaded = helper_getRespondedRequestWithResponseBody(
        '/listing?page=2',
        helper_createResponseBodyWithPaginationLinks($onlyLinkToPageTwo),
    );

    $paginator->processLoaded($secondLoaded->request, $secondLoaded);

    expect($paginator->hasFinished())->toBeTrue();
});
// The paginator is configured with the selector '.nextPageLink'; the next request is expected
// to target page 2.
it('finds pagination links when the selector matches the link itself', function () {
    $paginator = new SimpleWebsitePaginator('.nextPageLink', 3);

    $loaded = helper_getRespondedRequestWithResponseBody('/listing?page=1', 'Next Page');

    $paginator->processLoaded($loaded->request, $loaded);

    $nextUrl = $paginator->getNextRequest()?->getUri()->__toString();

    expect($nextUrl)->toBe('https://www.example.com/listing?page=2');
});
// The paginator is configured with the selector '.pagination'; the next request is expected
// to target page 2.
it('finds pagination links when the selected element is a wrapper for pagination links', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    $body = '
';

    $loaded = helper_getRespondedRequestWithResponseBody('/listing?page=1', $body);

    $paginator->processLoaded($loaded->request, $loaded);

    $nextUrl = $paginator->getNextRequest()?->getUri()->__toString();

    expect($nextUrl)->toBe('https://www.example.com/listing?page=2');
});
// After one processed response, two consecutive getNextRequest() calls are expected to return
// requests for page=2 and page=12, in that order.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('finds all pagination links, when multiple elements match the pagination links selector', function () {
$paginator = new SimpleWebsitePaginator('.pagination', 3);
$responseBody = <<Next Page
HTML;
$respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody);
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
expect($paginator->getNextRequest()?->getUri()->__toString())->toBe('https://www.example.com/listing?page=2')
->and($paginator->getNextRequest()?->getUri()->__toString())->toBe('https://www.example.com/listing?page=12');
});
// Three responses are processed by a paginator created with a max pages value of 3; afterwards it is
// expected to be finished, and logWhenFinished() is expected to output 'Max pages limit reached'.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('logs that max pages limit was reached when it was reached', function () {
$paginator = new SimpleWebsitePaginator('.pagination', 3);
$responseBody = <<
Page OnePage TwoPage ThreePage Four
HTML;
$respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody);
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
$respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=2', $responseBody);
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
$respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=3', $responseBody);
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
expect($paginator->hasFinished())->toBeTrue();
// The log message is asserted on the output captured by the test case.
$paginator->logWhenFinished(new CliLogger());
$output = $this->getActualOutputForAssertion();
expect($output)->toContain('Max pages limit reached');
});
// Processes three pages, calling getNextRequest() in between, and expects the final log output to
// contain 'All found pagination links loaded' but not 'Max pages limit reached'.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('logs that all found pagination links have been loaded when max pages limit was not reached', function () {
$paginator = new SimpleWebsitePaginator('.pagination', 3);
$responseBody = <<
Page OnePage TwoPage Three
HTML;
$respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody);
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
$paginator->getNextRequest();
$respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=2', $responseBody);
// NOTE(review): logWhenFinished() is also called here and after the next processLoaded(), before the
// paginator is asserted to be finished — presumably to show nothing misleading is logged early; confirm.
$paginator->logWhenFinished(new CliLogger());
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
$paginator->logWhenFinished(new CliLogger());
$paginator->getNextRequest();
$respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=3', $responseBody);
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
expect($paginator->hasFinished())->toBeTrue();
$paginator->logWhenFinished(new CliLogger());
$output = $this->getActualOutputForAssertion();
expect($output)
->not()->toContain('Max pages limit reached')
->and($output)
->toContain('All found pagination links loaded');
});
// Two responses are processed whose requests differ in the 'foo' header ('bar', then 'baz'). The
// next request is expected to carry 'bar', i.e. the header of the first processed request rather
// than of the most recent one.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it(
'always creates upcoming requests from the parent request, where a link was found (which does not have to be ' .
'the latest processed response)',
function () {
$paginator = new SimpleWebsitePaginator('.pagination', 3);
$responseBody = <<
Page OnePage TwoPage Three
HTML;
// First page, requested with header foo: bar.
$respondedRequest = helper_getRespondedRequest(
'GET',
'https://www.example.com/list?page=1',
['foo' => 'bar'],
responseBody: $responseBody,
);
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
$responseBody = <<
Page OnePage TwoPage Three
HTML;
// Second page, requested with header foo: baz.
$respondedRequest = helper_getRespondedRequest(
'GET',
'https://www.example.com/list?page=2',
['foo' => 'baz'],
responseBody: $responseBody,
);
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
$nextRequest = $paginator->getNextRequest();
expect($nextRequest?->getHeader('foo'))->toBe(['bar']);
},
);
// Uses an anonymous subclass to read the paginator's parentRequests property and expects its
// size to go 1 -> 1 -> 0 while the next requests are fetched and their (empty) responses processed.
// NOTE(review): the heredoc fixture markup appears stripped in this dump — confirm against the original file.
it('cleans up the stored parent requests always when getting the next request to load', function () {
$paginator = new class ('.pagination') extends SimpleWebsitePaginator {
/**
 * Exposes the inherited parentRequests property so the test can count its entries.
 *
 * @return array
 */
public function parentRequests(): array
{
return $this->parentRequests;
}
};
$responseBody = <<
Page TwoPage Three
HTML;
$respondedRequest = helper_getRespondedRequest(
'GET',
'https://www.example.com/list?page=1',
['foo' => 'bar'],
responseBody: $responseBody,
);
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
expect(count($paginator->parentRequests()))->toBe(1);
$nextRequest = $paginator->getNextRequest();
// Fail explicitly on null so the RespondedRequest below always receives a request.
if (!$nextRequest) {
$this->fail('failed to get next request');
}
$respondedRequest = new RespondedRequest($nextRequest, new Response());
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
expect(count($paginator->parentRequests()))->toBe(1);
$nextRequest = $paginator->getNextRequest();
if (!$nextRequest) {
$this->fail('failed to get next request');
}
$respondedRequest = new RespondedRequest($nextRequest, new Response());
$paginator->processLoaded($respondedRequest->request, $respondedRequest);
expect(count($paginator->parentRequests()))->toBe(0);
});
// A contains('hello world') stop rule is registered; after processing a response built from a
// single 'Next page' pagination link, the paginator is expected to still be unfinished.
it('does not stop, when a response does not meet the stop rule criterion', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    $stopRule = PaginatorStopRules::contains('hello world');

    $paginator->stopWhen($stopRule);

    $body = helper_createResponseBodyWithPaginationLinks(['/listing?page=2' => 'Next page']);

    $loaded = helper_getRespondedRequestWithResponseBody('/listing', $body);

    $paginator->processLoaded($loaded->request, $loaded);

    expect($paginator->hasFinished())->toBeFalse();
});
================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/ContainsTest.php
================================================
shouldStop(new Request('GET', 'https://www.example.com/foo'), null))->toBeTrue();
});
// The response body contains the needle 'foo', so shouldStop() is expected to return true.
it('stops when the string is contained in the response body', function () {
    $rule = PaginatorStopRules::contains('foo');

    $request = new Request('GET', 'https://www.crwl.io/');
    $response = new Response(body: 'This string contains foo');
    $loaded = new RespondedRequest($request, $response);

    $shouldStop = $rule->shouldStop($loaded->request, $loaded);

    expect($shouldStop)->toBeTrue();
});
// The response body does not contain the needle 'foo', so shouldStop() is expected to return false.
it('does not stop when the string is not contained in the response body', function () {
    $rule = PaginatorStopRules::contains('foo');

    $request = new Request('GET', 'https://www.crwl.io/');
    $response = new Response(body: 'This does not contain the string');
    $loaded = new RespondedRequest($request, $response);

    $shouldStop = $rule->shouldStop($loaded->request, $loaded);

    expect($shouldStop)->toBeFalse();
});
================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyInHtmlTest.php
================================================
shouldStop(new Request('GET', 'https://www.crwl.io/'), null))->toBeTrue();
});
// The response body is a JSON document; the isEmptyInHtml rule is expected to stop.
it('should stop, when response is not HTML', function () {
    $rule = PaginatorStopRules::isEmptyInHtml('#list .item');

    $request = new Request('GET', 'https://www.crwl.io/');
    $response = new Response(body: '{ "foo": "bar" }');
    $loaded = new RespondedRequest($request, $response);

    $shouldStop = $rule->shouldStop($loaded->request, $loaded);

    expect($shouldStop)->toBeTrue();
});
// The rule looks for '#list'; for this response shouldStop() is expected to return true.
it('should stop, when the selector target does not exist in the HTML response', function () {
    $rule = PaginatorStopRules::isEmptyInHtml('#list');

    $request = new Request('GET', 'https://www.crwl.io/');
    $response = new Response(body: '');
    $loaded = new RespondedRequest($request, $response);

    $shouldStop = $rule->shouldStop($loaded->request, $loaded);

    expect($shouldStop)->toBeTrue();
});
// The rule looks for '#list'; for this response shouldStop() is expected to return true.
it('should stop, when the selector target is empty in the response', function () {
    $rule = PaginatorStopRules::isEmptyInHtml('#list');

    $body = '
';

    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: $body),
    );

    $shouldStop = $rule->shouldStop($loaded->request, $loaded);

    expect($shouldStop)->toBeTrue();
});
// Two responses are checked with the same rule; for both shouldStop() is expected to return false.
it('should not stop, when the selector target is not empty in the response', function () {
    $rule = PaginatorStopRules::isEmptyInHtml('#list');

    $bodyWithText = '
a
';

    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: $bodyWithText),
    );

    expect($rule->shouldStop($loaded->request, $loaded))->toBeFalse();

    // Also if the content is only child elements.
    $bodyWithChildElements = '
';

    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: $bodyWithChildElements),
    );

    expect($rule->shouldStop($loaded->request, $loaded))->toBeFalse();
});
================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyInJsonTest.php
================================================
shouldStop(new Request('GET', 'https://www.crwl.io/'), null))->toBeTrue();
});
it('throws an exception when response is not valid JSON', function () {
    $rule = PaginatorStopRules::isEmptyInJson('data.items');

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: ''),
    );

    // No expectation on the return value: the call itself has to throw, so any assertion wrapped
    // around it would never be evaluated. The ->throws() below is the actual assertion.
    $rule->shouldStop($respondedRequest->request, $respondedRequest);
})->throws(InvalidJsonException::class);
it('should stop, when the dot notation key does not exist in the response', function () {
    // "data" exists, but there is no "items" key below it.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '{ "data": { "foo": "bar" } }'),
    );

    $stopRule = PaginatorStopRules::isEmptyInJson('data.items');

    expect($stopRule->shouldStop($loaded->request, $loaded))->toBeTrue();
});

it('should stop, when the dot notation key is empty in the response', function () {
    // "data.items" exists but is an empty array.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '{ "data": { "items": [] } }'),
    );

    $stopRule = PaginatorStopRules::isEmptyInJson('data.items');

    expect($stopRule->shouldStop($loaded->request, $loaded))->toBeTrue();
});

it('should not stop, when the dot notation key is not empty in the response', function () {
    // "data.items" holds two elements.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '{ "data": { "items": ["foo", "bar"] } }'),
    );

    $stopRule = PaginatorStopRules::isEmptyInJson('data.items');

    expect($stopRule->shouldStop($loaded->request, $loaded))->toBeFalse();
});
================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyInXmlTest.php
================================================
shouldStop(new Request('GET', 'https://www.crwl.io/'), null))->toBeTrue();
});
it('should stop, when response is not XML', function () {
    // A JSON payload instead of an XML document.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '{}'),
    );

    $stopRule = PaginatorStopRules::isEmptyInXml('channel item');

    expect($stopRule->shouldStop($loaded->request, $loaded))->toBeTrue();
});
it('should stop, when the selector target does not exist in the XML response', function () {
    $rule = PaginatorStopRules::isEmptyInXml('channel item');

    // A valid XML document with a <channel>, but without any <item> below it. An empty body would
    // not exercise the "target does not exist in an XML response" scenario this test is named after.
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(
            body: '<?xml version="1.0" encoding="UTF-8"?>' .
                '<rss version="2.0"><channel><title>foo</title></channel></rss>',
        ),
    );

    expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue();
});
it('should stop, when the selector target is empty in the response', function () {
    $rule = PaginatorStopRules::isEmptyInXml('channel item');

    // The <item> element exists but has no content at all. Without any markup in the body the
    // element would be missing instead of empty, which is a different scenario.
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(
            body: '<?xml version="1.0" encoding="UTF-8"?>' .
                '<rss version="2.0"><channel><item></item></channel></rss>',
        ),
    );

    expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue();
});
it('should not stop, when the selector target is not empty in the response', function () {
    $rule = PaginatorStopRules::isEmptyInXml('channel item');

    // The <item> element contains text. A body without a matching <item> element can never
    // satisfy the toBeFalse() expectation below, so the fixture needs real markup.
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(
            body: '<?xml version="1.0" encoding="UTF-8"?>' .
                '<rss version="2.0"><channel><item>a</item></channel></rss>',
        ),
    );

    expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeFalse();

    // Also if the content is only child elements.
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(
            body: '<?xml version="1.0" encoding="UTF-8"?>' .
                '<rss version="2.0"><channel><item><title></title></item></channel></rss>',
        ),
    );

    expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeFalse();
});
================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyResponseTest.php
================================================
shouldStop(new Request('GET', 'https://www.crwl.io/'), null))->toBeTrue();
});
it('should stop, when the response body is empty', function () {
    // Zero-length body.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: ''),
    );

    $stopRule = PaginatorStopRules::isEmptyResponse();

    expect($stopRule->shouldStop($loaded->request, $loaded))->toBeTrue();
});

it('should stop, when the response body is only spaces', function () {
    // Body made up of spaces, a line feed, a carriage return and a tab only.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.example.com/'),
        new Response(body: " \n\r\t "),
    );

    $stopRule = PaginatorStopRules::isEmptyResponse();

    expect($stopRule->shouldStop($loaded->request, $loaded))->toBeTrue();
});

it('should stop, when the response body is an empty JSON array', function () {
    // Empty JSON array surrounded by spaces.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwlr.software/packages'),
        new Response(body: " [] "),
    );

    $stopRule = PaginatorStopRules::isEmptyResponse();

    expect($stopRule->shouldStop($loaded->request, $loaded))->toBeTrue();
});

it('should stop, when the response body is an empty JSON object', function () {
    // Empty JSON object.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/en/home'),
        new Response(body: "{}"),
    );

    $stopRule = PaginatorStopRules::isEmptyResponse();

    expect($stopRule->shouldStop($loaded->request, $loaded))->toBeTrue();
});
================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/NotContainsTest.php
================================================
shouldStop(new Request('GET', 'https://www.example.com/foo'), null))->toBeTrue();
});
it('stops when the string is not contained in the response body', function () {
    // The needle "foo" is missing from the body.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: 'This does not contain the string'),
    );

    $stopRule = PaginatorStopRules::notContains('foo');

    expect($stopRule->shouldStop($loaded->request, $loaded))->toBeTrue();
});

it('does not stop when the string is contained in the response body', function () {
    // The needle "foo" is the last word of the body.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: 'This contains the string foo'),
    );

    $stopRule = PaginatorStopRules::notContains('foo');

    expect($stopRule->shouldStop($loaded->request, $loaded))->toBeFalse();
});
================================================
FILE: tests/Steps/Loading/HttpTest.php
================================================
shouldReceive('load')->once();
$step = (new Http('GET'))->setLoader($loader);
helper_traverseIterable($step->invokeStep(new Input('https://www.foo.bar/baz')));
});
it('can be invoked with a PSR-7 Uri object as input', function () {
    $loader = Mockery::mock(HttpLoader::class);
    $loader->shouldReceive('load')->once();

    $step = (new Http('GET'))->setLoader($loader);

    // The input value is a URI object instead of a plain string.
    $input = new Input(Url::parsePsr7('https://www.linkedin.com/'));

    helper_traverseIterable($step->invokeStep($input));
});
it('logs an error message when invoked with something else as input', function () {
    $logger = new DummyLogger();

    $step = (new Http('GET'))
        ->setLoader(Mockery::mock(HttpLoader::class))
        ->addLogger($logger);

    // A plain object is passed as the input value.
    helper_traverseIterable($step->invokeStep(new Input(new stdClass())));

    expect($logger->messages)->not->toBeEmpty();

    expect($logger->messages[0]['message'])
        ->toStartWith(
            'The Crwlr\Crawler\Steps\Loading\Http step was called with input that it can not work with:',
        )
        ->toEndWith('. The invalid input is of type object.');
});
it('logs an error message when invoked with a relative reference URI', function () {
    $logger = new DummyLogger();

    // A real loader is used here; it shares the logger with the step.
    $step = (new Http('GET'))
        ->setLoader(new HttpLoader(helper_nonBotUserAgent(), logger: $logger))
        ->addLogger($logger);

    helper_invokeStepWithInput($step, '/foo/bar');

    expect($logger->messages)->not->toBeEmpty();

    expect($logger->messages[0]['message'])->toBe(
        'Invalid input URL: /foo/bar - The URI is a relative reference and therefore can\'t be loaded.',
    );
});
it('catches the exception and logs an error when feeded with an invalid URL', function () {
    $logger = new DummyLogger();

    $step = (new Http('GET'))->setLoader(Mockery::mock(HttpLoader::class));
    $step->addLogger($logger);

    // "https://" has a scheme but nothing else.
    helper_traverseIterable($step->invokeStep(new Input('https://')));

    expect($logger->messages)->toHaveCount(1);

    $logEntry = $logger->messages[0];

    expect($logEntry['level'])->toBe('error');

    expect($logEntry['message'])->toBe(
        'The Crwlr\\Crawler\\Steps\\Loading\\Http step was called with input that it can not work with: https:// ' .
        'is not a valid URL.',
    );
});
it('throws an exception when invoked with a relative reference URI and stopOnErrorResponse() was called', function () {
    $logger = new DummyLogger();

    $step = (new Http('GET'))
        ->setLoader(new HttpLoader(helper_nonBotUserAgent(), logger: $logger))
        ->addLogger($logger);

    // With stopOnErrorResponse() the failure is expected to surface as an exception.
    $step->stopOnErrorResponse();

    helper_invokeStepWithInput($step, '/foo/bar');
})->throws(InvalidArgumentException::class);
test('You can set the request method via constructor', function (string $httpMethod) {
    $loader = Mockery::mock(HttpLoader::class);

    // The request handed to the loader has to carry the method given to the constructor.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getMethod() === $httpMethod)
        ->once();

    // For every method other than GET the mock also has to answer usesHeadlessBrowser().
    if ($httpMethod !== 'GET') {
        $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
    }

    $step = (new Http($httpMethod))->setLoader($loader);

    helper_traverseIterable($step->invokeStep(new Input('https://www.foo.bar/baz')));
})->with(['GET', 'POST', 'PUT', 'PATCH', 'DELETE']);
test('You can set request headers via constructor', function () {
    $headers = [
        'Accept' => [
            'text/html',
            'application/xhtml+xml',
            'application/xml;q=0.9',
            'image/avif',
            'image/webp',
            'image/apng',
            '*/*;q=0.8',
            'application/signed-exchange;v=b3;q=0.9',
        ],
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
        'Accept-Language' => ['de-DE', 'de;q=0.9', 'en-US;q=0.8', 'en;q=0.7'],
    ];

    $loader = Mockery::mock(HttpLoader::class);

    // Every configured header has to show up on the request with exactly the configured values.
    $loader
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) use ($headers) {
            foreach ($headers as $headerName => $expectedValues) {
                $actualValues = $request->getHeader($headerName);

                if (!$actualValues || $actualValues !== $expectedValues) {
                    return false;
                }
            }

            return true;
        })
        ->once();

    $step = (new Http('GET', $headers))->setLoader($loader);

    helper_traverseIterable($step->invokeStep(new Input('https://www.crwlr.software/packages/url')));
});
test('You can set request body via constructor', function () {
    $body = 'This is the request body';

    $loader = Mockery::mock(HttpLoader::class);

    // The request body read by the matcher has to equal the constructor argument.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getBody()->getContents() === $body)
        ->once();

    $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();

    $step = (new Http('PATCH', [], $body))->setLoader($loader);

    helper_traverseIterable($step->invokeStep(new Input('https://github.com/')));
});
test('You can set the http version for the request via constructor', function (string $httpVersion) {
    $loader = Mockery::mock(HttpLoader::class);

    // The protocol version of the request has to be the one passed as fourth constructor argument.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getProtocolVersion() === $httpVersion)
        ->once();

    $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();

    $step = (new Http('PATCH', [], 'body', $httpVersion))->setLoader($loader);

    helper_traverseIterable($step->invokeStep(new Input('https://packagist.org/packages/crwlr/url')));
})->with(['1.0', '1.1', '2.0']);
it('has static methods to create instances with all the different http methods', function (string $httpMethod) {
    $loader = Mockery::mock(HttpLoader::class);

    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getMethod() === $httpMethod)
        ->once();

    // For every method other than GET the mock also has to answer usesHeadlessBrowser().
    if ($httpMethod !== 'GET') {
        $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
    }

    // The static factory is named like the HTTP method in lower case, e.g. Http::post().
    $factoryMethod = strtolower($httpMethod);

    $step = (Http::$factoryMethod())->setLoader($loader);

    helper_traverseIterable($step->invokeStep(new Input('https://dev.to/otsch')));
})->with(['GET', 'POST', 'PUT', 'PATCH', 'DELETE']);
it(
    'calls the loadOrFail() loader method when the stopOnErrorResponse() method was called',
    function (string $httpMethod) {
        $loader = Mockery::mock(HttpLoader::class);

        // Only loadOrFail() is expected on the mock, no load() expectation is declared.
        $loader
            ->shouldReceive('loadOrFail')
            ->withArgs(fn (RequestInterface $request) => $request->getMethod() === $httpMethod)
            ->once()
            ->andReturn(new RespondedRequest(new Request('GET', '/foo'), new Response(200)));

        if ($httpMethod !== 'GET') {
            $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
        }

        $factoryMethod = strtolower($httpMethod);

        $step = (Http::$factoryMethod())
            ->setLoader($loader)
            ->stopOnErrorResponse();

        helper_traverseIterable($step->invokeStep(new Input('https://example.com/otsch')));
    },
)->with(['GET', 'POST', 'PUT', 'PATCH', 'DELETE']);
test('you can keep response properties with their aliases', function () {
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.example.com/testresponse'),
        new Response(202, ['foo' => 'bar'], Utils::streamFor('testbody')),
    );

    $loader = Mockery::mock(HttpLoader::class);
    $loader->shouldReceive('load')->once()->andReturn($loaded);

    $step = Http::get()->setLoader($loader);
    $step = $step->keep(['url', 'status', 'headers', 'body']);

    $outputs = helper_invokeStepWithInput($step);

    expect($outputs)->toHaveCount(1);

    // The kept data is keyed by the aliases passed to keep().
    expect($outputs[0]->keep)->toBe([
        'url' => 'https://www.example.com/testresponse',
        'status' => 202,
        'headers' => ['foo' => ['bar']],
        'body' => 'testbody',
    ]);
});
test(
    'the value behind url and uri is the effectiveUri',
    function (string $outputKey) {
        $loaded = new RespondedRequest(
            new Request('GET', 'https://www.example.com/testresponse'),
            new Response(202, ['foo' => 'bar'], Utils::streamFor('testbody')),
        );

        // Register a redirect, so the URL expected below differs from the requested one.
        $loaded->addRedirectUri('https://www.example.com/testresponseredirect');

        $loader = Mockery::mock(HttpLoader::class);
        $loader->shouldReceive('load')->once()->andReturn($loaded);

        $step = Http::get()->setLoader($loader);
        $step = $step->keep([$outputKey]);

        $outputs = helper_invokeStepWithInput($step);

        expect($outputs)->toHaveCount(1);

        expect($outputs[0]->keep)->toBe([$outputKey => 'https://www.example.com/testresponseredirect']);
    },
)->with(['url', 'uri']);
it('gets the URL for the request from an input array when useInputKeyAsUrl() was called', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
    ];

    $loader = Mockery::mock(HttpLoader::class);

    // The request URI has to be the value stored under the "someUrl" key.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getUri()->__toString() === $inputArray['someUrl'])
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

    $step = Http::get()->setLoader($loader);
    $step = $step->useInputKeyAsUrl('someUrl');

    helper_invokeStepWithInput($step, $inputArray);
});
it(
    'automatically gets the URL for the request from an input array when it contains an url or uri key',
    function ($key) {
        $inputArray = [
            'foo' => 'bar',
            $key => 'https://www.example.com/baz',
        ];

        $loader = Mockery::mock(HttpLoader::class);

        // No useInputKeyAsUrl() call here: the key name from the data set alone selects the URL.
        $loader
            ->shouldReceive('load')
            ->withArgs(fn (RequestInterface $request) => $request->getUri()->__toString() === $inputArray[$key])
            ->once()
            ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

        $step = Http::get()->setLoader($loader);

        helper_invokeStepWithInput($step, $inputArray);
    },
)->with(['url', 'uri']);
it('gets the body for the request from an input array when useInputKeyAsBody() was called', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'someBodyThatIUsedToKnow' => 'foo=bar&baz=quz',
    ];

    $expectedBody = $inputArray['someBodyThatIUsedToKnow'];

    $loader = Mockery::mock(HttpLoader::class);

    // The request body has to be the value stored under the configured body key.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getBody()->getContents() === $expectedBody)
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

    $step = Http::get()->setLoader($loader);
    $step = $step->useInputKeyAsUrl('someUrl');
    $step = $step->useInputKeyAsBody('someBodyThatIUsedToKnow');

    helper_invokeStepWithInput($step, $inputArray);
});
it('gets as single header for the request from an input array when useInputKeyAsHeader() was called', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'someHeader' => 'someHeaderValue',
    ];

    $expectedHeader = [$inputArray['someHeader']];

    $loader = Mockery::mock(HttpLoader::class);

    // The input value under "someHeader" has to be sent as the "header-name-x" header.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getHeader('header-name-x') === $expectedHeader)
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

    $step = Http::get()->setLoader($loader);
    $step = $step->useInputKeyAsUrl('someUrl');
    $step = $step->useInputKeyAsHeader('someHeader', 'header-name-x');

    helper_invokeStepWithInput($step, $inputArray);
});
it('uses the input key as header name if no header name defined as argument', function () {
    $inputArray = [
        'foo' => 'bar',
        'url' => 'https://www.example.com/baz',
        'header-name' => 'someHeaderValue',
    ];

    $expectedHeader = [$inputArray['header-name']];

    $loader = Mockery::mock(HttpLoader::class);

    // Only the input key is passed to useInputKeyAsHeader(), so it doubles as the header name.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getHeader('header-name') === $expectedHeader)
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

    $step = Http::get()->setLoader($loader);
    $step = $step->useInputKeyAsHeader('header-name');

    helper_invokeStepWithInput($step, $inputArray);
});
it('merges header values if you provide a static header value and use an input value as header', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'someHeader' => 'someHeaderValue',
    ];

    // The static value "foo" comes first, the value from the input second.
    $expectedHeader = ['foo', $inputArray['someHeader']];

    $loader = Mockery::mock(HttpLoader::class);

    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getHeader('header-name-x') === $expectedHeader)
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

    $step = Http::get(['header-name-x' => 'foo'])->setLoader($loader);
    $step = $step->useInputKeyAsUrl('someUrl');
    $step = $step->useInputKeyAsHeader('someHeader', 'header-name-x');

    helper_invokeStepWithInput($step, $inputArray);
});
test('you can use useInputKeyAsHeader() multiple times', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'someHeader' => 'someHeaderValue',
        'anotherHeader' => 'anotherHeaderValue',
    ];

    $loader = Mockery::mock(HttpLoader::class);

    // Both input values have to arrive, each under its own header name.
    $loader
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) use ($inputArray) {
            if ($request->getHeader('header-name-x') !== [$inputArray['someHeader']]) {
                return false;
            }

            return $request->getHeader('header-name-y') === [$inputArray['anotherHeader']];
        })
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

    $step = Http::get()->setLoader($loader);
    $step = $step->useInputKeyAsUrl('someUrl');
    $step = $step->useInputKeyAsHeader('someHeader', 'header-name-x');
    $step = $step->useInputKeyAsHeader('anotherHeader', 'header-name-y');

    helper_invokeStepWithInput($step, $inputArray);
});
it('gets multiple headers from an input array using useInputKeyAsHeaders()', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'customHeaders' => [
            'header-name-x' => 'foo',
            'header-name-y' => ['bar', 'baz'],
        ],
    ];

    $loader = Mockery::mock(HttpLoader::class);

    $loader
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) use ($inputArray) {
            $customHeaders = $inputArray['customHeaders'];

            // "header-name-y" also has the static value "quz", which is expected in front.
            $expectedY = array_merge(['quz'], $customHeaders['header-name-y']);

            if ($request->getHeader('header-name-x') !== [$customHeaders['header-name-x']]) {
                return false;
            }

            return $request->getHeader('header-name-y') === $expectedY;
        })
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

    $step = Http::get(['header-name-y' => 'quz'])->setLoader($loader);
    $step = $step->useInputKeyAsUrl('someUrl');
    $step = $step->useInputKeyAsHeaders('customHeaders');

    helper_invokeStepWithInput($step, $inputArray);
});
it('uses a static URL when defined', function () {
    $loader = Mockery::mock(HttpLoader::class);

    // The request has to go to the static URL, although the input value is just "foo".
    $loader
        ->shouldReceive('load')
        ->withArgs(
            fn (RequestInterface $request) => $request->getUri()->__toString() === 'https://www.example.com/servus',
        )
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/servus'), new Response(200)));

    $step = Http::get()->setLoader($loader);
    $step = $step->staticUrl('https://www.example.com/servus');

    $input = 'foo';

    helper_invokeStepWithInput($step, $input);
});
it('resolves variables in a static URL from input data', function () {
    $loader = Mockery::mock(HttpLoader::class);
    $loader->shouldReceive('usesHeadlessBrowser')->andReturn(false);

    // Both placeholders (one with, one without quotes) have to be replaced with the input values.
    $loader
        ->shouldReceive('load')
        ->withArgs(
            fn (RequestInterface $request) => $request->getUri()->__toString() === 'https://www.example.com/foo/bar/baz',
        )
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/foo/bar/baz'), new Response(200)));

    $step = Http::get()->setLoader($loader);
    $step = $step->staticUrl('https://www.example.com/[crwl:\'one\']/[crwl:two]/baz');

    $input = ['one' => 'foo', 'two' => 'bar'];

    helper_invokeStepWithInput($step, $input);
});
it('resolves variables in the request body from input data', function () {
    $loader = Mockery::mock(HttpLoader::class);
    $loader->shouldReceive('usesHeadlessBrowser')->andReturn(false);

    // "[crwl:hey]" and "[crwl:yo]" in the body template have to be replaced with "ho" and "lo".
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => Http::getBodyString($request) === 'Ho ho ho and lo asdf')
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/foo'), new Response(200)));

    $step = Http::post(body: 'Ho ho [crwl:hey] and [crwl:yo] asdf')->setLoader($loader);

    $input = [
        'url' => 'https://www.example.com/foo',
        'hey' => 'ho',
        'yo' => 'lo',
    ];

    helper_invokeStepWithInput($step, $input);
});
it('resolves variables in request headers from input data', function () {
    $loader = Mockery::mock(HttpLoader::class);

    // The placeholders in both header values have to be replaced with the input values.
    $loader
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) {
            if ($request->getHeaderLine('Accept-Encoding') !== 'gzip, deflate, br, zstd') {
                return false;
            }

            return $request->getHeaderLine('Accept-Language') === 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7';
        })
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/foo'), new Response(200)));

    $headerTemplates = [
        'Accept-Encoding' => 'gzip, [crwl:"encoding"], zstd',
        'Accept-Language' => '[crwl:language]',
    ];

    $step = Http::get($headerTemplates)->setLoader($loader);

    $input = [
        'url' => 'https://www.example.com/foo',
        'encoding' => 'deflate, br',
        'language' => 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
    ];

    helper_invokeStepWithInput($step, $input);
});
test(
    'the getBodyString() method does not generate a warning, when the response contains a ' .
    'Content-Type: application/x-gzip header, but the content actually isn\'t compressed',
    function () {
        $warnings = [];

        // Collect warnings raised while the body is read. Returning false lets PHP's standard error
        // handling continue afterwards.
        set_error_handler(function ($errno, $errstr) use (&$warnings) {
            if ($errno === E_WARNING) {
                $warnings[] = $errstr;
            }

            return false;
        });

        try {
            $response = helper_getRespondedRequest(
                url: 'https://example.com/yolo',
                responseHeaders: ['Content-Type' => 'application/x-gzip'],
                responseBody: 'Servas!',
            );

            $string = Http::getBodyString($response);
        } finally {
            // Always remove the temporary handler, also when the code under test throws. Otherwise
            // it would stay registered for all tests running afterwards.
            restore_error_handler();
        }

        expect($warnings)->toBeEmpty()
            ->and($string)->toBe('Servas!');
    },
);
it('rejects post browser navigate hooks, when the HTTP method is not GET', function (string $httpMethod) {
    $logger = new DummyLogger();

    $step = (new Http($httpMethod))
        ->addLogger($logger)
        ->postBrowserNavigateHook(BrowserAction::wait(1.0));

    expect($logger->messages)->toHaveCount(1);

    expect($logger->messages[0]['message'])->toBe(
        'A ' . $httpMethod . ' request cannot be executed using the (headless) browser, so post browser ' .
        'navigate hooks can\'t be defined for this step either.',
    );

    // The hook must not have been stored on the step.
    expect(invade($step)->postBrowserNavigateHooks)->toBe([]);
})->with(['POST', 'PUT', 'PATCH', 'DELETE']);
it(
    'calls the HttpLoader::skipCacheForNextRequest() method before calling load when the skipCache() method was called',
    function () {
        $loaded = new RespondedRequest(
            new Request('GET', 'https://www.example.com/blog/posts'),
            new Response(200, body: Utils::streamFor('blog posts')),
        );

        // Both calls are expected exactly once on the loader mock.
        $loader = Mockery::mock(HttpLoader::class);
        $loader->shouldReceive('skipCacheForNextRequest')->once();
        $loader->shouldReceive('load')->once()->andReturn($loaded);

        $step = Http::get()->setLoader($loader);
        $step = $step->skipCache();

        helper_invokeStepWithInput($step);
    },
);
it(
    'calls the HttpLoader::skipCacheForNextRequest() method before calling loadOrFail() when the skipCache() method ' .
    'was called',
    function () {
        $loaded = new RespondedRequest(
            new Request('GET', 'https://www.example.com/blog/posts'),
            new Response(200, body: Utils::streamFor('blog posts')),
        );

        // Same as the load() variant, but stopOnErrorResponse() makes the step use loadOrFail().
        $loader = Mockery::mock(HttpLoader::class);
        $loader->shouldReceive('skipCacheForNextRequest')->once();
        $loader->shouldReceive('loadOrFail')->once()->andReturn($loaded);

        $step = Http::get()->setLoader($loader);
        $step = $step->skipCache();
        $step = $step->stopOnErrorResponse();

        helper_invokeStepWithInput($step);
    },
);
it(
    'switches the loader to use the browser, when useBrowser() was called and the loader is configured to use the ' .
    'HTTP client',
    function () {
        $loaded = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // The loader reports the HTTP client as active, so one switch to the browser and one
        // switch back to the HTTP client are expected.
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);
        $loader->shouldReceive('useHeadlessBrowser')->once();
        $loader->shouldReceive('useHttpClient')->once();
        $loader->shouldReceive('load')->once()->andReturn($loaded);

        $step = Http::get()->setLoader($loader);
        $step = $step->useBrowser();

        helper_invokeStepWithInput($step);
    },
);
it(
    'switches the loader to use the browser, when stopOnErrorResponse() and useBrowser() was called and the loader ' .
    'is configured to use the HTTP client',
    function () {
        $loaded = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // Same switching expectations as without stopOnErrorResponse(), but via loadOrFail().
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);
        $loader->shouldReceive('useHeadlessBrowser')->once();
        $loader->shouldReceive('useHttpClient')->once();
        $loader->shouldReceive('loadOrFail')->once()->andReturn($loaded);

        $step = Http::get()->setLoader($loader);
        $step = $step->stopOnErrorResponse();
        $step = $step->useBrowser();

        helper_invokeStepWithInput($step);
    },
);
it(
    'does not switch the loader to use the browser, when useBrowser() was called, the loader is configured to use ' .
    'the HTTP client, but the request method is not GET',
    function (string $httpMethod) {
        $logger = new DummyLogger();

        $loaded = new RespondedRequest(
            new Request($httpMethod, 'https://www.example.com/something'),
            new Response(200, body: Utils::streamFor('Something!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // The loader already uses the HTTP client and must never be switched to the browser.
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);
        $loader->shouldNotReceive('useHeadlessBrowser');
        $loader->shouldReceive('load')->once()->andReturn($loaded);

        $step = Http::{$httpMethod}()
            ->setLoader($loader)
            ->addLogger($logger)
            ->useBrowser();

        helper_invokeStepWithInput($step);

        expect($logger->messages)->toHaveCount(1);

        expect($logger->messages[0]['message'])->toBe(
            'The (headless) browser can only be used for GET requests! Therefore this step will use the HTTP ' .
            'client for loading.',
        );
    },
)->with(['post', 'put', 'patch', 'delete']);
it(
    'automatically switches the loader to use the HTTP client, when the HTTP method is not GET and the loader is ' .
    'configured to use the browser',
    function (string $httpMethod) {
        $logger = new DummyLogger();

        $loaded = new RespondedRequest(
            new Request($httpMethod, 'https://www.example.com/something'),
            new Response(200, body: Utils::streamFor('Something!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // The loader reports the browser as active, so one switch to the HTTP client and one
        // switch back to the browser are expected.
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(true);
        $loader->shouldReceive('useHttpClient')->once();
        $loader->shouldReceive('useHeadlessBrowser')->once();
        $loader->shouldReceive('load')->once()->andReturn($loaded);

        $step = Http::{$httpMethod}()
            ->setLoader($loader)
            ->addLogger($logger)
            ->useBrowser();

        helper_invokeStepWithInput($step);

        expect($logger->messages)->toHaveCount(1);

        expect($logger->messages[0]['message'])->toBe(
            'The (headless) browser can only be used for GET requests! Therefore this step will use the HTTP ' .
            'client for loading.',
        );
    },
)->with(['post', 'put', 'patch', 'delete']);
it(
    'switches back the loader to use the HTTP client, when stopOnErrorResponse() and useBrowser() was called and ' .
    'loading throws an exception',
    function () {
        $loader = Mockery::mock(HttpLoader::class);

        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);

        $loader->shouldReceive('useHeadlessBrowser')->once();

        // The actual assertion of this test: useHttpClient() has to be called although loading fails.
        $loader->shouldReceive('useHttpClient')->once();

        $loader->shouldReceive('loadOrFail')->once()->andThrow(new LoadingException('error message'));

        $step = Http::get()->setLoader($loader)->stopOnErrorResponse()->useBrowser();

        try {
            helper_invokeStepWithInput($step);
        } catch (Throwable) {
            // Deliberately ignored: the exception is provoked by the loader mock above, and this
            // test only cares about the mock expectations being met afterwards.
        }
    },
);
it(
    'does not call the useHeadlessBrowser() method of the loader, when useBrowser() was called and the loader is ' .
    'already configured to use the browser',
    function () {
        $loaded = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // The loader reports the browser as active, so no switching in either direction is allowed.
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(true);
        $loader->shouldNotReceive('useHeadlessBrowser');
        $loader->shouldNotReceive('useHttpClient');
        $loader->shouldReceive('load')->once()->andReturn($loaded);

        $step = Http::get()->setLoader($loader);
        $step = $step->useBrowser();

        helper_invokeStepWithInput($step);
    },
);
it(
    'does not call the useHeadlessBrowser() method of the loader, when stopOnErrorResponse() and useBrowser() was ' .
    'called and the loader is already configured to use the browser',
    function () {
        $loaded = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // Same as without stopOnErrorResponse(): no switching allowed, loading via loadOrFail().
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(true);
        $loader->shouldNotReceive('useHeadlessBrowser');
        $loader->shouldNotReceive('useHttpClient');
        $loader->shouldReceive('loadOrFail')->once()->andReturn($loaded);

        $step = Http::get()->setLoader($loader);
        $step = $step->stopOnErrorResponse();
        $step = $step->useBrowser();

        helper_invokeStepWithInput($step);
    },
);
it(
    'sets post browser navigate hooks, when useBrowser() was called and the loader is configured to use the HTTP ' .
    'client',
    function () {
        // Partial mock: methods without an expectation run the real HttpLoader implementation.
        $loader = Mockery::mock(HttpLoader::class)->makePartial();

        $browserHelperMock = Mockery::mock(HeadlessBrowserLoaderHelper::class);

        $loader->shouldReceive('browser')->andReturn($browserHelperMock);

        // The hooks handed to the browser helper have to be closures.
        $browserHelperMock
            ->shouldReceive('setTempPostNavigateHooks')
            ->once()
            ->withArgs(fn (array $hooks) => $hooks[0] instanceof Closure);

        $loaded = new RespondedRequest(
            new Request('GET', 'https://www.example.com/woop'),
            new Response(200, body: Utils::streamFor('Woop')),
        );

        $loader->shouldReceive('load')->once()->andReturn($loaded);

        $step = Http::get()
            ->setLoader($loader)
            ->useBrowser()
            ->postBrowserNavigateHook(BrowserAction::wait(1.0));

        helper_invokeStepWithInput($step);
    },
);
================================================
FILE: tests/Steps/Loading/LoadingStepTest.php
================================================
*/
use LoadingStep;
protected function invoke(mixed $input): Generator
{
$this->getLoader()->load($input);
yield [];
}
};
$loader = Mockery::mock(HttpLoader::class);
$loader->shouldReceive('load')->once();
$step->setLoader($loader);
helper_traverseIterable($step->invokeStep(new Input('https://www.digitalocean.com/blog')));
});
test(
    'you can provide a custom loader to a step via the withLoader() method, and it will be preferred to the loader ' .
    'provided via setLoader()',
    function () {
        // The loader handed over via withLoader(): the only one that may be asked to load.
        $preferredLoader = Mockery::mock(Loader::class);

        $preferredLoader->shouldReceive('load')->once()->andReturn('Hi');

        // The loader handed over via setLoader(): must never be used.
        $crawlerLoader = Mockery::mock(Loader::class);

        $crawlerLoader->shouldNotReceive('load');

        $step = new class extends Step {
            /**
             * @use LoadingStep
             */
            use LoadingStep;

            protected function invoke(mixed $input): Generator
            {
                yield $this->getLoader()->load($input);
            }
        };

        $step->withLoader($preferredLoader);

        // A crawler calls setLoader() on a step only after the step was added to it, which means
        // withLoader() is always called first. The loader from withLoader() must win regardless.
        $step->setLoader($crawlerLoader);

        helper_invokeStepWithInput($step);
    },
);
================================================
FILE: tests/Steps/Refiners/AbstractRefinerTest.php
================================================
logger?->info('logging works');
return $value;
}
public function testLogTypeWarning(): void
{
$this->logTypeWarning('Some::staticMethodName()', 'foo');
}
}
/** @var TestCase $this */
// SomeRefiner::refine() writes 'logging works' to the logger, so it must show up in the CLI output.
it('takes a logger that can be used in the Refiner', function () {
    (new SomeRefiner())
        ->addLogger(new CliLogger())
        ->refine('foo');

    expect($this->getActualOutputForAssertion())->toContain('logging works');
});
// Triggers the type warning helper through SomeRefiner and checks the message printed by the logger.
it('provides a method for children to log a warning if the type of the incoming value is wrong', function () {
    $refiner = (new SomeRefiner())->addLogger(new CliLogger());

    $refiner->testLogTypeWarning();

    expect($this->getActualOutputForAssertion())
        ->toContain('Refiner Some::staticMethodName() can\'t be applied to value of type string');
});
================================================
FILE: tests/Steps/Refiners/DateTime/DateTimeFormatTest.php
================================================
refine($from);
expect($refinedValue)->toBe($to);
})->with([
['2024-09-21T13:55:41Z', '2024-09-21 13:55:41'],
['2024-09-21T13:55:41.000Z', '2024-09-21 13:55:41'],
['2024-09-21', '2024-09-21 00:00:00'],
['2024-09-21, 13:55:41', '2024-09-21 13:55:41'],
['21 September 2024, 13:55:41', '2024-09-21 13:55:41'],
['21. September 2024, 13:55:41', '2024-09-21 13:55:41'],
['21 September 2024', '2024-09-21 00:00:00'],
['21. September 2024', '2024-09-21 00:00:00'],
['21.09.2024', '2024-09-21 00:00:00'],
['21.09.2024 13:55', '2024-09-21 13:55:00'],
['21.09.2024 13:55:41', '2024-09-21 13:55:41'],
['Sat, 21 September 2024 13:55:41 +0000', '2024-09-21 13:55:41'],
['Sat Sep 21 2024 16:55:41 GMT+0100', '2024-09-21 15:55:41'],
]);
// The escaped "\u\m" in the origin format matches the literal word "um" in the input.
it('reformats a format that PHP\'s strtotime() does not know, when the origin format is provided', function () {
    $refiner = DateTimeRefiner::reformat('Y-m-d H:i:s', 'd. F Y \u\m H:i:s');

    expect($refiner->refine('21. September 2024 um 13:55:41'))->toBe('2024-09-21 13:55:41');
});
// Without an origin format the refiner cannot parse the input; it must warn once and return the input unchanged.
it(
    'logs a warning message (and keeps original input) when it wasn\'t able to auto-convert a date time string',
    function () {
        $dummyLogger = new DummyLogger();

        $refiner = DateTimeRefiner::reformat('Y-m-d H:i:s');

        $refiner->addLogger($dummyLogger);

        $result = $refiner->refine('21. September 2024 um 13:55:41');

        expect($dummyLogger->messages)->toHaveCount(1);

        expect($dummyLogger->messages[0]['level'])->toBe('warning');

        expect($dummyLogger->messages[0]['message'])
            ->toStartWith('Failed to automatically (without known format) parse');

        expect($result)->toBe('21. September 2024 um 13:55:41');
    },
);
// Here an origin format is given, but the refiner still fails to parse the input with it.
// It must warn once and return the input unchanged.
it(
    'logs a warning message (and keeps original input) when it wasn\'t able to convert a date time string with the ' .
    'given origin format',
    function () {
        $dummyLogger = new DummyLogger();

        $refiner = DateTimeRefiner::reformat('Y-m-d H:i:s', 'd. F Y um H:i:s');

        $refiner->addLogger($dummyLogger);

        $result = $refiner->refine('21. September 2024 um 13:55:41');

        expect($dummyLogger->messages)->toHaveCount(1);

        expect($dummyLogger->messages[0]['level'])->toBe('warning');

        expect($dummyLogger->messages[0]['message'])->toStartWith('Failed parsing date/time ');

        expect($result)->toBe('21. September 2024 um 13:55:41');
    },
);
// Every element of an array value is reformatted individually.
it('reformats an array of date time strings', function () {
    $dateTimes = [
        '2024-09-21T13:55:41Z',
        '2024-09-21T13:55:41.000Z',
        '2024-09-21',
    ];

    $reformatted = DateTimeRefiner::reformat('Y-m-d H:i:s')->refine($dateTimes);

    expect($reformatted)->toBe([
        '2024-09-21 13:55:41',
        '2024-09-21 13:55:41',
        '2024-09-21 00:00:00',
    ]);
});
================================================
FILE: tests/Steps/Refiners/Html/RemoveFromHtmlTest.php
================================================
')
->and($refinedValue)->not()->toContain('');
});
it('removes node from an array of HTML snippets', function () {
$html = [
<<
foo
bar
baz
quz
HTML,
<<
lorem
ipsum
dolor
sit
HTML,
];
$refinedValue = HtmlRefiner::remove('.remove')->refine($html);
expect($refinedValue[0])->not()->toContain('bar')
->and($refinedValue[0])->not()->toContain('quz')
->and($refinedValue[1])->not()->toContain('ipsum')
->and($refinedValue[1])->not()->toContain('sit');
});
================================================
FILE: tests/Steps/Refiners/String/AfterFirstTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner StringRefiner::afterFirst() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
})->with([
[123],
[12.3],
[true],
]);
// Array values: each string element is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::afterFirst('a')->addLogger(new CliLogger());

    $result = $refiner->refine(['foo a bar a baz', 'lorem a ipsum a dolor']);

    expect($result)->toBe(['bar a baz', 'ipsum a dolor']);
});

it('returns the string after first occurrence of another string', function () {
    $result = StringRefiner::afterFirst('foo')->refine('yo lo foo boo choo foo gnu');

    expect($result)->toBe('boo choo foo gnu');
});

it('returns the full string if the string to look for is empty', function () {
    $result = StringRefiner::afterFirst('')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});

it('returns the full string when the string to look for is not contained', function () {
    $result = StringRefiner::afterFirst('moo')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});
================================================
FILE: tests/Steps/Refiners/String/AfterLastTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner StringRefiner::afterLast() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
})->with([
[123],
[12.3],
[true],
]);
// Array values: each string element is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::afterLast('a')->addLogger(new CliLogger());

    $result = $refiner->refine(['foo a bar a baz', 'lorem a ipsum a dolor']);

    expect($result)->toBe(['z', 'dolor']);
});

it('returns the string after last occurrence of another string', function () {
    $result = StringRefiner::afterLast('foo')->refine('yo lo foo boo choo foo gnu');

    expect($result)->toBe('gnu');
});

it('returns an empty string if the string to look for is empty', function () {
    $result = StringRefiner::afterLast('')->refine('yo lo foo boo choo');

    expect($result)->toBe('');
});

it('returns the full string when the string to look for is not contained', function () {
    $result = StringRefiner::afterLast('moo')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});
================================================
FILE: tests/Steps/Refiners/String/BeforeFirstTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner StringRefiner::beforeFirst() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
})->with([
[123],
[12.3],
[true],
]);
// Array values: each string element is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::beforeFirst('a')->addLogger(new CliLogger());

    $result = $refiner->refine(['foo a bar a baz', 'lorem a ipsum a dolor']);

    expect($result)->toBe(['foo', 'lorem']);
});

it('returns the string before the first occurrence of another string', function () {
    $result = StringRefiner::beforeFirst('foo')->refine('yo lo foo boo choo foo gnu');

    expect($result)->toBe('yo lo');
});

it('returns an empty string if the string to look for is empty', function () {
    $result = StringRefiner::beforeFirst('')->refine('yo lo foo boo choo');

    expect($result)->toBe('');
});

it('returns the full string when the string to look for is not contained', function () {
    $result = StringRefiner::beforeFirst('moo')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});
================================================
FILE: tests/Steps/Refiners/String/BeforeLastTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner StringRefiner::beforeLast() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
})->with([
[123],
[12.3],
[true],
]);
// Array values: each string element is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::beforeLast('a')->addLogger(new CliLogger());

    $result = $refiner->refine(['foo a bar a baz', 'lorem a ipsum a dolor']);

    expect($result)->toBe(['foo a bar a b', 'lorem a ipsum']);
});

it('returns the string before the last occurrence of another string', function () {
    $result = StringRefiner::beforeLast('foo')->refine('yo lo foo boo choo foo gnu');

    expect($result)->toBe('yo lo foo boo choo');
});

it('returns the full string if the string to look for is empty', function () {
    $result = StringRefiner::beforeLast('')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});

it('returns the full string when the string to look for is not contained', function () {
    $result = StringRefiner::beforeLast('moo')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});
================================================
FILE: tests/Steps/Refiners/String/BetweenFirstTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner StringRefiner::betweenFirst() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
})->with([
[123],
[12.3],
[true],
]);
// Array values: each string element is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::betweenFirst('foo', 'bar')->addLogger(new CliLogger());

    $result = $refiner->refine([
        'one foo two bar three foo four bar five',
        'six foo seven bar eight foo nine bar ten',
    ]);

    expect($result)->toBe(['two', 'seven']);
});

it('gets the (trimmed) string between the first occurrence of start and the next occurrence of end', function () {
    expect(
        StringRefiner::betweenFirst('foo', 'bar')->refine('bla foo bli bar blu foo bar asdf foo bar'),
    )->toBe('bli');
});

test('if start is an empty string, start from the beginning', function () {
    expect(
        StringRefiner::betweenFirst('', 'bar')->refine('bla foo bli bar blu foo bar asdf foo bar'),
    )->toBe('bla foo bli');
});

test('if end is an empty string, it takes the rest of the string until the end', function () {
    expect(
        StringRefiner::betweenFirst('blu', '')->refine('bla foo bli bar blu foo bar asdf foo bar'),
    )->toBe('foo bar asdf foo bar');
});

it('returns an empty string if start is not contained in the string', function () {
    expect(
        StringRefiner::betweenFirst('not contained', '')->refine('bla foo bli bar blu foo bar asdf foo bar'),
    )->toBe('');
});
================================================
FILE: tests/Steps/Refiners/String/BetweenLastTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner StringRefiner::betweenLast() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
})->with([
[123],
[12.3],
[true],
]);
// Array values: each string element is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::betweenLast('foo', 'bar')->addLogger(new CliLogger());

    $result = $refiner->refine([
        'one foo two bar three foo four bar five',
        'six foo seven bar eight foo nine bar ten',
    ]);

    expect($result)->toBe(['four', 'nine']);
});

it('gets the (trimmed) string between the last occurrence of start and the next occurrence of end', function () {
    expect(
        StringRefiner::betweenLast('foo', 'bar')->refine('bla foo bli bar blu foo ble foo blo bar blö bar blä'),
    )->toBe('blo');
});

test('if start is an empty string, start from the beginning', function () {
    expect(
        StringRefiner::betweenLast('', 'blu')->refine('bla foo bli bar blu foo ble foo blo bar blö bar blä'),
    )->toBe('bla foo bli bar');
});

test('if end is an empty string, it takes the rest of the string until the end', function () {
    expect(
        StringRefiner::betweenLast('blo', '')->refine('bla foo bli bar blu foo ble foo blo bar blö bar blä'),
    )->toBe('bar blö bar blä');
});
// This file tests betweenLast(), so the "start not contained" case must call betweenLast() too.
// The test previously called betweenFirst(), which left this case untested for betweenLast().
it('returns an empty string if start is not contained in the string', function () {
    $refiner = StringRefiner::betweenLast('not contained', '');

    $refinedValue = $refiner->refine('bla foo bli bar blu foo ble foo blo bar blö bar blä');

    expect($refinedValue)->toBe('');
});
================================================
FILE: tests/Steps/Refiners/String/ReplaceTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner StringRefiner::replace() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
})->with([
[123],
[12.3],
[true],
]);
// Array values: each string element is refined on its own.
it('works when the value is an array of strings', function () {
    $refiner = StringRefiner::replace('foo', 'bar')->addLogger(new CliLogger());

    $result = $refiner->refine(['foo boo', 'who foo', 'yo lo']);

    expect($result)->toBe(['bar boo', 'who bar', 'yo lo']);
});

it('replaces occurrences of a string with another string', function () {
    $result = StringRefiner::replace('foo', 'bar')->refine('foo, test lorem foo yolo');

    expect($result)->toBe('bar, test lorem bar yolo');
});

it('replaces occurrences of an array of strings with another array of strings', function () {
    $result = StringRefiner::replace(['foo', 'bar'], ['yo', 'lo'])->refine('foo bar baz');

    expect($result)->toBe('yo lo baz');
});

it('replaces occurrences of an array of strings with some single string', function () {
    $result = StringRefiner::replace(['foo', 'bar'], '-')->refine('foo bar baz');

    expect($result)->toBe('- - baz');
});
================================================
FILE: tests/Steps/Refiners/Url/WithFragmentTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner UrlRefiner::withFragment() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
},
)->with([
[123],
[true],
[new stdClass()],
]);
// withFragment() works on the URL fragment, so the description must say "fragment", not "query".
// Each dataset row is [input (string, Url or PSR-7 URI), expected URL string].
it('replaces the fragment in a URL', function (mixed $value, string $expected) {
    expect(UrlRefiner::withFragment('#lorem')->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com/path#foo', 'https://www.example.com/path#lorem'],
    ['https://www.example.com/path', 'https://www.example.com/path#lorem'],
    [Url::parse('https://www.crwlr.software/some/path#abc'), 'https://www.crwlr.software/some/path#lorem'],
    [Url::parsePsr7('https://www.crwl.io/quz#'), 'https://www.crwl.io/quz#lorem'],
]);
// withoutFragment() removes the URL fragment, so the description must say "fragment", not "query".
// Each dataset row is [input URL, expected URL without fragment].
it('resets any fragment', function (mixed $value, string $expected) {
    expect(UrlRefiner::withoutFragment()->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com/foo#bar', 'https://www.example.com/foo'],
    ['https://www.crwlr.software/#', 'https://www.crwlr.software/'],
]);
// Array values: each URL in the list gets the new fragment.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com/path#foo',
        'https://www.example.com/path#bar',
    ];

    $refined = UrlRefiner::withFragment('#lorem')->refine($urls);

    expect($refined)->toBe(['https://www.example.com/path#lorem', 'https://www.example.com/path#lorem']);
});
================================================
FILE: tests/Steps/Refiners/Url/WithHostTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner UrlRefiner::withHost() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
},
)->with([
[123],
[true],
[new stdClass()],
]);
// Each dataset row is [input (string, Url or PSR-7 URI), expected URL string].
it('replaces the host in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withHost('www.crwlr.software');

    expect($refiner->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com/foo', 'https://www.crwlr.software/foo'],
    ['https://www.crwl.io/bar', 'https://www.crwlr.software/bar'],
    [Url::parse('https://www.crwlr.software/baz'), 'https://www.crwlr.software/baz'],
    [Url::parsePsr7('https://crwl.io/quz'), 'https://www.crwlr.software/quz'],
]);

// Array values: each URL in the list gets the new host.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com/foo',
        'https://www.example.com/bar',
    ];

    $refined = UrlRefiner::withHost('crwl.io')->refine($urls);

    expect($refined)->toBe(['https://crwl.io/foo', 'https://crwl.io/bar']);
});
================================================
FILE: tests/Steps/Refiners/Url/WithPathTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner UrlRefiner::withPath() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
},
)->with([
[123],
[true],
[new stdClass()],
]);
// Each dataset row is [input (string, Url or PSR-7 URI), expected URL string].
it('replaces the path in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withPath('/some/path/123');

    expect($refiner->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com/foo', 'https://www.example.com/some/path/123'],
    ['https://localhost/yo', 'https://localhost/some/path/123'],
    [Url::parse('https://www.crwlr.software/packages'), 'https://www.crwlr.software/some/path/123'],
    [Url::parsePsr7('https://www.crwl.io/'), 'https://www.crwl.io/some/path/123'],
]);

// Array values: each URL in the list gets the new path.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com/foo',
        'https://www.example.com/bar',
    ];

    $refined = UrlRefiner::withPath('/hawedere')->refine($urls);

    expect($refined)->toBe(['https://www.example.com/hawedere', 'https://www.example.com/hawedere']);
});
================================================
FILE: tests/Steps/Refiners/Url/WithPortTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner UrlRefiner::withPort() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
},
)->with([
[123],
[true],
[new stdClass()],
]);
// Each dataset row is [input (string, Url or PSR-7 URI), expected URL string].
it('replaces the port in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withPort(1234);

    expect($refiner->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com:8000/foo', 'https://www.example.com:1234/foo'],
    ['https://localhost:8080/yo', 'https://localhost:1234/yo'],
    [Url::parse('https://www.crwlr.software:5678/bar'), 'https://www.crwlr.software:1234/bar'],
    [Url::parsePsr7('https://crwl.io/quz'), 'https://crwl.io:1234/quz'],
]);

// Array values: each URL in the list gets the new port.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com/foo',
        'https://www.example.com/bar',
    ];

    $refined = UrlRefiner::withPort(1234)->refine($urls);

    expect($refined)->toBe(['https://www.example.com:1234/foo', 'https://www.example.com:1234/bar']);
});
================================================
FILE: tests/Steps/Refiners/Url/WithQueryTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner UrlRefiner::withQuery() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
},
)->with([
[123],
[true],
[new stdClass()],
]);
// Each dataset row is [input (string, Url or PSR-7 URI), expected URL string].
it('replaces the query in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withQuery('a=b&c=d');

    expect($refiner->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com/foo?one=two', 'https://www.example.com/foo?a=b&c=d'],
    ['https://www.example.com/bar', 'https://www.example.com/bar?a=b&c=d'],
    [Url::parse('https://www.crwlr.software/?'), 'https://www.crwlr.software/?a=b&c=d'],
    [Url::parsePsr7('https://www.crwl.io/quz?a=c&b=d'), 'https://www.crwl.io/quz?a=b&c=d'],
]);

// Each dataset row is [input URL, expected URL without query].
it('resets any query', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withoutQuery();

    expect($refiner->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com/foo?one=two', 'https://www.example.com/foo'],
    ['https://www.crwlr.software/?', 'https://www.crwlr.software/'],
]);

// Array values: the query is removed from each URL in the list.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com/foo?one=two',
        'https://www.example.com/bar?three=four',
    ];

    $refined = UrlRefiner::withoutQuery()->refine($urls);

    expect($refined)->toBe(['https://www.example.com/foo', 'https://www.example.com/bar']);
});
================================================
FILE: tests/Steps/Refiners/Url/WithSchemeTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner UrlRefiner::withScheme() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
},
)->with([
[123],
[true],
[new stdClass()],
]);
// Each dataset row is [input (string, Url or PSR-7 URI), expected URL string].
it('replaces the scheme in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withScheme('https');

    expect($refiner->refine($value))->toBe($expected);
})->with([
    ['http://www.example.com/foo', 'https://www.example.com/foo'],
    ['https://www.example.com/foo', 'https://www.example.com/foo'],
    [Url::parse('ftp://www.example.com/bar'), 'https://www.example.com/bar'],
    [Url::parsePsr7('http://www.example.com/baz'), 'https://www.example.com/baz'],
]);

// Array values: each URL in the list gets the new scheme.
it('refines an array of URLs', function () {
    $urls = [
        'http://www.example.com/foo',
        'https://www.example.com/bar',
    ];

    $refined = UrlRefiner::withScheme('https')->refine($urls);

    expect($refined)->toBe(['https://www.example.com/foo', 'https://www.example.com/bar']);
});
================================================
FILE: tests/Steps/Refiners/Url/WithoutPortTest.php
================================================
addLogger(new CliLogger())
->refine($value);
$logOutput = $this->getActualOutputForAssertion();
expect($logOutput)
->toContain('Refiner UrlRefiner::withoutPort() can\'t be applied to value of type ' . gettype($value))
->and($refinedValue)->toBe($value);
},
)->with([
[123],
[true],
[new stdClass()],
]);
// Each dataset row is [input (string, Url or PSR-7 URI), expected URL string without a port].
it('resets the port to null in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withoutPort();

    expect($refiner->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com:8000/foo', 'https://www.example.com/foo'],
    ['http://localhost:8080/yo', 'http://localhost/yo'],
    [Url::parse('https://www.crwlr.software:5678/bar'), 'https://www.crwlr.software/bar'],
    [Url::parsePsr7('https://crwl.io/quz'), 'https://crwl.io/quz'],
]);

// Array values: the port is removed from each URL in the list.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com:8000/foo',
        'https://www.example.com:8080/bar',
    ];

    $refined = UrlRefiner::withoutPort()->refine($urls);

    expect($refined)->toBe(['https://www.example.com/foo', 'https://www.example.com/bar']);
});
================================================
FILE: tests/Steps/Sitemap/GetUrlsFromSitemapTest.php
================================================
https://www.crwlr.software/0.5https://www.crwlr.software/packages0.7https://www.crwlr.software/blog0.7https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-512022-09-03https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php12022-06-02https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-412022-05-10https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-2-and-v0-312022-04-30https://www.crwlr.software/blog/release-of-crwlr-crawler-v-0-1-012022-04-18https://www.crwlr.software/blog/prevent-homograph-attacks-in-user-input-urls12022-01-19
XML;
$outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $xml);
expect($outputs)->toHaveCount(9)
->and($outputs[0]->get())->toBe('https://www.crwlr.software/')
->and($outputs[8]->get())->toBe('https://www.crwlr.software/blog/prevent-homograph-attacks-in-user-input-urls');
});
it('gets all urls with additional data when the withData() method is used', function () {
$xml = <<https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-512022-09-03https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php12022-06-02https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-40.72022-05-10
XML;
$outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap()->withData(), $xml);
expect($outputs)->toHaveCount(3)
->and($outputs[0]->get())->toBe([
'url' => 'https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5',
'lastmod' => '2022-09-03',
'priority' => '1',
])
->and($outputs[1]->get())->toBe([
'url' => 'https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php',
'lastmod' => '2022-06-02',
'priority' => '1',
])
->and($outputs[2]->get())->toBe([
'url' => 'https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4',
'lastmod' => '2022-05-10',
'priority' => '0.7',
]);
});
it('doesn\'t fail when sitemap is empty', function () {
$xml = <<
XML;
$outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap()->withData(), $xml);
expect($outputs)->toHaveCount(0);
});
it(
'doesn\'t fail when the urlset tag contains attributes, that would cause the symfony DomCrawler to not find the ' .
'elements',
function () {
$xml = <<https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-phphttps://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4
XML;
$outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $xml);
expect($outputs)->toHaveCount(3);
},
);
it(
'doesn\'t fail when the urlset tag contains attributes, that would cause the symfony DomCrawler to not find the ' .
'elements, when the XML content has no line breaks',
function () {
$xml = <<https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-phphttps://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4
XML;
$outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $xml);
expect($outputs)->toHaveCount(3);
},
);
================================================
FILE: tests/Steps/StepTest.php
================================================
*/
protected function invoke(mixed $input): Generator
{
$this->logger?->info('logging works');
yield 'something';
}
};
$step->addLogger(new CliLogger());
helper_traverseIterable($step->invokeStep(new Input('test')));
$output = $this->getActualOutputForAssertion();
expect($output)->toContain('logging works');
});
// A single yielded value must come back as exactly one Output object carrying that value.
test('The invokeStep method wraps the values returned by invoke in Output objects', function () {
    $outputs = helper_invokeStepWithInput(helper_getValueReturningStep('returnValue'));

    expect($outputs)->toHaveCount(1);

    expect($outputs[0])->toBeInstanceOf(Output::class);

    expect($outputs[0]->get())->toBe('returnValue');
});
/* ------------------------------- keep() ------------------------------- */
// 'users.0.user' addresses the "user" key of the first element of the "users" list.
test('keep() can pick keys from nested (array) output using dot notation', function () {
    $stepOutput = [
        'users' => [
            ['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear'],
            ['user' => 'juerx', 'firstname' => 'Jürgen', 'surname' => 'Müller'],
            ['user' => 'sandy', 'firstname' => 'Sandra', 'surname' => 'Mayr'],
        ],
        'foo' => 'bar',
    ];

    $step = helper_getValueReturningStep($stepOutput)->keep(['nickname' => 'users.0.user', 'foo']);

    $outputs = helper_invokeStepWithInput($step);

    expect($outputs[0]->keep)->toBe(['nickname' => 'otsch', 'foo' => 'bar']);
});
// 'response.body' reaches into the RespondedRequest object stored under the "response" key.
test('keep() picks keys from nested output including a RespondedRequest object', function () {
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/something'),
        new Response(200, body: 'Hi :)'),
    );

    $step = helper_getValueReturningStep(['response' => $respondedRequest, 'foo' => 'bar'])
        ->keep(['content' => 'response.body']);

    $outputs = helper_invokeStepWithInput($step);

    expect($outputs[0]->keep)->toBe(['content' => 'Hi :)']);
});
// In the array passed to keep(), the key is the name to keep under and the value is the output key.
it('maps output keys to different keys when defined in the array passed to keep()', function () {
    $person = ['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear'];

    $step = helper_getValueReturningStep($person)->keep(['foo' => 'firstname', 'bar' => 'surname']);

    $outputs = helper_invokeStepWithInput($step);

    expect($outputs[0]->keep)->toBe(['foo' => 'Christian', 'bar' => 'Olear']);
});
/* ------------------------------- useInputKey() ------------------------------- */
// The step returns its input, so the output shows which part of the input array was used.
it('uses a key from array input when defined', function () {
    $input = new Input(
        ['foo' => 'fooValue', 'bar' => 'barValue', 'baz' => 'bazValue'],
    );

    $outputs = helper_invokeStepWithInput(helper_getInputReturningStep()->useInputKey('bar'), $input);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('barValue');
});
// A missing input key produces no output and a warning in the CLI log.
it('logs a warning message when the input key to use does not exist in input array', function () {
    $step = helper_getInputReturningStep()->useInputKey('baz');

    $step->addLogger(new CliLogger());

    $outputs = helper_invokeStepWithInput($step, new Input(['foo' => 'one', 'bar' => 'two']));

    expect($outputs)->toHaveCount(0);

    expect($this->getActualOutputForAssertion())
        ->toContain('Can\'t get key from input, because it does not exist.');
});
// The dataset supplies non-array inputs; the warning must name the actual type of the input.
it(
    'logs a warning message when useInputKey() was called but the input value is not an array',
    function (mixed $inputValue) {
        $step = helper_getInputReturningStep()->useInputKey('baz');

        $step->addLogger(new CliLogger());

        $outputs = helper_invokeStepWithInput($step, new Input($inputValue));

        expect($outputs)->toHaveCount(0);

        expect($this->getActualOutputForAssertion())->toContain(
            'Can\'t get key from input, because input is of type ' . gettype($inputValue) . ' instead of array.',
        );
    },
)->with([
    ['string'],
    [0],
    [new stdClass()],
]);
// The Input already carries kept data (['some' => 'thing']); it must still be on the output.
it('does not lose previously kept data, when it uses the useInputKey() method', function () {
    $input = new Input(['foo' => 'test'], ['some' => 'thing']);

    $outputs = helper_invokeStepWithInput(
        helper_getValueReturningStep(['test' => 'test'])->useInputKey('foo'),
        $input,
    );

    expect($outputs[0]->keep)->toBe(['some' => 'thing']);
});
// keepFromInput() must keep the whole original input array, not only the value picked by useInputKey().
it('keeps the original input data when useInputKey() is used', function () {
    $step = helper_getValueReturningStep(['baz' => 'three']);

    $step = $step->keepFromInput()->useInputKey('bar');

    $outputs = helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => 'two']);

    expect($outputs[0]->get())->toBe(['baz' => 'three']);

    expect($outputs[0]->keep)->toBe(['foo' => 'one', 'bar' => 'two']);
});
// The input value is a plain string; the key 'bar' exists only in the kept data of the Input.
test('useInputKey() can be used to get data that was kept from a previous step with keep() or keepAs()', function () {
    $step = helper_getInputReturningStep();

    $step->useInputKey('bar');

    $input = new Input('value', keep: ['bar' => 'baz']);

    $outputs = helper_invokeStepWithInput($step, $input);

    expect($outputs[0]->get())->toBe('baz');
});
// The step defines nothing to keep; kept data on the incoming Input must still reach the output.
it(
    'also passes on kept data through further steps when they don\'t define any further data to keep',
    function () {
        $input = new Input('inputValue', ['prevProperty' => 'foobar']);

        $outputs = helper_invokeStepWithInput(helper_getValueReturningStep('returnValue'), $input);

        expect($outputs)->toHaveCount(1);

        expect($outputs[0]->keep)->toBe(['prevProperty' => 'foobar']);
    },
);
/* ------------------------------- uniqueInputs() ------------------------------- */
it('doesn\'t invoke twice with duplicate inputs when uniqueInput was called', function () {
    $step = helper_getInputReturningStep();

    // Before uniqueInputs() is called, a repeated input still yields an output every time.
    foreach ([1, 1] as $expectedCount) {
        expect(helper_invokeStepWithInput($step, 'foo'))->toHaveCount($expectedCount);
    }

    $step->uniqueInputs();

    // Afterwards only the first occurrence of the input yields an output.
    foreach ([1, 0] as $expectedCount) {
        expect(helper_invokeStepWithInput($step, 'foo'))->toHaveCount($expectedCount);
    }
});
it(
    'doesn\'t invoke twice with inputs with the same value in an array key when uniqueInput was called with that key',
    function () {
        $firstInput = ['foo' => 'bar', 'number' => 1];

        $secondInput = ['foo' => 'bar', 'number' => 2];

        $step = helper_getInputReturningStep();

        $step->uniqueInputs();

        // Without a key, both inputs yield an output.
        expect(helper_invokeStepWithInput($step, $firstInput))->toHaveCount(1);

        expect(helper_invokeStepWithInput($step, $secondInput))->toHaveCount(1);

        $step->resetAfterRun();

        $step->uniqueInputs('foo');

        // With the key "foo", the second input shares the value "bar" and yields nothing.
        expect(helper_invokeStepWithInput($step, $firstInput))->toHaveCount(1);

        expect(helper_invokeStepWithInput($step, $secondInput))->toHaveCount(0);
    },
);
it(
    'doesn\'t invoke twice with inputs with the same value in an object key when uniqueInput was called with that key',
    function () {
        // Creates a fresh object for every invocation, so no instance is shared between calls.
        $makeInput = fn(int $number) => helper_getStdClassWithData(['foo' => 'bar', 'number' => $number]);

        $step = helper_getInputReturningStep();

        $step->uniqueInputs();

        // Without a key, both inputs yield an output.
        expect(helper_invokeStepWithInput($step, $makeInput(1)))->toHaveCount(1);

        expect(helper_invokeStepWithInput($step, $makeInput(2)))->toHaveCount(1);

        $step->resetAfterRun();

        $step->uniqueInputs('foo');

        // With the key "foo", the second object shares the value "bar" and yields nothing.
        expect(helper_invokeStepWithInput($step, $makeInput(1)))->toHaveCount(1);

        expect(helper_invokeStepWithInput($step, $makeInput(2)))->toHaveCount(0);
    },
);
/* ------------------------------- uniqueOutputs() ------------------------------- */
it('makes outputs unique when uniqueOutput was called', function () {
    $step = helper_getStepYieldingMultipleNumbers();

    $step->uniqueOutputs();

    $results = helper_invokeStepWithInput($step, new Input('anything'));

    expect($results)->toHaveCount(5);

    // Every number is expected exactly once, in this order.
    foreach (['one', 'two', 'three', 'four', 'five'] as $index => $expectedValue) {
        expect($results[$index]->get())->toBe($expectedValue);
    }
});
it('makes outputs unique when providing a key name to uniqueOutput to use from array output', function () {
    $step = helper_getStepYieldingMultipleArraysWithNumber();

    // Only the "number" key of the yielded arrays is used for the uniqueness check.
    $step->uniqueOutputs('number');

    expect(helper_invokeStepWithInput($step, new Input('anything')))->toHaveCount(5);
});
it('makes outputs unique when providing a key name to uniqueOutput to use from object output', function () {
    $step = helper_getStepYieldingMultipleObjectsWithNumber();

    // Only the "number" property of the yielded objects is used for the uniqueness check.
    $step->uniqueOutputs('number');

    expect(helper_invokeStepWithInput($step, new Input('anything')))->toHaveCount(5);
});
it('makes array outputs unique when providing no key name to uniqueOutput', function () {
    $step = helper_getStepYieldingMultipleArraysWithNumber();

    $step->uniqueOutputs();

    // Input value => number of outputs expected to remain after the uniqueness check.
    $expectations = [
        [false, 5],
        [true, 8],
    ];

    foreach ($expectations as [$inputValue, $expectedCount]) {
        $results = helper_invokeStepWithInput($step, new Input($inputValue));

        expect($results)->toHaveCount($expectedCount);
    }
});
it('makes object outputs unique when providing no key name to uniqueOutput', function () {
    // Use the step yielding objects. Using the array yielding step here would only repeat the array test
    // and leave the object case, which this test is named after, uncovered.
    // NOTE(review): this assumes the object yielding helper varies its outputs depending on the boolean
    // input in the same way as the array yielding helper does - confirm against the helper definition.
    $step = helper_getStepYieldingMultipleObjectsWithNumber();

    $step->uniqueOutputs();

    $output = helper_invokeStepWithInput($step, new Input(false));

    expect($output)->toHaveCount(5);

    $output = helper_invokeStepWithInput($step, new Input(true));

    expect($output)->toHaveCount(8);
});
/* ----------------------------- oneOutputPerInput() ----------------------------- */
test(
    'when a step yields multiple outputs per input and the oneOutputPerInput() method was called, the step yields it ' .
    'as a single output with an array of all the single output values',
    function () {
        $values = ['foo', 'bar', 'baz'];

        $step = helper_getStepYieldingInputArrayAsSeparateOutputs();

        $step->oneOutputPerInput();

        $results = helper_invokeStepWithInput($step, $values);

        // All values arrive bundled in one single output.
        expect($results)->toHaveCount(1);

        expect($results[0]->get())->toBe(['foo', 'bar', 'baz']);
    },
);
test('when using oneOutputPerInput(), the combined output counts as one output for the max outputs limit', function () {
    $values = ['foo', 'bar', 'baz'];

    $step = helper_getStepYieldingInputArrayAsSeparateOutputs();

    $step->oneOutputPerInput()->maxOutputs(2);

    // The first two invocations each produce one combined output.
    for ($run = 1; $run <= 2; $run++) {
        $results = helper_invokeStepWithInput($step, $values);

        expect($results)->toHaveCount(1);

        expect($results[0]->get())->toBe(['foo', 'bar', 'baz']);
    }

    // The limit of two outputs is reached, so a third invocation yields nothing.
    expect(helper_invokeStepWithInput($step, $values))->toHaveCount(0);
});
test('when using oneOutputPerInput(), refiners are applied to the single elements of the combined output', function () {
    $appendHey = fn(mixed $outputValue) => $outputValue . '-hey';

    $step = helper_getStepYieldingInputArrayAsSeparateOutputs();

    $step->oneOutputPerInput()->refineOutput('title', $appendHey);

    $input = [
        ['title' => 'foo'],
        ['title' => 'bar'],
        ['title' => 'baz'],
    ];

    $results = helper_invokeStepWithInput($step, $input);

    expect($results)->toHaveCount(1);

    // The refiner was applied to the "title" of every element of the combined output.
    expect($results[0]->get())->toBe([
        ['title' => 'foo-hey'],
        ['title' => 'bar-hey'],
        ['title' => 'baz-hey'],
    ]);
});
test('when using oneOutputPerInput(), filters are applied to the single elements of the combined output', function () {
    $input = [
        ['title' => 'foo', 'id' => 109],
        ['title' => 'bar', 'id' => 110],
        ['title' => 'baz', 'id' => 111],
    ];

    $step = helper_getStepYieldingInputArrayAsSeparateOutputs();

    $step->where('id', Filter::greaterThan(109))->oneOutputPerInput();

    $results = helper_invokeStepWithInput($step, $input);

    expect($results)->toHaveCount(1);

    // The element with id 109 is filtered out, the other two remain in the combined output.
    expect($results[0]->get())->toBe([
        ['title' => 'bar', 'id' => 110],
        ['title' => 'baz', 'id' => 111],
    ]);
});
test(
    'when using oneOutputPerInput() in combination with outputKey(), the whole combined output is returned in an ' .
    'array with the defined key',
    function () {
        $values = ['foo', 'bar', 'baz'];

        $step = helper_getStepYieldingInputArrayAsSeparateOutputs();

        $step->outputKey('test')->oneOutputPerInput();

        $results = helper_invokeStepWithInput($step, $values);

        expect($results)->toHaveCount(1);

        // The combined array as a whole is wrapped in the "test" key.
        expect($results[0]->get())->toBe(['test' => ['foo', 'bar', 'baz']]);
    },
);
test(
    'when using oneOutputPerInput() in combination with uniqueOutputs(), the whole combined output is compared',
    function () {
        $step = helper_getStepYieldingInputArrayAsSeparateOutputs();

        $step->oneOutputPerInput()->uniqueOutputs();

        // Two different combined outputs are both yielded.
        foreach ([['foo', 'bar', 'baz'], ['foo', 'bar', 'quz']] as $values) {
            $results = helper_invokeStepWithInput($step, $values);

            expect($results)->toHaveCount(1);

            expect($results[0]->get())->toBe($values);
        }

        // Repeating the first combined output yields nothing.
        expect(helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']))->toHaveCount(0);
    },
);
/* -------------------------- validateAndSanitizeInput() -------------------------- */
it('calls the validateAndSanitizeInput method', function () {
    $step = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield $input;
        }

        protected function validateAndSanitizeInput(mixed $input): string
        {
            return $input . ' validated and sanitized';
        }
    };

    $results = helper_invokeStepWithInput($step, 'inputValue');

    // invoke() just passes its input on, so the suffix proves that the input was sanitized beforehand.
    expect($results[0]->get())->toBe('inputValue validated and sanitized');
});
test(
    'when calling validateAndSanitizeStringOrStringable() and the input is array with a single element it tries to ' .
    'use that element as input value',
    function () {
        $step = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }

            protected function validateAndSanitizeInput(mixed $input): string
            {
                return $this->validateAndSanitizeStringOrStringable($input);
            }
        };

        $results = helper_invokeStepWithInput($step, ['inputValue']);

        // The only array element is expected to be used as the input value.
        expect($results[0]->get())->toBe('inputValue');
    },
);
test(
    'when calling validateAndSanitizeStringOrStringable() and the input is array with multiple elements it logs ' .
    'an error message',
    function () {
        $step = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }

            protected function validateAndSanitizeInput(mixed $input): string
            {
                return $this->validateAndSanitizeStringOrStringable($input);
            }
        };

        $logger = new DummyLogger();

        $step->addLogger($logger);

        helper_invokeStepWithInput($step, ['inputValue', 'foo' => 'bar']);

        expect($logger->messages)->not->toBeEmpty();

        $firstMessage = $logger->messages[0]['message'];

        // Only the beginning and the end of the message are checked.
        expect($firstMessage)->toStartWith('A step was called with input that it can not work with:');

        expect($firstMessage)->toEndWith('. The invalid input is of type array.');
    },
);
test(
    'when throwing an InvalidArgumentException from the validateAndSanitizeInput() it is caught and logged as an error',
    function () {
        $step = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }

            protected function validateAndSanitizeInput(mixed $input): string
            {
                throw new InvalidArgumentException('hey :)');
            }
        };

        $logger = new DummyLogger();

        $step->addLogger($logger);

        $results = helper_invokeStepWithInput($step, 'anything');

        // No output is produced and the exception message ends up in the log.
        expect($results)->toBeEmpty();

        expect($logger->messages)->not->toBeEmpty();

        expect($logger->messages[0]['message'])->toBe(
            'A step was called with input that it can not work with: hey :)',
        );
    },
);
test(
    'when throwing an Exception that is not an InvalidArgumentException, from the validateAndSanitizeInput() it is ' .
    'not caught',
    function () {
        $step = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }

            protected function validateAndSanitizeInput(mixed $input): string
            {
                throw new Exception('hey :)');
            }
        };

        $step->addLogger(new DummyLogger());

        // The exception is expected to bubble up, see the throws() expectation below.
        helper_invokeStepWithInput($step, 'anything');
    },
)->throws(Exception::class);
it('is possible that a step does not produce any output at all', function () {
    $step = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            // Any input other than "foo" ends the generator without yielding anything.
            if ($input !== 'foo') {
                return;
            }

            yield 'bar';
        }
    };

    expect(helper_invokeStepWithInput($step, 'lol'))->toHaveCount(0);

    $results = helper_invokeStepWithInput($step, 'foo');

    expect($results)->toHaveCount(1);

    expect($results[0]->get())->toBe('bar');
});
/* --------------------------- updateInputUsingOutput() --------------------------- */
test('You can add and call an updateInputUsingOutput callback', function () {
    $step = helper_getValueReturningStep('something');

    // The callback joins the input and output values, separated by a space.
    $step->updateInputUsingOutput(fn(mixed $input, mixed $output) => $input . ' ' . $output);

    $result = $step->callUpdateInputUsingOutput(new Input('Boo'), new Output('Yah!'));

    expect($result)->toBeInstanceOf(Input::class);

    expect($result->get())->toBe('Boo Yah!');
});
it('does not lose previously kept data, when updateInputUsingOutput() is called', function () {
    $step = helper_getValueReturningStep('something');

    $step->updateInputUsingOutput(fn(mixed $input, mixed $output) => $input . ' ' . $output);

    $inputWithKeptData = new Input('Some', ['foo' => 'bar']);

    $result = $step->callUpdateInputUsingOutput($inputWithKeptData, new Output('thing'));

    // The kept data of the original input is expected on the updated input as well.
    expect($result->keep)->toBe(['foo' => 'bar']);
});
/* -------------------------------- maxOutputs() -------------------------------- */
it('does not yield more outputs than defined via maxOutputs() method', function () {
    $step = helper_getValueReturningStep('yolo')->maxOutputs(3);

    // Five invocations: the first three yield one output each, the last two yield nothing.
    foreach ([1, 1, 1, 0, 0] as $expectedCount) {
        $results = helper_invokeStepWithInput($step, new Input('asdf'));

        expect($results)->toHaveCount($expectedCount);
    }
});
it(
    'does not yield more outputs than defined via maxOutputs() when step yields multiple outputs per input and the ' .
    'limit is reached in the middle of the outputs resulting from one input',
    function () {
        $step = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield from ['one', 'two', 'three'];
            }
        };

        $step->maxOutputs(7);

        // Three outputs per input: 3 + 3 + 1 = 7, so the third input is cut off after its first output.
        foreach (['a' => 3, 'b' => 3, 'c' => 1] as $inputValue => $expectedCount) {
            $results = helper_invokeStepWithInput($step, new Input($inputValue));

            expect($results)->toHaveCount($expectedCount);
        }
    },
);
test('When a step has max outputs defined, it won\'t call the invoke method after the limit was reached', function () {
    $step = new class extends Step {
        public int $_invokeCallCount = 0;

        protected function invoke(mixed $input): Generator
        {
            // Count every actual call of invoke(), so the test can check how often it ran.
            $this->_invokeCallCount++;

            yield 'something';
        }
    };

    $step->maxOutputs(2);

    foreach (['one', 'two', 'three', 'four'] as $inputValue) {
        helper_invokeStepWithInput($step, new Input($inputValue));
    }

    expect($step->_invokeCallCount)->toBe(2);
});
it('resets outputs count for maxOutputs rule when resetAfterRun() is called', function () {
    $step = helper_getValueReturningStep('gogogo')->maxOutputs(2);

    // Use up the limit of two outputs.
    foreach (['one', 'two'] as $inputValue) {
        helper_invokeStepWithInput($step, new Input($inputValue));
    }

    $step->resetAfterRun();

    $resultsAfterReset = helper_invokeStepWithInput($step, new Input('three'));

    expect($resultsAfterReset)->toHaveCount(1);
});
/* -------------------------------- outputKey() -------------------------------- */
it('converts non array output to array with a certain key using the outputKey() method', function () {
    $results = helper_invokeStepWithInput(
        helper_getValueReturningStep('bar')->outputKey('foo'),
    );

    // The scalar output "bar" is wrapped in an array under the key "foo".
    expect($results[0]->get())->toBe(['foo' => 'bar']);
});
test('keeping a scalar output value with keep() also works when outputKey() was used', function () {
    $step = new class extends Step {
        public function outputType(): StepOutputType
        {
            return StepOutputType::Scalar;
        }

        protected function invoke(mixed $input): Generator
        {
            yield 'hey';
        }
    };

    $step->outputKey('greeting')->keep();

    // The step declares scalar output; validateBeforeRun() is called to check that keep() is still accepted.
    $step->validateBeforeRun(Http::get());

    $results = helper_invokeStepWithInput($step, 'guten tag');

    expect($results[0]->get())->toBe(['greeting' => 'hey']);
});
/* -------------------------------- refineOutput() -------------------------------- */
it('applies a Closure refiner to the steps output', function () {
    $step = helper_getValueReturningStep('output');

    // The refiner appends a suffix to the output value.
    $step->refineOutput(fn(mixed $outputValue) => $outputValue . ' refined');

    $results = helper_invokeStepWithInput($step);

    expect($results[0]->get())->toBe('output refined');
});
it('applies an instance of the RefinerInterface to the steps output', function () {
    $refiner = StringRefiner::betweenFirst('foo', 'baz');

    $step = helper_getInputReturningStep();

    $step->refineOutput($refiner);

    $results = helper_invokeStepWithInput($step, 'foo bar baz');

    // Only the part between "foo" and "baz" remains.
    expect($results[0]->get())->toBe('bar');
});
it('applies multiple refiners to the steps output in the order they\'re added', function () {
    $step = helper_getInputReturningStep();

    // Each refiner receives the result of the one added before it.
    $step
        ->refineOutput(StringRefiner::betweenFirst('foo', 'baz'))
        ->refineOutput(fn(mixed $outputValue) => $outputValue . ' refined')
        ->refineOutput(fn(mixed $outputValue) => $outputValue . ', and refined further');

    $results = helper_invokeStepWithInput($step, 'foo bar baz');

    expect($results[0]->get())->toBe('bar refined, and refined further');
});
it('applies refiners to certain keys from array output when the key is provided', function () {
    $input = ['foo' => 'lorem ipsum dolor', 'bar' => 'bla', 'baz' => 'quz'];

    $step = helper_getInputReturningStep();

    $step
        ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'))
        ->refineOutput('baz', fn(mixed $outputValue) => 'refined ' . $outputValue);

    $results = helper_invokeStepWithInput($step, $input);

    // "bar" has no refiner and therefore stays untouched.
    expect($results[0]->get())->toBe([
        'foo' => 'ipsum',
        'bar' => 'bla',
        'baz' => 'refined quz',
    ]);
});
test('you can apply multiple refiners to the same output array key', function () {
    $input = ['foo' => 'lorem ipsum dolor', 'bar' => 'bla'];

    $step = helper_getInputReturningStep();

    // Both refiners target the key "foo".
    $step
        ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'))
        ->refineOutput('foo', fn(mixed $outputValue) => $outputValue . ' yolo');

    $results = helper_invokeStepWithInput($step, $input);

    expect($results[0]->get())->toBe([
        'foo' => 'ipsum yolo',
        'bar' => 'bla',
    ]);
});
it(
    'uses the original input value when applying a refiner, not only the value of an input array key chosen via ' .
    'useInputKey()',
    function () {
        $originalInput = ['foo' => 'one', 'bar' => 'two'];

        $step = helper_getInputReturningStep();

        // The refiner discards the output and returns the input value it received as second argument.
        $step
            ->useInputKey('bar')
            ->refineOutput(fn(mixed $outputValue, mixed $originalInputValue) => $originalInputValue);

        $results = helper_invokeStepWithInput($step, $originalInput);

        expect($results[0]->get())->toBe(['foo' => 'one', 'bar' => 'two']);
    },
);
/* ------------------------------- outputKeyAliases() ------------------------------- */
test('you can define aliases for output keys and they are considered when using keep()', function () {
    $step = new class extends Step {
        protected function outputKeyAliases(): array
        {
            return [
                'woo' => 'foo',
                'war' => 'bar',
                'waz' => 'baz',
            ];
        }

        protected function invoke(mixed $input): Generator
        {
            yield [
                'foo' => 'one',
                'bar' => 'two',
                'baz' => 'three',
            ];
        }
    };

    // Only alias names are used here; "war" is additionally kept under the new name "far".
    $step->keep(['woo', 'far' => 'war', 'waz']);

    $results = helper_invokeStepWithInput($step);

    expect($results[0]->keep)->toBe([
        'woo' => 'one',
        'far' => 'two',
        'waz' => 'three',
    ]);
});
test('you can filter outputs using an output key alias', function () {
    $step = new class extends Step {
        protected function outputKeyAliases(): array
        {
            return [
                'baz' => 'bar',
            ];
        }

        protected function invoke(mixed $input): Generator
        {
            yield [
                'foo' => 'one',
                'bar' => 'two',
            ];
        }
    };

    // "baz" is only an alias for "bar", whose value is "two".
    $step->where('baz', Filter::equal('two'));

    $results = helper_invokeStepWithInput($step);

    expect($results[0])->toBeInstanceOf(Output::class);
});
it('can filter by a key that only exists in the serialized version of an output object', function () {
    $step = new class extends Step {
        protected function outputKeyAliases(): array
        {
            return [
                'quz' => 'baz',
            ];
        }

        protected function invoke(mixed $input): Generator
        {
            yield new class {
                public string $foo = 'one';

                public string $bar = 'two';

                /**
                 * The key "baz" is no property of the object, it only exists in this serialized array.
                 *
                 * @return string[]
                 */
                public function __serialize(): array
                {
                    return [
                        'foo' => $this->foo,
                        'bar' => $this->bar,
                        'baz' => $this->bar,
                    ];
                }
            };
        }
    };

    // "quz" is an alias for "baz".
    $step->where('quz', Filter::equal('two'));

    $results = helper_invokeStepWithInput($step);

    expect($results[0])->toBeInstanceOf(Output::class);
});
================================================
FILE: tests/Steps/XmlTest.php
================================================
extract('title'),
helper_getStepFilesContent('Xml/bookstore.xml'),
);
expect($output)->toHaveCount(4)
->and($output[0]->get())->toBe('Everyday Italian')
->and($output[3]->get())->toBe('Learning XML');
});
it('extracts data from an XML document with CSS selectors per default', function () {
    // All selectors in this test are plain strings. The previous description claimed that XPath queries are the
    // default, but explicit XPath queries, created with Dom::xPath(), are what the following test is about.
    $output = helper_invokeStepWithInput(
        Xml::each('bookstore book')->extract([
            'title' => 'title',
            'author' => 'author',
            'year' => 'year',
        ]),
        helper_getStepFilesContent('Xml/bookstore.xml'),
    );

    expect($output)->toHaveCount(4)
        ->and($output[0]->get())->toBe(
            ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'],
        )
        ->and($output[1]->get())->toBe(['title' => 'Harry Potter', 'author' => 'J K. Rowling', 'year' => '2005'])
        // A book with multiple author elements gets an array of all the authors.
        ->and($output[2]->get())->toBe(
            [
                'title' => 'XQuery Kick Start',
                'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'],
                'year' => '2003',
            ],
        )
        ->and($output[3]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']);
});
it('can also extract data using XPath queries', function () {
    $step = Xml::each(Dom::xPath('//bookstore/book'))->extract([
        'title' => Dom::xPath('//title'),
        'author' => Dom::xPath('//author'),
        'year' => Dom::xPath('//year'),
    ]);

    $results = helper_invokeStepWithInput($step, helper_getStepFilesContent('Xml/bookstore.xml'));

    expect($results)->toHaveCount(4);

    // Only the third book, the one with multiple authors, is checked in detail.
    expect($results[2]->get())->toBe(
        [
            'title' => 'XQuery Kick Start',
            'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'],
            'year' => '2003',
        ],
    );
});
it('returns only one (compound) output when the root method is used', function () {
    $step = Xml::root()->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']);

    $results = helper_invokeStepWithInput($step, helper_getStepFilesContent('Xml/bookstore.xml'));

    expect($results)->toHaveCount(1);

    // The titles of all books are collected in the single output.
    $titles = $results[0]->get()['title'];

    expect($titles)->toBe(['Everyday Italian', 'Harry Potter', 'XQuery Kick Start', 'Learning XML']);
});
it('extracts the data of the first matching element when the first method is used', function () {
    $step = Xml::first('bookstore book')->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']);

    $results = helper_invokeStepWithInput($step, helper_getStepFilesContent('Xml/bookstore.xml'));

    expect($results)->toHaveCount(1);

    expect($results[0]->get())->toBe(
        ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'],
    );
});
it('extracts the data of the last matching element when the last method is used', function () {
    $step = Xml::last('bookstore book')->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']);

    $results = helper_invokeStepWithInput($step, helper_getStepFilesContent('Xml/bookstore.xml'));

    expect($results)->toHaveCount(1);

    expect($results[0]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']);
});
test(
    'you can extract data in a second level to the output array using another Xml step as an element in the mapping ' .
    'array',
    function () {
        $response = new RespondedRequest(
            new Request('GET', 'https://www.example.com/events.xml'),
            new Response(body: helper_getStepFilesContent('Xml/events.xml')),
        );

        // The "talks" key is filled by a nested Xml step instead of a simple selector.
        $talksStep = Xml::each('talks talk')->extract([
            'title' => 'title',
            'speaker' => 'speaker',
        ]);

        $eventsStep = Xml::each('events event')->extract([
            'title' => 'name',
            'location' => 'location',
            'date' => 'date',
            'talks' => $talksStep,
        ]);

        $results = helper_invokeStepWithInput($eventsStep, $response);

        $expectedFirstEvent = [
            'title' => 'Some Meetup',
            'location' => 'Somewhere',
            'date' => '2023-01-14 20:00',
            'talks' => [
                [
                    'title' => 'Sophisticated talk title',
                    'speaker' => 'Super Mario',
                ],
                [
                    'title' => 'Fun talk',
                    'speaker' => 'Princess Peach',
                ],
            ],
        ];

        $expectedSecondEvent = [
            'title' => 'Another Meetup',
            'location' => 'Somewhere else',
            'date' => '2023-01-21 19:00',
            'talks' => [
                [
                    'title' => 'Join the dark side',
                    'speaker' => 'Wario',
                ],
                [
                    'title' => 'Let\'s go',
                    'speaker' => 'Yoshi',
                ],
            ],
        ];

        expect($results)->toHaveCount(2);

        expect($results[0]->get())->toBe($expectedFirstEvent);

        expect($results[1]->get())->toBe($expectedSecondEvent);
    },
);
test(
    'When a child step is nested in the extraction and does not use each(), the extracted value is an array with ' .
    'the keys defined in extract(), rather than an array of such arrays as it would be with each().',
    function () {
        // The XML document is restored as a valid heredoc. Its structure follows from the selectors and the
        // expected values used below: companies > company with name, founded (year attribute) and
        // location > country/city.
        $xml = <<<XML
            <?xml version="1.0" encoding="UTF-8"?>
            <companies>
                <company>
                    <name>ABCDEFGmbH</name>
                    <founded year="1984">foo</founded>
                    <location>
                        <country>Germany</country>
                        <city>Frankfurt</city>
                    </location>
                </company>
                <company>
                    <name>Saubär GmbH</name>
                    <founded year="2014">bar</founded>
                    <location>
                        <country>Austria</country>
                        <city>Klagenfurt</city>
                    </location>
                </company>
            </companies>
            XML;

        $expectedCompany1 = [
            'name' => 'ABCDEFGmbH',
            'founded' => '1984',
            'location' => ['country' => 'Germany', 'city' => 'Frankfurt'],
        ];

        $expectedCompany2 = [
            'name' => 'Saubär GmbH',
            'founded' => '2014',
            'location' => ['country' => 'Austria', 'city' => 'Klagenfurt'],
        ];

        // With base root()
        $step = Xml::each(Dom::xPath('//companies/company'))->extract([
            'name' => Dom::cssSelector('name')->text(),
            'founded' => Dom::xPath('//founded')->attribute('year'),
            'location' => Xml::root()->extract([
                'country' => Dom::xPath('//location/country')->text(),
                'city' => Dom::cssSelector('location city')->text(),
            ]),
        ]);

        $outputs = helper_invokeStepWithInput($step, $xml);

        expect($outputs)->toHaveCount(2)
            ->and($outputs[0]->get())->toBe($expectedCompany1)
            ->and($outputs[1]->get())->toBe($expectedCompany2);

        // With base first()
        $step = Xml::each(Dom::xPath('//companies/company'))->extract([
            'name' => Dom::cssSelector('name')->text(),
            'founded' => Dom::xPath('//founded')->attribute('year'),
            'location' => Xml::first(Dom::cssSelector('location'))->extract([
                'country' => Dom::xPath('//country')->text(),
                'city' => Dom::cssSelector('city')->text(),
            ]),
        ]);

        $outputs = helper_invokeStepWithInput($step, $xml);

        expect($outputs)->toHaveCount(2)
            ->and($outputs[0]->get())->toBe($expectedCompany1)
            ->and($outputs[1]->get())->toBe($expectedCompany2);

        // With base last()
        $step = Xml::each(Dom::xPath('//companies/company'))->extract([
            'name' => Dom::cssSelector('name')->text(),
            'founded' => Dom::xPath('//founded')->attribute('year'),
            'location' => Xml::last(Dom::cssSelector('location'))->extract([
                'country' => Dom::xPath('//country')->text(),
                'city' => Dom::cssSelector('city')->text(),
            ]),
        ]);

        $outputs = helper_invokeStepWithInput($step, $xml);

        expect($outputs)->toHaveCount(2)
            ->and($outputs[0]->get())->toBe($expectedCompany1)
            ->and($outputs[1]->get())->toBe($expectedCompany2);
    },
);
it('works when the response string starts with an UTF-8 byte order mark character', function () {
    $request = new Request('GET', 'https://www.example.com/rss');

    $response = new Response(body: helper_getStepFilesContent('Xml/rss-with-bom.xml'));

    $step = Xml::each('channel item')->extract([
        'url' => 'link',
        'title' => 'title',
    ]);

    $results = helper_invokeStepWithInput($step, new RespondedRequest($request, $response));

    expect($results[0]->get())->toBe([
        'url' => 'https://www.example.com/story/1234567/foo-bar-baz?ref=rss',
        'title' => 'Some title',
    ]);
});
test(
'when selecting elements with each(), you can reference the element already selected within the each() selector ' .
'itself, in sub selectors',
function () {
// XML document that is used as the response body below.
$xml = <<123456
XML;
$response = new RespondedRequest(
new Request('GET', 'https://www.example.com/foo'),
new Response(body: $xml),
);
$output = helper_invokeStepWithInput(
Xml::each('data items item')->extract([
// This is what this test is about. The element already selected in each (item) can be
// referenced in these child selectors.
'id' => Dom::cssSelector('item > id'),
// An empty selector is used here to read the "attr" attribute.
// NOTE(review): presumably the empty selector targets the item element itself - confirm.
'attribute' => Dom::cssSelector('')->attribute('attr'),
]),
$response,
);
// Exactly one output with the id and the attribute value is expected.
expect($output)->toHaveCount(1)
->and($output[0]->get())->toBe(['id' => '123', 'attribute' => 'abc']);
},
);
it('works with tags with camelCase names', function () {
// XML document that is used as the response body below.
$xml = <<foofooabc-1232024-11-07T11:00:31ZFoo bar baz!https://www.example.com/item-1?utm_source=foo&utm_medium=feed-xmltest
XML;
$response = new RespondedRequest(
new Request('GET', 'https://www.example.com/xml-feed'),
new Response(body: $xml),
);
$outputs = helper_invokeStepWithInput(
Xml::each(Dom::cssSelector('feed items item'))->extract([
'title' => 'title',
// The following selectors contain upper case characters ("someUrl", "baRbaz").
'some-url' => 'someUrl',
'foo-bar-baz' => 'foo baRbaz',
]),
$response,
);
// All three values are expected to be found, despite the mixed case tag names in the selectors.
expect($outputs[0]->get())->toBe([
'title' => 'Foo bar baz!',
'some-url' => 'https://www.example.com/item-1?utm_source=foo&utm_medium=feed-xml',
'foo-bar-baz' => 'test',
]);
// The test is assigned to the "php84" group.
})->group('php84');
================================================
FILE: tests/Steps/_Files/Csv/basic.csv
================================================
123,"Otsch","https://www.otsch.codes"
234,"John Doe","https://www.john.doe"
345,"Jane Doe","https://www.jane.doe"
================================================
FILE: tests/Steps/_Files/Csv/enclosure.csv
================================================
123,?Kräftige Rindsuppe?,4.5
234,?Crispy Chicken Burger?,12
345,?Duett von Saibling und Forelle?,21
================================================
FILE: tests/Steps/_Files/Csv/escape.csv
================================================
123,"test %"escape%" test",test
123,"foo %"escape%" bar %"baz%" lorem",test
================================================
FILE: tests/Steps/_Files/Csv/separator.csv
================================================
123*"CoDerOtsch"*Christian*Olear*35
234*"g3n1u5"*Albert*Einstein*143
345*"sWiFtY"*Taylor*Swift*32
================================================
FILE: tests/Steps/_Files/Csv/with-column-headlines.csv
================================================
Stunde,Montag,Dienstag,Mittwoch,Donnerstag,Freitag
1,Mathematik,Deutsch,Englisch,Erdkunde,Politik
2,Sport,Deutsch,Englisch,Sport,Geschichte
3,Sport,"Religion (ev., kath.)",Kunst,,Kunst
================================================
FILE: tests/Steps/_Files/Html/basic.html
================================================
match 1
match 2
match 3
================================================
FILE: tests/Steps/_Files/Html/bookstore.html
================================================
Bookstore Example in HTML :)
Everyday Italian
Giada De Laurentiis
2005 - 30.00
Harry Potter
J K. Rowling
2005 - 29.99
XQuery Kick Start
James McGovern,
Per Bothner,
Kurt Cagle,
James Linn,
Vaidyanathan Nagarajan2003 - 49.99
Learning XML
Erik T. Ray
2003 - 39.95
================================================
FILE: tests/Steps/_Files/Html/event.html
================================================
Bookstore Example in HTML :)
================================================
FILE: tests/Steps/_Files/Xml/bookstore.xml
================================================
Everyday ItalianGiada De Laurentiis200530.00Harry PotterJ K. Rowling200529.99XQuery Kick StartJames McGovernPer BothnerKurt CagleJames LinnVaidyanathan Nagarajan200349.99Learning XMLErik T. Ray200339.95
================================================
FILE: tests/Steps/_Files/Xml/events.xml
================================================
Some MeetupSomewhere2023-01-14 20:00Sophisticated talk titleSuper MarioFun talkPrincess PeachAnother MeetupSomewhere else2023-01-21 19:00Join the dark sideWarioLet's goYoshi
================================================
FILE: tests/Steps/_Files/Xml/rss-with-bom.xml
================================================
Foo - Barhttps://www.example.com/somethinglorem ipsum dolor sitde-dehttps://www.example.com/story/1234567/foo-bar-baz?ref=rsshttps://www.example.com/story/1234567/foo-bar-baz?ref=rssFooBarSome titlelorem ipsum dolor sit ametMon, 08 May 2023 14:08:21 ZFoto: Foo/BarFoto: Foo/Bar
================================================
FILE: tests/Stores/JsonFileStoreTest.php
================================================
$value) {
$result->set($key, $value);
}
return $result;
}
it('saves Results to a JSON file', function () {
    $firstResult = helper_getResultWithJsonData(['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear']);

    $store = new JsonFileStore(__DIR__ . '/_files', 'test');

    // Reads the current content of the file the store writes to.
    $fileContent = fn() => file_get_contents($store->filePath());

    $store->store($firstResult);

    expect($fileContent())->toBe('[{"user":"otsch","firstname":"Christian","surname":"Olear"}]');

    $secondResult = helper_getResultWithJsonData(['user' => 'hader', 'firstname' => 'Josef', 'surname' => 'Hader']);

    $store->store($secondResult);

    expect($fileContent())->toBe(
        '[{"user":"otsch","firstname":"Christian","surname":"Olear"},' .
        '{"user":"hader","firstname":"Josef","surname":"Hader"}]',
    );

    $thirdResult = helper_getResultWithJsonData(['user' => 'evamm', 'firstname' => 'Eva Maria', 'surname' => 'Maier']);

    $store->store($thirdResult);

    expect($fileContent())->toBe(
        '[{"user":"otsch","firstname":"Christian","surname":"Olear"},' .
        '{"user":"hader","firstname":"Josef","surname":"Hader"},' .
        '{"user":"evamm","firstname":"Eva Maria","surname":"Maier"}]',
    );
});
afterAll(function () {
    $dir = __DIR__ . '/_files';

    if (!file_exists($dir)) {
        return;
    }

    $entries = scandir($dir);

    if (!is_array($entries)) {
        return;
    }

    foreach ($entries as $entry) {
        // scandir() also lists "." and ".."; they are skipped explicitly, like in the check for the file extension.
        $isDotEntry = $entry === '.' || $entry === '..';

        if (!$isDotEntry && str_ends_with($entry, '.json')) {
            // Errors when deleting a file are suppressed.
            @unlink($dir . '/' . $entry);
        }
    }
});
================================================
FILE: tests/Stores/SimpleCsvFileStoreTest.php
================================================
$value) {
$result->set($key, $value);
}
return $result;
}
// The header row is written once; every further store() call appends exactly one data row.
it('saves Results to a csv file', function () {
    $firstResult = helper_getResultWithData(['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear']);

    $store = new SimpleCsvFileStore(__DIR__ . '/_files', 'test');

    $store->store($firstResult);

    $contentAfterFirst = file_get_contents($store->filePath());

    expect($contentAfterFirst)->toBe("user,firstname,surname\notsch,Christian,Olear\n");

    $secondResult = helper_getResultWithData(['user' => 'hader', 'firstname' => 'Josef', 'surname' => 'Hader']);

    $store->store($secondResult);

    $contentAfterSecond = file_get_contents($store->filePath());

    expect($contentAfterSecond)->toBe(
        "user,firstname,surname\notsch,Christian,Olear\nhader,Josef,Hader\n",
    );

    $thirdResult = helper_getResultWithData(['user' => 'evamm', 'firstname' => 'Eva Maria', 'surname' => 'Maier']);

    $store->store($thirdResult);

    $contentAfterThird = file_get_contents($store->filePath());

    // A value containing a space ("Eva Maria") is expected to be enclosed in double quotes.
    expect($contentAfterThird)->toBe(
        "user,firstname,surname\notsch,Christian,Olear\nhader,Josef,Hader\nevamm,\"Eva Maria\",Maier\n",
    );
});
// Array values end up in a single CSV cell, joined with " | ".
test('if the value of a result property is an array, it concatenates the values separated with a pipe', function () {
    $firstResult = helper_getResultWithData(['col1' => 'foo', 'col2' => ['bar', 'baz', 'quz']]);

    $store = new SimpleCsvFileStore(__DIR__ . '/_files', 'test2');

    $store->store($firstResult);

    $contentAfterFirst = file_get_contents($store->filePath());

    expect($contentAfterFirst)->toBe("col1,col2\nfoo,\"bar | baz | quz\"\n");

    $secondResult = helper_getResultWithData(['col1' => 'Donald', 'col2' => ['Tick', 'Trick', 'Track']]);

    $store->store($secondResult);

    $contentAfterSecond = file_get_contents($store->filePath());

    expect($contentAfterSecond)->toBe(
        "col1,col2\nfoo,\"bar | baz | quz\"\nDonald,\"Tick | Trick | Track\"\n",
    );
});
// Cleanup: delete every *.csv file the tests wrote to the _files directory.
afterAll(function () {
    $dir = __DIR__ . '/_files';

    if (!file_exists($dir)) {
        return;
    }

    $entries = scandir($dir);

    if (!is_array($entries)) {
        return;
    }

    foreach ($entries as $entry) {
        if ($entry !== '.' && $entry !== '..' && str_ends_with($entry, '.csv')) {
            unlink($dir . '/' . $entry);
        }
    }
});
================================================
FILE: tests/Stores/_files/.gitkeep
================================================
================================================
FILE: tests/UserAgents/BotUserAgentTest.php
================================================
assertStringContainsString('SomeBot', $userAgent);
});
// The static factory must produce a user agent containing the given product token.
test('Create UserAgent instance via static make method', function () {
    $botUserAgent = BotUserAgent::make('CrwlrBot');

    $this->assertStringContainsString('CrwlrBot', $botUserAgent);
});

// The info URI is expected after the product token, prefixed with "; +".
test('Create instance with info uri', function () {
    $botUserAgent = new BotUserAgent('SomeBot', 'https://www.example.com/somebot');

    $this->assertStringContainsString('SomeBot; +https://www.example.com/somebot', $botUserAgent);
});

// The version is expected directly after the product token, separated by a slash.
test('Create instance with info uri and version', function () {
    $botUserAgent = new BotUserAgent('SomeBot', 'https://www.example.com/somebot', '1.3');

    $this->assertStringContainsString('SomeBot/1.3; +https://www.example.com/somebot', $botUserAgent);
});

// Without an info URI, the closing parenthesis directly follows the version.
test('Create instance with version but without info uri', function () {
    $botUserAgent = new BotUserAgent('SomeBot', version: '1.3');

    $this->assertStringContainsString('SomeBot/1.3)', $botUserAgent);
});

test('User agent string starts with Mozilla/5.0', function () {
    $botUserAgent = new BotUserAgent('ExampleBot', 'https://www.example.com/bot', '2.0');

    $userAgentString = $botUserAgent->__toString();

    expect($userAgentString)->toStartWith('Mozilla/5.0');
});
================================================
FILE: tests/UserAgents/UserAgentTest.php
================================================
__toString())->toBe($string);
},
)->with([
'',
'Foo',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 ' .
'Safari/537.36',
'%$§$!")(=aäöüäö?ßß``2304980=)(§$/&!"=)=',
]);
================================================
FILE: tests/Utils/GzipTest.php
================================================
not->toBe($string)
->and(strlen($compressed))->toBeLessThan(strlen($string));
});
// Round trip: the encoded value must differ from the input, and decoding must restore the input.
it('decodes a string', function () {
    $original = 'Hello World!';

    $encoded = Gzip::encode($original);

    expect($encoded)->not->toBe($original);

    expect(Gzip::decode($encoded))->toBe($original);
});
// Decoding a string that is not gzip encoded must return it unchanged, without triggering a PHP warning.
it('does not generate a warning, when string to decode actually isn\'t encoded', function () {
    $warnings = [];

    // Collect warnings raised while decoding. Returning false lets PHP's standard error handling continue.
    set_error_handler(function ($errno, $errstr) use (&$warnings) {
        if ($errno === E_WARNING) {
            $warnings[] = $errstr;
        }

        return false;
    });

    try {
        $decoded = Gzip::decode('Hello World!');
    } finally {
        // Always remove the temporary handler, even when decode() throws, so it can't leak into other tests.
        restore_error_handler();
    }

    expect($decoded)->toBe('Hello World!')
        ->and($warnings)->toBeEmpty();
});
================================================
FILE: tests/Utils/HttpHeadersTest.php
================================================
'de',
'Accept-Encoding' => ['gzip', 'deflate', 'br'],
]))->toBe([
'Accept-Language' => ['de'],
'Accept-Encoding' => ['gzip', 'deflate', 'br'],
]);
});
// For a header present in both arrays, the expected output contains each value once ('de' is not duplicated).
// Headers only present in the second array are appended.
it('merges two header arrays', function () {
    $baseHeaders = [
        'Accept-Language' => ['de'],
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
    ];

    $headersToMerge = [
        'Accept' => ['text/html', 'application/xhtml+xml', 'application/xml'],
        'Accept-Language' => ['de', 'en'],
    ];

    $merged = HttpHeaders::merge($baseHeaders, $headersToMerge);

    expect($merged)->toBe([
        'Accept-Language' => ['de', 'en'],
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
        'Accept' => ['text/html', 'application/xhtml+xml', 'application/xml'],
    ]);
});
// addTo() with a single string value appends it to the existing values of that header.
it('adds a single value to a certain header in a headers array', function () {
    $existingHeaders = ['Accept-Language' => ['de']];

    $updated = HttpHeaders::addTo($existingHeaders, 'Accept-Language', 'en');

    expect($updated)->toBe(['Accept-Language' => ['de', 'en']]);
});

// addTo() with an array of values appends all of them, keeping their order.
it('adds an array of values to a certain header in a headers array', function () {
    $existingHeaders = ['Accept-Language' => ['de']];

    $updated = HttpHeaders::addTo($existingHeaders, 'Accept-Language', ['en-US', 'en']);

    expect($updated)->toBe(['Accept-Language' => ['de', 'en-US', 'en']]);
});

// A header name that is not in the array yet is added as a new entry.
it('adds the header when calling addTo() with a header name that the array does not contain yet', function () {
    $existingHeaders = ['Accept-Encoding' => ['gzip', 'deflate', 'br']];

    $updated = HttpHeaders::addTo($existingHeaders, 'Accept-Language', ['de', 'en']);

    expect($updated)->toBe([
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
        'Accept-Language' => ['de', 'en'],
    ]);
});
================================================
FILE: tests/Utils/OutputTypeHelperTest.php
================================================
'bar', 'baz'];
}
};
expect(OutputTypeHelper::objectToArray($object))->toBe(['foo' => 'bar', 'baz']);
});
// An object exposing toArray() is converted using that method's return value.
it('converts an object with a toArray() method to an array', function () {
    $subject = new class {
        /**
         * @return string[]
         */
        public function toArray(): array
        {
            return ['foo' => 'bar'];
        }
    };

    $converted = OutputTypeHelper::objectToArray($subject);

    expect($converted)->toBe(['foo' => 'bar']);
});

// An object exposing __serialize() is converted using that method's return value.
it('converts an object with a __serialize() method to an array', function () {
    $subject = new class {
        public function __serialize(): array
        {
            return ['winnie' => 'the pooh'];
        }
    };

    $converted = OutputTypeHelper::objectToArray($subject);

    expect($converted)->toBe(['winnie' => 'the pooh']);
});

// Without any conversion method, the public properties make up the resulting array.
it('converts an object to an array by just casting it', function () {
    $subject = new class {
        public string $foo = 'one';

        public string $bar = 'two';
    };

    $converted = OutputTypeHelper::objectToArray($subject);

    expect($converted)->toBe(['foo' => 'one', 'bar' => 'two']);
});
// Each dataset row: [value to check, expected return value of isScalar()].
it('checks if a value is a scalar value', function (mixed $candidate, bool $expected) {
    $actual = OutputTypeHelper::isScalar($candidate);

    expect($actual)->toBe($expected);
})->with([
    ['foo', true],
    [123, true],
    [true, true],
    [false, true],
    [1.23, true],
    [['foo', 'bar'], true], // only associative array counts as non scalar for the output types
    [['foo' => 'bar'], false],
    [new stdClass(), false],
]);

// Each dataset row: [value to check, expected return value of isAssociativeArray()].
it('checks if a value is an associative array', function (mixed $candidate, bool $expected) {
    $actual = OutputTypeHelper::isAssociativeArray($candidate);

    expect($actual)->toBe($expected);
})->with([
    ['foo', false],
    [['foo', 'bar'], false],
    [['foo' => 'bar'], true],
    [new stdClass(), false],
]);

// Each dataset row: [value to check, expected return value of isAssociativeArrayOrObject()].
it(
    'checks if a value is an associative array or object (a.k.a. non-scalar)',
    function (mixed $candidate, bool $expected) {
        $actual = OutputTypeHelper::isAssociativeArrayOrObject($candidate);

        expect($actual)->toBe($expected);
    },
)->with([
    ['foo', false],
    [['foo', 'bar'], false],
    [['foo' => 'bar'], true],
    [new stdClass(), true],
]);
================================================
FILE: tests/Utils/RequestKeyTest.php
================================================
'gzip, deflate, br']);
expect(RequestKey::from($request))->toBe('fc2a9e78c97e68674201853cea4a3d74');
$request = $request->withAddedHeader('accept-language', 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7');
expect(RequestKey::from($request))->not()->toBe('fc2a9e78c97e68674201853cea4a3d74');
});
// RequestKey::from() also accepts a RespondedRequest and must yield the pinned key for it.
it('makes a cache key from a RespondedRequest object', function () {
    $request = new Request('GET', 'https://www.crwl.io/en/home', ['accept-encoding' => 'gzip, deflate, br']);

    $respondedRequest = new RespondedRequest($request, new Response());

    $key = RequestKey::from($respondedRequest);

    expect($key)->toBe('08bcc643c9fb21af5e4f3361243e2220');
});
// Two otherwise identical requests, one with and one without a Cookie header, must get the same key.
test('when creating the key it ignores cookies in the sent headers by default', function () {
    $requestWithoutCookie = new Request(
        'GET',
        'https://www.crwlr.software/packages',
        ['accept-encoding' => 'gzip, deflate, br'],
    );

    $keyWithoutCookie = RequestKey::from($requestWithoutCookie);

    $requestWithCookie = new Request('GET', 'https://www.crwlr.software/packages', [
        'accept-encoding' => 'gzip, deflate, br',
        'Cookie' => 'cookieName=v4lu3',
    ]);

    $keyWithCookie = RequestKey::from($requestWithCookie);

    expect($keyWithCookie)->toBe($keyWithoutCookie);
});
// Headers listed in the second argument are left out of the key. The second half of the test shows
// that the header name is matched regardless of case ('Accept-Encoding' vs. 'accept-encoding').
it('also ignores other headers when provided in second parameter', function () {
    $lowerCaseHeaderRequest = new Request('GET', 'https://www.example.com', ['accept-encoding' => 'gzip, deflate, br']);

    $keyWithAcceptEncodingHeader = RequestKey::from($lowerCaseHeaderRequest);

    $keyWithoutAcceptEncodingHeader = RequestKey::from($lowerCaseHeaderRequest, ['accept-encoding']);

    expect($keyWithAcceptEncodingHeader)->not()->toBe($keyWithoutAcceptEncodingHeader);

    $mixedCaseHeaderRequest = new Request('GET', 'https://www.example.com', ['Accept-Encoding' => 'gzip']);

    $anotherKeyWithoutAcceptEncodingHeader = RequestKey::from($mixedCaseHeaderRequest, ['accept-encoding']);

    expect($keyWithoutAcceptEncodingHeader)->toBe($anotherKeyWithoutAcceptEncodingHeader);
});
================================================
FILE: tests/Utils/TemplateStringTest.php
================================================
'foo',
'asdf' => 'asdf',
'var' => 'yolo',
'asdf\'asdf' => 'replace',
'qu"z' => 'double',
]);
expect($replaced)->toBe(
<< 'bonjour', 'two' => 'ciao'],
),
)->toBe('hi bonjour/ciao bye');
});
================================================
FILE: tests/_Integration/GroupTest.php
================================================
input('http://localhost:8000/blog-post-with-json-ld');
$crawler
->addStep(Http::get())
->addStep(
Crawler::group()
->addStep(
Html::first('#content article.blog-post')
->extract(['title' => 'h1', 'date' => '.date']),
)
->addStep(
Html::schemaOrg()
->onlyType('BlogPosting')
->extract([
'author' => 'author.name',
'keywords',
]),
)
->keep(),
);
$result = helper_generatorToArray($crawler->run());
expect($result[0]->toArray())->toBe([
'title' => 'Prevent Homograph Attacks using the crwlr/url Package',
'date' => '2022-01-19',
'author' => 'Christian Olear',
'keywords' => 'homograph, attack, security, idn, internationalized domain names, prevention, url, uri',
]);
},
);
================================================
FILE: tests/_Integration/Http/CharsetTest.php
================================================
input('http://localhost:8000/non-utf-8-charset')
->addStep(Http::get())
->addStep(Html::root()->extract(['foo' => '.element']));
$results = helper_generatorToArray($crawler->run());
expect($results)->toHaveCount(1)
->and($results[0]->toArray())->toBe(['foo' => '0 l/m²']);
});
================================================
FILE: tests/_Integration/Http/CrawlingTest.php
================================================
robotsTxtHandler = new class ($this, $this->logger) extends RobotsTxtHandler {
/**
 * Test override: the path "/not-allowed" is always disallowed, everything else is delegated to the
 * parent implementation.
 */
public function isAllowed(UriInterface|Url|string $url): bool
{
    // Strings and PSR-7 URIs are converted to a Url instance first, so the path can be inspected.
    if (!$url instanceof Url) {
        $url = Url::parse($url);
    }

    return $url->path() !== '/not-allowed' && parent::isAllowed($url);
}
};
}
/**
 * Records the URL of every load call in $this->loadedUrls and then delegates to the parent loader.
 */
public function load(mixed $subject): ?RespondedRequest
{
    // The URL is recorded before the actual loading happens.
    $this->loadedUrls[] = (string) $this->validateSubjectType($subject)->getUri();

    return parent::load($subject);
}
}
/**
 * Crawler used to check whether crawling stays on the same host or the same domain.
 *
 * The PSR-18 HTTP client of this crawler's loader replaces host and port in the request URI with
 * localhost:8000 just before sending the request. The loader thinks it actually loaded the page from
 * the incoming URI, and the returned RespondedRequest object also has that original URI as effectiveUri
 * (except if the requested page redirects).
 */
class Crawler extends HttpCrawler
{
    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): TestLoader
    {
        // PSR-18 client that redirects every request to the local test server.
        $localhostClient = new class implements ClientInterface {
            private Client $httpClient;

            public function __construct()
            {
                $this->httpClient = new Client();
            }

            public function sendRequest(RequestInterface $request): ResponseInterface
            {
                $localUri = $request->getUri()
                    ->withHost('localhost')
                    ->withPort(8000);

                return $this->httpClient->sendRequest($request->withUri($localUri));
            }
        };

        $testLoader = new TestLoader($userAgent, $localhostClient, $logger);

        // To not slow down tests unnecessarily
        $testLoader->throttle()
            ->waitBetween(new MultipleOf(0.0001), new MultipleOf(0.0002))
            ->waitAtLeast(Microseconds::fromSeconds(0.0001));

        return $testLoader;
    }

    protected function userAgent(): UserAgentInterface
    {
        return new UserAgent('SomeUserAgent');
    }

    /**
     * Only overridden to narrow the return type to TestLoader, so phpstan doesn't complain.
     */
    public function getLoader(): TestLoader
    {
        return parent::getLoader(); // @phpstan-ignore-line
    }
}
/** @var TestCase $this */
// Without further configuration, the link to the subdomain foo.example.com must not be loaded.
it('stays on the same host by default', function () {
    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/main')
        ->addStep(Http::crawl());

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    expect($loadedUrls)->not()->toContain('http://foo.example.com/crawling/main-on-subdomain');
});

// With sameDomain(), the subdomain link is loaded, but a link to a different domain is not.
it('stays on the same domain when method sameDomain() is called', function () {
    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/main')
        ->addStep(Http::crawl()->sameDomain());

    $crawler->runAndTraverse();

    expect($crawler->getLoader()->loadedUrls)->toContain('http://foo.example.com/crawling/main-on-subdomain');

    expect($crawler->getLoader()->loadedUrls)->not()->toContain('https://www.crwlr.software/packages/crawler');
});

// Calling sameHost() after sameDomain() switches back to the same-host behavior.
it('stays on the same host when method sameHost() is called', function () {
    $crawlStep = Http::crawl()
        ->sameDomain()
        ->sameHost();

    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/main')
        ->addStep($crawlStep);

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    expect($loadedUrls)->not()->toContain('http://foo.example.com/crawling/main-on-subdomain');
});
// Starting from /crawling/main, exactly the six pages listed below are expected to be loaded.
it('crawls every page of a website that is linked somewhere', function () {
    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/main')
        ->addStep(Http::crawl());

    $crawler->runAndTraverse();

    expect($crawler->getLoader()->loadedUrls)->toHaveCount(6);

    $expectedUrls = [
        'http://www.example.com/crawling/main',
        'http://www.example.com/crawling/sub1',
        'http://www.example.com/crawling/sub1/sub1',
        'http://www.example.com/crawling/sub2',
        'http://www.example.com/crawling/sub2/sub1',
        'http://www.example.com/crawling/sub2/sub1/sub1',
    ];

    foreach ($expectedUrls as $expectedUrl) {
        expect($crawler->getLoader()->loadedUrls)->toContain($expectedUrl);
    }
});
// depth(1) is expected to load 3 URLs, depth(2) is expected to load 5.
it('crawls only to a certain depth when the crawl depth is defined', function () {
    $depthOneCrawler = (new Crawler())
        ->input('http://www.example.com/crawling/main')
        ->addStep(Http::crawl()->depth(1));

    $depthOneCrawler->runAndTraverse();

    $urlsLoadedWithDepthOne = $depthOneCrawler->getLoader()->loadedUrls;

    expect($urlsLoadedWithDepthOne)->toHaveCount(3);

    $depthTwoCrawler = (new Crawler())
        ->input('http://www.example.com/crawling/main')
        ->addStep(Http::crawl()->depth(2));

    $depthTwoCrawler->runAndTraverse();

    $urlsLoadedWithDepthTwo = $depthTwoCrawler->getLoader()->loadedUrls;

    expect($urlsLoadedWithDepthTwo)->toHaveCount(5);
});
// With inputIsSitemap(), 7 URLs are expected to be loaded in total.
it('extracts URLs from a sitemap if you call method inputIsSitemap()', function () {
    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/sitemap.xml')
        ->addStep(Http::crawl()->inputIsSitemap());

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    expect($loadedUrls)->toHaveCount(7);
});

// Without inputIsSitemap(), only the sitemap itself is loaded and no further URLs are found.
it('fails to extract URLs if you provide a sitemap as input and don\'t call inputIsSitemap()', function () {
    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/sitemap.xml')
        ->addStep(Http::crawl());

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    expect($loadedUrls)->toHaveCount(1);
});

// Same expectation as for the regular sitemap, but with the sitemap2.xml fixture.
it(
    'extracts URLs from a sitemap where the tag contains attributes that cause symfony DomCrawler to fail',
    function () {
        $crawler = (new Crawler())
            ->input('http://www.example.com/crawling/sitemap2.xml')
            ->addStep(Http::crawl()->inputIsSitemap());

        $crawler->runAndTraverse();

        $loadedUrls = $crawler->getLoader()->loadedUrls;

        expect($loadedUrls)->toHaveCount(7);
    },
);
// Besides the sitemap itself (the input), only URLs with a path starting with /crawling/sub1 are loaded.
it('loads only pages where the path starts with a certain string when method pathStartsWith() is called', function () {
    $crawlStep = Http::crawl()
        ->inputIsSitemap()
        ->pathStartsWith('/crawling/sub1');

    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/sitemap.xml')
        ->addStep($crawlStep);

    $crawler->runAndTraverse();

    expect($crawler->getLoader()->loadedUrls)->toHaveCount(3);

    expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sitemap.xml');

    expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1');

    expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1');
});

// The regex matches exactly /crawling/sub1 and /crawling/sub2; together with the sitemap that makes 3 loaded URLs.
it('loads only URLs where the path matches a regex when method pathMatches() is used', function () {
    $crawlStep = Http::crawl()
        ->inputIsSitemap()
        ->pathMatches('/^\/crawling\/sub[12]$/');

    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/sitemap.xml')
        ->addStep($crawlStep);

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    expect($loadedUrls)->toHaveCount(3);
});
// The custom filter lets three paths pass; together with the sitemap (the input) that makes 4 loaded URLs.
it('loads only URLs where the Closure passed to method customFilter() returns true', function () {
    $crawlStep = Http::crawl()
        ->inputIsSitemap()
        ->customFilter(function (Url $url) {
            $allowedPaths = [
                '/crawling/main',
                '/crawling/sub1/sub1',
                '/crawling/sub2/sub1/sub1',
            ];

            return in_array($url->path(), $allowedPaths, true);
        });

    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/sitemap.xml')
        ->addStep($crawlStep);

    $crawler->runAndTraverse();

    expect($crawler->getLoader()->loadedUrls)->toHaveCount(4);

    expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/main');

    expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1');

    expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1/sub1');
});
// The filter only accepts links whose text contains "Subpage 2", so only the sub2 branch is crawled.
it(
    'receives the link element where the URL was found, as second param in the Closure passed to method ' .
    'customFilter() when it was found in an HTML document',
    function () {
        $crawlStep = Http::crawl()
            ->customFilter(function (Url $url, ?HtmlElement $linkElement) {
                return $linkElement && str_contains($linkElement->text(), 'Subpage 2');
            });

        $crawler = (new Crawler())
            ->input('http://www.example.com/crawling/main')
            ->addStep($crawlStep);

        $crawler->runAndTraverse();

        expect($crawler->getLoader()->loadedUrls)->toHaveCount(4);

        $expectedUrls = [
            'http://www.example.com/crawling/main',
            'http://www.example.com/crawling/sub2',
            'http://www.example.com/crawling/sub2/sub1',
            'http://www.example.com/crawling/sub2/sub1/sub1',
        ];

        foreach ($expectedUrls as $expectedUrl) {
            expect($crawler->getLoader()->loadedUrls)->toContain($expectedUrl);
        }
    },
);
// With loadAllButYieldOnlyMatching(), all 7 URLs are loaded, but only the 3 matching ones become results.
it(
    'loads all pages, but yields only responses where the URL path starts with a certain string, when methods ' .
    'pathStartsWith() and loadAllButYieldOnlyMatching() are called',
    function () {
        $crawlStep = Http::crawl()
            ->inputIsSitemap()
            ->pathStartsWith('/crawling/sub2')
            ->loadAllButYieldOnlyMatching();

        $crawler = (new Crawler())
            ->input('http://www.example.com/crawling/sitemap.xml')
            ->addStep($crawlStep);

        $results = helper_generatorToArray($crawler->run());

        expect($crawler->getLoader()->loadedUrls)->toHaveCount(7);

        expect($results)->toHaveCount(3);
    },
);

// Same as above with a regex filter: 7 URLs loaded, 2 results (/crawling/sub1 and /crawling/sub2).
it(
    'loads all URLs, but yields only responses where the URL path matches a regex, when methods pathMatches() and ' .
    'loadAllButYieldOnlyMatching() are called',
    function () {
        $crawlStep = Http::crawl()
            ->inputIsSitemap()
            ->pathMatches('/^\/crawling\/sub[12]$/')
            ->loadAllButYieldOnlyMatching();

        $crawler = (new Crawler())
            ->input('http://www.example.com/crawling/sitemap.xml')
            ->addStep($crawlStep);

        $results = helper_generatorToArray($crawler->run());

        expect($crawler->getLoader()->loadedUrls)->toHaveCount(7);

        expect($results)->toHaveCount(2);
    },
);

// Same as above with a custom filter closure: 7 URLs loaded, 3 results.
it(
    'loads all URLs but yields only responses where the Closure passed to method customFilter() returns true, when ' .
    'methods customFilter() and loadAllButYieldOnlyMatching() are called',
    function () {
        $crawlStep = Http::crawl()
            ->inputIsSitemap()
            ->customFilter(function (Url $url) {
                $allowedPaths = [
                    '/crawling/main',
                    '/crawling/sub1/sub1',
                    '/crawling/sub2/sub1/sub1',
                ];

                return in_array($url->path(), $allowedPaths, true);
            })
            ->loadAllButYieldOnlyMatching();

        $crawler = (new Crawler())
            ->input('http://www.example.com/crawling/sitemap.xml')
            ->addStep($crawlStep);

        $results = helper_generatorToArray($crawler->run());

        expect($crawler->getLoader()->loadedUrls)->toHaveCount(7);

        expect($results)->toHaveCount(3);
    },
);
it(
    'keeps the fragment parts in URLs and treats the same URL with a different fragment part as separate URLs when ' .
    'keepUrlFragment() was called',
    function () {
        // Background: URLs that differ only in the fragment part (#something) will almost always respond with
        // the same content. To avoid loading the same page multiple times, the step drops the fragment part of
        // discovered URLs by default. keepUrlFragment() turns that off.
        $crawler = (new Crawler())
            ->input('http://www.example.com/crawling/main')
            ->addStep(Http::crawl()->keepUrlFragment()->keep(['url']));

        $results = helper_generatorToArray($crawler->run());

        expect($results)->toHaveCount(8);

        $resultUrls = array_map(fn($result) => $result->get('url'), $results);

        expect($resultUrls)->toContain('http://www.example.com/crawling/sub2');

        expect($resultUrls)->toContain('http://www.example.com/crawling/sub2#fragment1');

        expect($resultUrls)->toContain('http://www.example.com/crawling/sub2#fragment2');
    },
);
// With maxOutputs(4), 4 results are expected and also only 4 URLs are expected to have been loaded.
it('stops crawling when maxOutputs is reached', function () {
    $crawlStep = Http::crawl()
        ->keepUrlFragment()
        ->maxOutputs(4);

    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/main')
        ->addStep($crawlStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(4);

    expect($crawler->getLoader()->loadedUrls)->toHaveCount(4);
});
// Compares the URLs in the results (canonical URLs) with the URLs that were actually requested.
it('uses canonical links when useCanonicalLinks() is called', function () {
    $crawlStep = Http::crawl()
        ->useCanonicalLinks()
        ->keep(['url']);

    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/main')
        ->addStep($crawlStep);

    $results = helper_generatorToArray($crawler->run());

    $resultUrls = array_map(fn(Result $result) => $result->get('url'), $results);

    expect($resultUrls)->toBe([
        'http://www.example.com/crawling/main',
        'http://www.example.com/crawling/sub1/sub1', // actual loaded url was sub1, but canonical is sub1/sub1
        'http://www.example.com/crawling/sub2',
        'http://www.example.com/crawling/sub2/sub1/sub1',
    ]);

    expect($crawler->getLoader()->loadedUrls)->toBe([
        'http://www.example.com/crawling/main',
        'http://www.example.com/crawling/sub1', // => /crawling/sub1/sub1 => this URL wasn't loaded yet,
        'http://www.example.com/crawling/sub2', // so when the link is discovered it won't load it.
        'http://www.example.com/crawling/sub2/sub1', // => /crawling/sub1/sub1 => this URL was already loaded,
        'http://www.example.com/crawling/sub2/sub1/sub1', // so the response is not yielded as a separate result.
    ]);
});
// The results must contain the /crawling/main URL but not the /crawling/redirect input URL, and the
// output must contain the "already loaded" log message.
it('does not yield the same page twice when a URL was redirected to an already loaded page', function () {
    $crawler = (new Crawler())
        ->input('http://www.example.com/crawling/redirect')
        ->addStep(Http::crawl()->keep(['url']));

    $results = helper_generatorToArray($crawler->run());

    $resultUrls = array_map(fn(Result $result) => $result->get('url'), $results);

    expect($resultUrls)->toContain('http://www.example.com/crawling/main');

    expect($resultUrls)->not()->toContain('http://www.example.com/crawling/redirect');

    expect($this->getActualOutputForAssertion())
        ->toContain('Was already loaded before. Do not process this page again.');
});
// The input path /not-allowed is rejected by the test loader's robots.txt handler override.
// The run must finish without results.
it('does not produce a fatal error when the initial request fails', function () {
    $crawlStep = Http::crawl()->keep(['url']);

    $crawler = (new Crawler())
        ->input('http://www.example.com/not-allowed')
        ->addStep($crawlStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(0);
});
================================================
FILE: tests/_Integration/Http/ErrorResponsesTest.php
================================================
inputs(['http://localhost:8000/client-error-response'])
->addStep(Http::{$method}()->keepAs('response'));
$results = helper_generatorToArray($crawler->run());
expect($results)->toBeEmpty();
})->with(['get', 'post', 'put', 'patch', 'delete']);
// Runs once per HTTP method: by default, a server error response produces no result.
it('does not yield server error responses by default', function (string $method) {
    $crawler = new ErrorCrawler();

    $httpStep = Http::{$method}()->keepAs('response');

    $crawler->inputs(['http://localhost:8000/server-error-response'])
        ->addStep($httpStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toBeEmpty();
})->with(['get', 'post', 'put', 'patch', 'delete']);

// Runs once per HTTP method: with yieldErrorResponses(), the client error response becomes a result.
it('yields client error responses when yieldErrorResponses() was called', function (string $method) {
    $crawler = new ErrorCrawler();

    $httpStep = Http::{$method}()->yieldErrorResponses()->keepAs('response');

    $crawler->inputs(['http://localhost:8000/client-error-response'])
        ->addStep($httpStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);
})->with(['get', 'post', 'put', 'patch', 'delete']);

// Runs once per HTTP method: with yieldErrorResponses(), the server error response becomes a result.
it('yields server error responses when yieldErrorResponses() was called', function (string $method) {
    $crawler = new ErrorCrawler();

    $httpStep = Http::{$method}()->yieldErrorResponses()->keepAs('response');

    $crawler->inputs(['http://localhost:8000/server-error-response'])
        ->addStep($httpStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);
})->with(['get', 'post', 'put', 'patch', 'delete']);
// Two inputs: the error URL yields nothing, the second URL is still processed and yields one result.
it(
    'goes on crawling after a client error response when stopOnErrorResponse() wasn\'t called',
    function (string $method) {
        $crawler = new ErrorCrawler();

        $inputUrls = ['http://localhost:8000/client-error-response', 'http://localhost:8000/simple-listing'];

        $crawler->inputs($inputUrls)
            ->addStep(Http::{$method}()->keepAs('response'));

        $results = helper_generatorToArray($crawler->run());

        expect($results)->toHaveCount(1);
    },
)->with(['get', 'post', 'put', 'patch', 'delete']);

// Same scenario as above, with a server error response as the first input.
it(
    'goes on crawling after a server error response when stopOnErrorResponse() wasn\'t called',
    function (string $method) {
        $crawler = new ErrorCrawler();

        $inputUrls = ['http://localhost:8000/server-error-response', 'http://localhost:8000/simple-listing'];

        $crawler->inputs($inputUrls)
            ->addStep(Http::{$method}()->keepAs('response'));

        $results = helper_generatorToArray($crawler->run());

        expect($results)->toHaveCount(1);
    },
)->with(['get', 'post', 'put', 'patch', 'delete']);
// Runs once per HTTP method: with stopOnErrorResponse(), the run must end with a LoadingException.
it(
    'stops crawling (throws exception) after a client error response when the stopOnErrorResponse() method was called',
    function (string $method) {
        $crawler = new ErrorCrawler();

        $inputUrls = ['http://localhost:8000/client-error-response', 'http://localhost:8000/simple-listing'];

        $crawler->inputs($inputUrls)
            ->addStep(Http::{$method}()->stopOnErrorResponse());

        $crawler->runAndTraverse();
    },
)->with(['get', 'post', 'put', 'patch', 'delete'])->throws(LoadingException::class);
// Runs once per HTTP method: with stopOnErrorResponse(), a server error response must end the run with
// a LoadingException.
it(
    'stops crawling (throws exception) after a server error response when the stopOnErrorResponse() method was called',
    function (string $method) {
        $crawler = new ErrorCrawler();

        // The first input has to be the server error endpoint. Requesting the client error endpoint here
        // would only repeat the client error test and leave the server error path untested.
        $crawler->inputs(['http://localhost:8000/server-error-response', 'http://localhost:8000/simple-listing'])
            ->addStep(
                Http::{$method}()
                    ->stopOnErrorResponse(),
            );

        $crawler->runAndTraverse();
    },
)->with(['get', 'post', 'put', 'patch', 'delete'])->throws(LoadingException::class);
// None of the logged messages may contain the "already called" hook warning.
it('does not log warnings about multiple loader hook calls when stopOnErrorResponse() is used', function () {
    $crawler = new ErrorCrawler();

    $inputUrls = ['http://localhost:8000/hello-world', 'http://localhost:8000/simple-listing'];

    $crawler->inputs($inputUrls)
        ->addStep(Http::get()->stopOnErrorResponse());

    $crawler->runAndTraverse();

    foreach ($crawler->getLogger()->messages as $logEntry) {
        expect($logEntry['message'])->not->toContain(' was already called in this load call.');
    }
});
================================================
FILE: tests/_Integration/Http/GzipTest.php
================================================
input('http://localhost:8000/gzip')
->addStep(Http::get()->keepAs('response'));
$results = helper_generatorToArray($crawler->run());
expect($results[0])->toBeInstanceOf(Result::class)
->and($results[0]->get('response'))->toBeInstanceOf(RespondedRequest::class)
->and(Http::getBodyString($results[0]->get('response')))->toBe('This is a gzip compressed string');
});
================================================
FILE: tests/_Integration/Http/HeadlessBrowserTest.php
================================================
useHeadlessBrowser();
return $loader;
}
}
/**
 * Test step: reads the text of the "body pre" element from the response's HTML body and yields it
 * decoded as JSON (associative array).
 */
class GetJsonFromResponseHtmlBody extends Step
{
    protected function invoke(mixed $input): Generator
    {
        $document = new HtmlDocument(Http::getBodyString($input->response));

        $preElement = $document->querySelector('body pre');

        // Falls back to an empty string when there is no matching element.
        $json = $preElement?->text() ?? '';

        yield json_decode($json, true);
    }
}
/**
 * Test step: yields the text content of the body element of the response's HTML body.
 */
class GetStringFromResponseHtmlBody extends Step
{
    protected function invoke(mixed $input): Generator
    {
        $document = new HtmlDocument(Http::getBodyString($input->response));

        $bodyElement = $document->querySelector('body');

        // Falls back to an empty string when there is no body element.
        yield $bodyElement?->text() ?? '';
    }
}
/**
 * Reads the loader's cookieJar property via invade() and returns all cookies it holds for the given domain.
 *
 * @return Cookie[]
 */
function helper_getCookiesByDomainFromLoader(HttpLoader $loader, string $domain): array
{
    /** @var CookieJar $jar */
    $jar = invade($loader)->cookieJar;

    return $jar->allByDomain($domain);
}
// The /print-headers endpoint is used to check which User-Agent header the browser sent.
it('automatically uses the Loader\'s user agent', function () {
    $crawler = new HeadlessBrowserCrawler();

    $crawler->input('http://localhost:8000/print-headers')
        ->addStep(Http::get())
        ->addStep((new GetJsonFromResponseHtmlBody())->keepAs('responseBody'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    $responseBody = $results[0]->get('responseBody');

    expect($responseBody)->toBeArray();

    expect($responseBody)->toHaveKey('User-Agent');

    expect($responseBody['User-Agent'])->toBe('HeadlessBrowserBot');
});

// After useNativeUserAgent(), the sent User-Agent must start with "Mozilla/5.0 (".
it(
    'does not use the user-agent defined in the crawler, when useNativeUserAgent() was called on the browser loader ' .
    'helper',
    function () {
        $crawler = new HeadlessBrowserCrawler();

        $crawler
            ->getLoader()
            ->browser()
            ->useNativeUserAgent();

        $crawler->input('http://localhost:8000/print-headers')
            ->addStep(Http::get())
            ->addStep((new GetJsonFromResponseHtmlBody())->keepAs('responseBody'));

        $results = helper_generatorToArray($crawler->run());

        expect($results)->toHaveCount(1);

        $responseBody = $results[0]->get('responseBody');

        expect($responseBody)->toBeArray();

        expect($responseBody)->toHaveKey('User-Agent');

        expect($responseBody['User-Agent'])->toStartWith('Mozilla/5.0 (');
    },
);
// First request: /set-cookie. Second request: /print-cookie, which must print the cookie value.
it('uses cookies', function () {
    $crawler = new HeadlessBrowserCrawler();

    // Intermediate step that ignores its input and yields the URL for the second request.
    $yieldPrintCookieUrl = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'http://localhost:8000/print-cookie';
        }
    };

    $crawler
        ->input('http://localhost:8000/set-cookie')
        ->addStep(Http::get())
        ->addStep($yieldPrintCookieUrl)
        ->addStep(Http::get())
        ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    $printedCookie = $results[0]->get('printed-cookie');

    expect($printedCookie)->toBeString();

    expect($printedCookie)->toBe('foo123');
});

// Same flow as above, but after dontUseCookies() the second request must print nothing.
it('does not use cookies when HttpLoader::dontUseCookies() was called', function () {
    $crawler = new HeadlessBrowserCrawler();

    $crawler->getLoader()->dontUseCookies();

    // Intermediate step that ignores its input and yields the URL for the second request.
    $yieldPrintCookieUrl = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'http://localhost:8000/print-cookie';
        }
    };

    $crawler
        ->input('http://localhost:8000/set-cookie')
        ->addStep(Http::get())
        ->addStep($yieldPrintCookieUrl)
        ->addStep(Http::get())
        ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    expect($results[0]->get('printed-cookie'))->toBeEmpty();
});
it('renders javascript', function () {
    $crawler = new HeadlessBrowserCrawler();

    $extractContent = Html::root()->extract(['content' => '#content p']);

    $crawler->input('http://localhost:8000/js-rendering')
        ->addStep(Http::get())
        ->addStep($extractContent);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    // The extracted paragraph is expected to contain the text, the page adds through javascript.
    $extracted = $results[0]->toArray();

    expect($extracted)->toBe([
        'content' => 'This was added through javascript',
    ]);
});
it('gets cookies that are set via javascript', function () {
    $cache = new FileCache(helper_cachedir());

    $cache->clear();

    // Runs a fresh crawler (sharing the same cache) against the page setting a cookie via javascript
    // and returns that crawler, so the cookie jar of its loader can be inspected.
    $crawlSetJsCookiePage = static function () use ($cache): HeadlessBrowserCrawler {
        $crawler = new HeadlessBrowserCrawler();

        $crawler->getLoader()->setCache($cache);

        $crawler
            ->input('http://localhost:8000/set-js-cookie')
            ->addStep(Http::get());

        helper_generatorToArray($crawler->run());

        return $crawler;
    };

    $firstCrawler = $crawlSetJsCookiePage();

    $cookiesInJar = helper_getCookiesByDomainFromLoader($firstCrawler->getLoader(), 'localhost');

    $testCookie = $cookiesInJar['testcookie'] ?? null;

    expect($cookiesInJar)->toHaveCount(1)
        ->and($testCookie?->name())->toBe('testcookie')
        ->and($testCookie?->value())->toBe('javascriptcookie');

    // Check that cookie is not added to the cookiejar when the response was served from cache.
    $secondCrawler = $crawlSetJsCookiePage();

    $cookiesInJar = helper_getCookiesByDomainFromLoader($secondCrawler->getLoader(), 'localhost');

    expect($cookiesInJar)->toHaveCount(0);
});
it('gets a cookie that is set via a click, executed via post browser navigate hook', function () {
    $crawler = new HeadlessBrowserCrawler();

    // After navigating to the page, click the #consent_btn element.
    $loadAndClickConsent = Http::get()
        ->postBrowserNavigateHook(BrowserAction::clickElement('#consent_btn'));

    // Ignores whatever it receives and always emits the URL of the cookie printing page.
    $toPrintCookiePage = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'http://localhost:8000/print-cookie';
        }
    };

    $crawler
        ->input('http://localhost:8000/set-delayed-js-cookie')
        ->addStep($loadAndClickConsent)
        ->addStep($toPrintCookiePage)
        ->addStep(Http::get())
        ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    $printedCookie = $results[0]->get('printed-cookie');

    expect($printedCookie)->toBeString()
        ->and($printedCookie)->toBe('javascriptcookie');

    // The cookie must also have ended up in the cookie jar of the loader.
    $cookiesInJar = helper_getCookiesByDomainFromLoader($crawler->getLoader(), 'localhost');

    $testCookie = $cookiesInJar['testcookie'] ?? null;

    expect($cookiesInJar)->toHaveCount(1)
        ->and($testCookie?->name())->toBe('testcookie')
        ->and($testCookie?->value())->toBe('javascriptcookie');
});
it(
    'sending cookies works correctly when the loader is not configured to use the browser but two steps use the ' .
    'browser by calling the useBrowser() method of Http steps',
    function () {
        $crawler = HttpCrawler::make()->withMozilla5CompatibleUserAgent();

        // Ignores whatever it receives and always emits the URL of the cookies printing page.
        $toPrintCookiesPage = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield 'http://localhost:8000/print-cookies';
            }
        };

        $crawler
            ->input('http://localhost:8000/set-multiple-js-cookies')
            ->addStep(Http::get()->useBrowser())
            ->addStep($toPrintCookiesPage)
            ->addStep(Http::get()->useBrowser())
            ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookies'));

        $results = helper_generatorToArray($crawler->run());

        expect($results)->toHaveCount(1);

        $printedCookies = $results[0]->get('printed-cookies');

        expect($printedCookies)->toBeString()
            ->and($printedCookies)->toBe('cookie3=cookie3value;cookie2=cookie2value;cookie1=cookie1value');

        $cookiesInJar = helper_getCookiesByDomainFromLoader($crawler->getLoader(), 'localhost');

        expect($cookiesInJar)->toHaveCount(3);

        // Every cookie's value is its name, suffixed with "value".
        foreach (['cookie1', 'cookie2', 'cookie3'] as $cookieName) {
            expect($cookiesInJar[$cookieName]->value())->toBe($cookieName . 'value');
        }
    },
);
// Runs a chain of browser actions as post browser navigate hooks against the /browser-actions page and checks
// the resulting HTML body and the two screenshots taken along the way. The screenshots are written to the storage
// dir, so that directory is reset when the test is done, no matter if the expectations passed or failed.
test(
    'BrowserAction::clickElement(), clickInsideShadowDom(), evaluate(), moveMouseToElement(), ' .
    'moveMouseToPosition(), scrollDown(), scrollUp() and typeText() work as expected',
    function () {
        $crawler = new HeadlessBrowserCrawler();

        $crawler
            ->getLoader()
            ->browser()
            ->includeShadowElementsInHtml();

        $crawler
            ->input('http://localhost:8000/browser-actions')
            ->addStep(
                Http::get()
                    // Inserting the #click_element is delayed in the page, so this also tests, that the
                    // BrowserAction::clickElement() action automatically waits for an element matching the selector
                    // to be present.
                    ->postBrowserNavigateHook(BrowserAction::clickElement('#click_element'))
                    ->postBrowserNavigateHook(BrowserAction::screenshot(ScreenshotConfig::make(helper_storagedir())))
                    ->postBrowserNavigateHook(BrowserAction::clickInsideShadowDom('#shadow_host', '#shadow_click_div'))
                    ->postBrowserNavigateHook(
                        BrowserAction::evaluate(
                            'document.getElementById(\'evaluation_container\').innerHTML = \'evaluated\'',
                        ),
                    )
                    ->postBrowserNavigateHook(BrowserAction::moveMouseToElement('#mouseover_check_1'))
                    ->postBrowserNavigateHook(BrowserAction::moveMouseToPosition(305, 405))
                    ->postBrowserNavigateHook(BrowserAction::scrollDown(4000))
                    ->postBrowserNavigateHook(
                        BrowserAction::screenshot(
                            ScreenshotConfig::make(helper_storagedir())
                                ->setImageFileType('jpeg')
                                ->setQuality(20)
                                ->setFullPage(),
                        ),
                    )
                    ->postBrowserNavigateHook(BrowserAction::scrollUp(2000))
                    ->postBrowserNavigateHook(BrowserAction::scrollUp(2000))
                    ->postBrowserNavigateHook(BrowserAction::clickElement('#input'))
                    ->postBrowserNavigateHook(BrowserAction::typeText('typing text works'))
                    ->keep(['body', 'screenshots']),
            );

        try {
            $results = helper_generatorToArray($crawler->run());

            $body = $results[0]->get('body');

            $screenshots = $results[0]->get('screenshots');

            expect($body)->toContain('
yes
')
                // This also tests the `HeadlessBrowserLoaderHelper::includeShadowElementsInHtml()` method,
                // because even if the click worked, with the normal way of getting HTML this wouldn't be
                // included in the returned HTML.
                ->and($body)->toContain('
clicked
')
                ->and($body)->toContain('
evaluated
')
                ->and($body)->toContain('
mouse was here
')
                ->and($body)->toContain('
mouse was here
')
                ->and($body)->toContain('
scrolled down
')
                ->and($body)->toContain('
scrolled up
')
                ->and($body)->toContain('
typing text works
')
                ->and($screenshots)->toHaveCount(2)
                ->and($screenshots[0])->toEndWith('.png')
                ->and($screenshots[1])->toEndWith('.jpeg');

            // The image dimensions can only be checked when getimagesize() is available and is able to read
            // both files. The first screenshot is a viewport one, the second one a full page screenshot.
            if (function_exists('getimagesize')) {
                $screenshot1Size = getimagesize($screenshots[0]);

                $screenshot2Size = getimagesize($screenshots[1]);

                if (is_array($screenshot1Size) && is_array($screenshot2Size)) {
                    expect($screenshot1Size[1])->toBeLessThan(2100)
                        ->and($screenshot2Size[1])->toBeGreaterThan(4000);
                }
            }
        } finally {
            // A failing expectation throws, so without the finally block, the screenshot files would be left
            // behind in the storage dir whenever this test fails.
            helper_resetStorageDir();
        }
    },
);
test('BrowserAction::waitUntilDocumentContainsElement() works as expected', function () {
    $crawler = new HeadlessBrowserCrawler();

    // Only get the HTML after an element matching #delayed_container is in the document.
    $waitForContainer = BrowserAction::waitUntilDocumentContainsElement('#delayed_container');

    $loadStep = Http::get()
        ->postBrowserNavigateHook($waitForContainer)
        ->keep('body');

    $crawler
        ->input('http://localhost:8000/browser-actions/wait')
        ->addStep($loadStep);

    $results = helper_generatorToArray($crawler->run());

    $pageHtml = $results[0]->get('body');

    expect($pageHtml)->toContain('
hooray
');
});
test('BrowserAction::clickElementAndWaitForReload() works as expected', function () {
    $crawler = new HeadlessBrowserCrawler();

    // Click the #click element and wait for the page reload in one single action.
    $loadStep = Http::get()
        ->postBrowserNavigateHook(BrowserAction::clickElementAndWaitForReload('#click'))
        ->keep('body');

    $crawler
        ->input('http://localhost:8000/browser-actions/click-and-wait-for-reload')
        ->addStep($loadStep);

    $results = helper_generatorToArray($crawler->run());

    $pageHtml = $results[0]->get('body');

    expect($pageHtml)->toContain('
yes
');
});
test(
    'when on the click and wait for reload page, and the element is only clicked but we don\'t wait for reload, ' .
    'we don\'t get the reloaded page content',
    function () {
        $crawler = new HeadlessBrowserCrawler();

        // Only click, without waiting for the reload afterwards.
        $loadStep = Http::get()
            ->postBrowserNavigateHook(BrowserAction::clickElement('#click'))
            ->keep('body');

        $crawler
            ->input('http://localhost:8000/browser-actions/click-and-wait-for-reload')
            ->addStep($loadStep);

        $results = helper_generatorToArray($crawler->run());

        $pageHtml = $results[0]->get('body');

        expect($pageHtml)->not()->toContain('
yes
');
    },
);
test(
    'when on the click and wait for reload page, and the element is clicked and we also wait for reload, we get the ' .
    'reloaded page content',
    function () {
        $crawler = new HeadlessBrowserCrawler();

        // Clicking and waiting for the reload are two separate hooks here.
        $loadStep = Http::get()
            ->postBrowserNavigateHook(BrowserAction::clickElement('#click'))
            ->postBrowserNavigateHook(BrowserAction::waitForReload())
            ->keep('body');

        $crawler
            ->input('http://localhost:8000/browser-actions/click-and-wait-for-reload')
            ->addStep($loadStep);

        $results = helper_generatorToArray($crawler->run());

        $pageHtml = $results[0]->get('body');

        expect($pageHtml)->toContain('
yes
');
    },
);
test('BrowserAction::evaluateAndWaitForReload() works as expected', function () {
    $crawler = new HeadlessBrowserCrawler();

    // Javascript code, sending the browser to another page.
    $navigateScript = 'window.location.href = \'http://localhost:8000/browser-actions/' .
        'evaluate-and-wait-for-reload-reloaded\'';

    $loadStep = Http::get()
        ->postBrowserNavigateHook(BrowserAction::evaluateAndWaitForReload($navigateScript))
        ->keep('body');

    $crawler
        ->input('http://localhost:8000/browser-actions/evaluate-and-wait-for-reload')
        ->addStep($loadStep);

    $results = helper_generatorToArray($crawler->run());

    $pageHtml = $results[0]->get('body');

    expect($pageHtml)->toContain('
yay
');
});
test('BrowserAction::wait() works as expected', function () {
    $crawler = new HeadlessBrowserCrawler();

    // Wait for 0.3 seconds after navigating, before the HTML is retrieved.
    $loadStep = Http::get()
        ->postBrowserNavigateHook(BrowserAction::wait(0.3))
        ->keep('body');

    $crawler
        ->input('http://localhost:8000/browser-actions/wait')
        ->addStep($loadStep);

    $results = helper_generatorToArray($crawler->run());

    $pageHtml = $results[0]->get('body');

    expect($pageHtml)->toContain('
hooray
');
});
it('executes the javascript code provided via HeadlessBrowserLoaderHelper::setPageInitScript()', function () {
    $crawler = new HeadlessBrowserCrawler();

    $browserHelper = $crawler->getLoader()->browser();

    // The init script defines a global variable. The extracted #content is expected to contain its value.
    $browserHelper->setPageInitScript('window._secret_content = \'secret content\'');

    $extractContent = Html::root()->extract(['content' => '#content']);

    $crawler
        ->input('http://localhost:8000/page-init-script')
        ->addStep(Http::get())
        ->addStep($extractContent);

    $results = helper_generatorToArray($crawler->run());

    $content = $results[0]->get('content');

    expect($content)->toBe('secret content');
});
it('gets the source of an XML response without being wrapped in an HTML document', function () {
$crawler = new HeadlessBrowserCrawler();
$crawler
->input('http://localhost:8000/rss-feed')
->addStep(Http::get()->keep(['body']));
$results = helper_generatorToArray($crawler->run());
expect($results[0]->get('body'))->toStartWith('' . PHP_EOL . 'input('http://localhost:8000/broken-mime-type-rss')
->addStep(Http::get()->keep(['body']));
$results = helper_generatorToArray($crawler->run());
expect($results[0]->get('body'))->toStartWith('');
},
);
================================================
FILE: tests/_Integration/Http/Html/PaginatedListingTest.php
================================================
input('http://localhost:8000/paginated-listing');
$crawler
->addStep(Http::get()->paginate('#nextPage'))
->addStep(Html::getLinks('#listing .item a')->keepAs('url'))
->addStep(Http::get())
->addStep(
Html::first('article')
->extract(['title' => 'h1', 'number' => '.someNumber'])
->keep(),
);
$results = helper_generatorToArray($crawler->run());
expect($results)->toHaveCount(10)
->and($results[0]->toArray())->toBe([
'url' => 'http://localhost:8000/paginated-listing/items/1',
'title' => 'Some Item 1',
'number' => '10',
])
->and($results[9]->toArray())->toBe([
'url' => 'http://localhost:8000/paginated-listing/items/10',
'title' => 'Some Item 10',
'number' => '100',
]);
});
================================================
FILE: tests/_Integration/Http/Html/SimpleListingTest.php
================================================
input('http://localhost:8000/simple-listing');
$crawler->addStep(Http::get())
->addStep(Html::getLinks('.listingItem a'))
->addStep(Http::get())
->addStep(
Html::first('article')
->extract([
'title' => 'h1',
'date' => '.date',
'author' => '.articleAuthor',
])
->keep(),
);
$results = helper_generatorToArray($crawler->run());
expect($results)->toHaveCount(3)
->and($results[0]->toArray())->toBe([
'title' => 'Some Article 1',
'date' => '2022-04-13',
'author' => 'Christian Olear',
])
->and($results[1]->toArray())->toBe([
'title' => 'Some Article 2',
'date' => '2022-04-14',
'author' => 'Christian Olear',
])
->and($results[2]->toArray())->toBe([
'title' => 'Some Article 3',
'date' => '2022-04-15',
'author' => 'Christian Olear',
]);
});
================================================
FILE: tests/_Integration/Http/PaginationTest.php
================================================
input('http://localhost:8000/paginated-listing')
->addStep(Http::get()->paginate('#pagination'));
$results = helper_generatorToArray($crawler->run());
expect($results)->toHaveCount(5);
});
it('only iterates pagination until max pages limit is reached', function () {
    $crawler = new PaginationCrawler();

    // Paginate via the links in #pagination, but load 2 pages at max.
    $paginatingStep = Http::get()->paginate('#pagination', 2);

    $crawler->input('http://localhost:8000/paginated-listing')
        ->addStep($paginatingStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(2);

    $logOutput = $this->getActualOutputForAssertion();

    expect($logOutput)->toContain('Max pages limit reached');
});
it('resets the finished paginating state after each processed (/paginated) input', function () {
    $crawler = new PaginationCrawler();

    $startUrls = [
        'http://localhost:8000/paginated-listing',
        'http://localhost:8000/paginated-listing?foo=bar',
    ];

    $paginatingStep = Http::get()
        ->paginate('#pagination', 2)
        ->outputKey('response');

    $crawler
        ->inputs($startUrls)
        ->addStep($paginatingStep);

    $results = helper_generatorToArray($crawler->run());

    // 2 inputs, each of them paginated until the max pages limit of 2.
    expect($results)->toHaveCount(4);
});
================================================
FILE: tests/_Integration/Http/ProxyingTest.php
================================================
*/
public static array $processes = [8001 => null, 8002 => null, 8003 => null];
}
// Starts a PHP built-in web server, running the ProxyServer.php script, for every port in
// ProxyServerProcesses::PORTS that doesn't have a process yet, and waits until the freshly started servers
// accept connections.
beforeEach(function () {
    $startedPorts = [];

    foreach (ProxyServerProcesses::PORTS as $port) {
        if (!ProxyServerProcesses::$processes[$port]) {
            ProxyServerProcesses::$processes[$port] = Process::fromShellCommandline(
                'php -S localhost:' . $port . ' ' . __DIR__ . '/../ProxyServer.php',
            );

            ProxyServerProcesses::$processes[$port]->start();

            $startedPorts[] = $port;
        }
    }

    // Sleeping for a fixed amount of time after spawning the servers is a race: on a slow machine they may not
    // be listening yet. So, instead poll the ports until a connection can be established. The overall deadline
    // makes sure, that a server that fails to start doesn't block the test run (the tests then fail, like before).
    $deadline = microtime(true) + 2.0;

    foreach ($startedPorts as $port) {
        while (microtime(true) < $deadline) {
            // A refused connection emits a warning, which is expected here as long as the server isn't up.
            $connection = @fsockopen('localhost', $port, $errorCode, $errorMessage, 0.1);

            if (is_resource($connection)) {
                fclose($connection);

                break;
            }

            usleep(10_000);
        }
    }
});
// Stops all proxy server processes that were started for the tests in this file and forgets about them.
afterAll(function () {
    // The SIGINT constant is only defined when the pcntl extension is loaded. Without it, referencing the
    // constant throws an Error and the servers would keep running. 2 is the POSIX signal number of SIGINT.
    $signal = defined('SIGINT') ? SIGINT : 2;

    foreach (ProxyServerProcesses::PORTS as $port) {
        ProxyServerProcesses::$processes[$port]?->stop(3, $signal);

        ProxyServerProcesses::$processes[$port] = null;
    }
});
it('uses a proxy when the useProxy() method of the loader was called', function () {
    $crawler = helper_getFastCrawler();

    $loader = $crawler->getLoader();

    $loader->useProxy('http://localhost:8001');

    $crawler
        ->input('http://www.crwlr.software/packages')
        ->addStep(Http::get()->keep(['body']));

    $results = iterator_to_array($crawler->run());

    $firstResult = $results[0];

    expect($firstResult)->toBeInstanceOf(Result::class);

    // The response is expected to come from the local proxy server, not from the actually requested host.
    expect($firstResult->get('body'))
        ->toContain('Proxy Server Response for http://www.crwlr.software/packages');
});
it('uses correct method, headers and HTTP version in the proxied request', function () {
    $crawler = helper_getFastCrawler();

    $loader = $crawler->getLoader();

    $loader->useProxy('http://localhost:8001');

    // A PUT request with a custom header, a body and HTTP version 1.0.
    $putStep = Http::put(['Accept-Encoding' => 'gzip, deflate, br'], 'Hello World', '1.0')
        ->keep(['body']);

    $crawler
        ->input('http://www.crwlr.software/packages')
        ->addStep($putStep);

    $results = iterator_to_array($crawler->run());

    $firstResult = $results[0];

    expect($firstResult)->toBeInstanceOf(Result::class);

    // The proxy server's response is expected to contain the details of the request it received.
    expect($firstResult->get('body'))
        ->toContain('Protocol Version: HTTP/1.0')
        ->toContain('Request Method: PUT')
        ->toContain('Request Body: Hello World')
        ->toContain('["Accept-Encoding"]=>' . PHP_EOL . ' string(17) "gzip, deflate, br"');
});
it('uses rotating proxies when the useRotatingProxies() method of the loader was called', function () {
    $crawler = helper_getFastCrawler();

    $proxies = [
        'http://localhost:8001',
        'http://localhost:8002',
        'http://localhost:8003',
    ];

    $crawler->getLoader()->useRotatingProxies($proxies);

    $urls = [
        'http://www.crwlr.software/packages/crawler/v1.1/getting-started',
        'http://www.crwlr.software/packages/url/v2.0/getting-started',
        'http://www.crwlr.software/packages/query-string/v1.0/getting-started',
        'http://www.crwlr.software/packages/robots-txt/v1.1/getting-started',
    ];

    $crawler
        ->input($urls)
        ->addStep(Http::get()->keep(['body']));

    $results = iterator_to_array($crawler->run());

    expect($results)->toHaveCount(4);

    $expectedPorts = [
        '8001', // First request with first proxy
        '8002', // Second request with second proxy
        '8003', // Third request with third proxy
        '8001', // And finally the fourth request with the first proxy again.
    ];

    foreach ($expectedPorts as $index => $expectedPort) {
        expect($results[$index])->toBeInstanceOf(Result::class);

        expect($results[$index]->get('body'))->toContain('Port: ' . $expectedPort);
    }
});
it('can also use a proxy when using the headless browser', function () {
    $crawler = helper_getFastCrawler();

    $loader = $crawler->getLoader();

    $loader
        ->useHeadlessBrowser()
        ->useProxy('http://localhost:8001');

    $loadStep = Http::get(['Accept-Language' => 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7'])
        ->keep(['body']);

    $crawler
        ->input('http://www.crwlr.software/blog')
        ->addStep($loadStep);

    $results = iterator_to_array($crawler->run());

    $firstResult = $results[0];

    expect($firstResult)->toBeInstanceOf(Result::class);

    // The custom header must show up in the request details, printed by the proxy server.
    expect($firstResult->get('body'))
        ->toContain('["Accept-Language"]=>' . PHP_EOL . ' string(35) "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"');
});
it('can also use rotating proxies when using the headless browser', function () {
    $crawler = helper_getFastCrawler();

    $loader = $crawler->getLoader();

    $loader
        ->useHeadlessBrowser()
        ->useRotatingProxies([
            'http://localhost:8001',
            'http://localhost:8002',
        ]);

    $urls = [
        'http://www.crwlr.software/packages/crawler/v1.1',
        'http://www.crwlr.software/packages/url/v2.0',
        'http://www.crwlr.software/packages/query-string/v1.0',
    ];

    $crawler
        ->input($urls)
        ->addStep(Http::get()->keep(['body']));

    $results = iterator_to_array($crawler->run());

    expect($results)->toHaveCount(3);

    $expectedPorts = [
        '8001', // First request with first proxy
        '8002', // Second request with second proxy
        '8001', // And finally the third request with the first proxy again.
    ];

    foreach ($expectedPorts as $index => $expectedPort) {
        expect($results[$index])->toBeInstanceOf(Result::class);

        expect($results[$index]->get('body'))->toContain('Port: ' . $expectedPort);
    }
});
================================================
FILE: tests/_Integration/Http/PublisherExampleTest.php
================================================
input('http://localhost:8000/publisher/authors')
->addStep(Http::get())
->addStep(Html::getLinks('#authors a'))
->addStep(Http::get())
->addStep(
Html::root()
->extract([
'author' => 'h1',
'bookUrls' => Dom::cssSelector('#author-data .books a.book')->attribute('href')->toAbsoluteUrl(),
])
->keep(['author']),
)
->addStep(Http::get()->useInputKey('bookUrls'))
->addStep(
Html::root()
->extract(['book' => 'h1'])
->keep(),
);
$results = helper_generatorToArray($crawler->run());
expect($results)->toHaveCount(5)
->and($results[0]->toArray())->toBe([
'author' => 'John Example',
'book' => 'Some novel',
])
->and($results[1]->toArray())->toBe([
'author' => 'John Example',
'book' => 'Another novel',
])
->and($results[2]->toArray())->toBe([
'author' => 'Susan Example',
'book' => 'Poems #1',
])
->and($results[3]->toArray())->toBe([
'author' => 'Susan Example',
'book' => 'Poems #2',
])
->and($results[4]->toArray())->toBe([
'author' => 'Susan Example',
'book' => 'Poems #3',
]);
});
it('turns an array of URLs to nested extracted data from those child pages using sub crawlers', function () {
    // Builds a crawler, that loads all author pages and uses nested sub crawlers, to replace the book URLs with
    // book data and the edition URLs of each book with edition data.
    $builder = new class {
        public function build(): \Crwlr\Crawler\Crawler
        {
            return (new PublisherExampleCrawler())
                ->input('http://localhost:8000/publisher/authors')
                ->addStep(Http::get())
                ->addStep(Html::getLinks('#authors a'))
                ->addStep(Http::get())
                ->addStep($this->authorStep());
        }

        private function authorStep(): Html
        {
            $authorData = Html::root()->extract([
                'name' => 'h1',
                'age' => '#author-data .age',
                'bornIn' => '#author-data .born-in',
                'books' => Dom::cssSelector('#author-data .books a.book')->link(),
            ]);

            return $authorData->subCrawlerFor(
                'books',
                fn(\Crwlr\Crawler\Crawler $crawler) => $crawler
                    ->addStep(Http::get())
                    ->addStep($this->bookStep()),
            );
        }

        private function bookStep(): Html
        {
            $bookData = Html::root()->extract([
                'title' => 'h1',
                'editions' => Dom::cssSelector('#editions a')->link(),
            ]);

            return $bookData->subCrawlerFor(
                'editions',
                fn(\Crwlr\Crawler\Crawler $crawler) => $crawler
                    ->addStep(Http::get())
                    ->addStep($this->editionStep()),
            );
        }

        private function editionStep(): Html
        {
            return Html::root()->extract(['year' => '.year', 'publisher' => '.publishingCompany']);
        }
    };

    $expectedFirstAuthor = [
        'name' => 'John Example',
        'age' => '51',
        'bornIn' => 'Lisbon',
        'books' => [
            [
                'title' => 'Some novel',
                'editions' => [
                    ['year' => '1996', 'publisher' => 'Foo'],
                    ['year' => '2005', 'publisher' => 'Foo'],
                ],
            ],
            [
                'title' => 'Another novel',
                'editions' => [
                    ['year' => '2001', 'publisher' => 'Foo'],
                    ['year' => '2009', 'publisher' => 'Bar'],
                    ['year' => '2017', 'publisher' => 'Bar'],
                ],
            ],
        ],
    ];

    $expectedSecondAuthor = [
        'name' => 'Susan Example',
        'age' => '49',
        'bornIn' => 'Athens',
        'books' => [
            [
                'title' => 'Poems #1',
                'editions' => [
                    ['year' => '2008', 'publisher' => 'Poems'],
                    ['year' => '2009', 'publisher' => 'Poems'],
                ],
            ],
            [
                'title' => 'Poems #2',
                'editions' => [
                    ['year' => '2011', 'publisher' => 'Poems'],
                    ['year' => '2014', 'publisher' => 'New Poems'],
                ],
            ],
            [
                'title' => 'Poems #3',
                'editions' => [
                    ['year' => '2013', 'publisher' => 'Poems'],
                    ['year' => '2017', 'publisher' => 'New Poems'],
                ],
            ],
        ],
    ];

    $results = helper_generatorToArray($builder->build()->run());

    expect($results)->toHaveCount(2);

    expect($results[0]->toArray())->toBe($expectedFirstAuthor);

    expect($results[1]->toArray())->toBe($expectedSecondAuthor);
});
test('it can also keep the URLs, provided to the sub crawler', function () {
    // Same crawler as in the sub crawler test without URLs, except that the loading steps inside the sub crawlers
    // keep their input (the URL) with the key `url`.
    $builder = new class {
        public function build(): \Crwlr\Crawler\Crawler
        {
            return (new PublisherExampleCrawler())
                ->input('http://localhost:8000/publisher/authors')
                ->addStep(Http::get())
                ->addStep(Html::getLinks('#authors a'))
                ->addStep(Http::get())
                ->addStep($this->authorStep());
        }

        private function authorStep(): Html
        {
            $authorData = Html::root()->extract([
                'name' => 'h1',
                'age' => '#author-data .age',
                'bornIn' => '#author-data .born-in',
                'books' => Dom::cssSelector('#author-data .books a.book')->link(),
            ]);

            return $authorData->subCrawlerFor(
                'books',
                fn(\Crwlr\Crawler\Crawler $crawler) => $crawler
                    ->addStep(Http::get()->keepInputAs('url'))
                    ->addStep($this->bookStep()),
            );
        }

        private function bookStep(): Html
        {
            $bookData = Html::root()->extract([
                'title' => 'h1',
                'editions' => Dom::cssSelector('#editions a')->link(),
            ]);

            return $bookData->subCrawlerFor(
                'editions',
                fn(\Crwlr\Crawler\Crawler $crawler) => $crawler
                    ->addStep(Http::get()->keepInputAs('url'))
                    ->addStep($this->editionStep()),
            );
        }

        private function editionStep(): Html
        {
            return Html::root()->extract(['year' => '.year', 'publisher' => '.publishingCompany']);
        }
    };

    // Small factories for the expected data. The key order (url, then the extracted properties) matters,
    // because toBe() compares the arrays strictly.
    $bookUrl = static fn(int $bookId): string => 'http://localhost:8000/publisher/books/' . $bookId;

    $edition = static fn(int $bookId, int $editionNo, string $year, string $publisher): array => [
        'url' => $bookUrl($bookId) . '/edition/' . $editionNo,
        'year' => $year,
        'publisher' => $publisher,
    ];

    $book = static fn(int $bookId, string $title, array $editions): array => [
        'url' => $bookUrl($bookId),
        'title' => $title,
        'editions' => $editions,
    ];

    $expectedFirstAuthor = [
        'name' => 'John Example',
        'age' => '51',
        'bornIn' => 'Lisbon',
        'books' => [
            $book(1, 'Some novel', [
                $edition(1, 1, '1996', 'Foo'),
                $edition(1, 2, '2005', 'Foo'),
            ]),
            $book(2, 'Another novel', [
                $edition(2, 1, '2001', 'Foo'),
                $edition(2, 2, '2009', 'Bar'),
                $edition(2, 3, '2017', 'Bar'),
            ]),
        ],
    ];

    $expectedSecondAuthor = [
        'name' => 'Susan Example',
        'age' => '49',
        'bornIn' => 'Athens',
        'books' => [
            $book(3, 'Poems #1', [
                $edition(3, 1, '2008', 'Poems'),
                $edition(3, 2, '2009', 'Poems'),
            ]),
            $book(4, 'Poems #2', [
                $edition(4, 1, '2011', 'Poems'),
                $edition(4, 2, '2014', 'New Poems'),
            ]),
            $book(5, 'Poems #3', [
                $edition(5, 1, '2013', 'Poems'),
                $edition(5, 2, '2017', 'New Poems'),
            ]),
        ],
    ];

    $results = helper_generatorToArray($builder->build()->run());

    expect($results)->toHaveCount(2);

    expect($results[0]->toArray())->toBe($expectedFirstAuthor);

    expect($results[1]->toArray())->toBe($expectedSecondAuthor);
});
================================================
FILE: tests/_Integration/Http/QueryParamPaginationTest.php
================================================
input('http://localhost:8000/query-param-pagination')
->addStep(
Http::post(body: 'page=1')
->paginate(
Paginator::queryParams(5)
->inBody()
->increase('page')
->stopWhen(PaginatorStopRules::isEmptyInJson('data.items')),
)->keep(['body']),
);
$results = helper_generatorToArray($crawler->run());
expect($results)->toHaveCount(4);
});
it('also paginates using query params sent in the request body, when used in combination with static URL', function () {
    $crawler = new QueryParamPaginationCrawler();

    // Increase the `page` param in the request body, for 3 pages at max.
    $paginator = Paginator::queryParams(3)
        ->inBody()
        ->increase('page');

    $loadStep = Http::post(body: 'page=1')
        ->staticUrl('http://localhost:8000/query-param-pagination')
        ->paginate($paginator)
        ->keep(['body']);

    $crawler
        ->input('foo')
        ->addStep($loadStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(3);
});
it('paginates using URL query params', function () {
    $crawler = new QueryParamPaginationCrawler();

    // Increase the `page` param in the URL, for 5 pages at max, or until `data.items` in the JSON response
    // is empty.
    $paginator = Paginator::queryParams(5)
        ->inUrl()
        ->increase('page')
        ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items'));

    $loadStep = Http::get()
        ->paginate($paginator)
        ->keep(['body']);

    $crawler
        ->input('http://localhost:8000/query-param-pagination?page=1')
        ->addStep($loadStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(4);
});
it('paginates only until the max pages limit', function () {
    $crawler = new QueryParamPaginationCrawler();

    // The max pages limit of 2 is expected to stop the pagination here.
    $paginator = QueryParamsPaginator::paramsInUrl(2)
        ->increase('page')
        ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items'));

    $loadStep = Http::get()
        ->paginate($paginator)
        ->keep(['body']);

    $crawler
        ->input('http://localhost:8000/query-param-pagination?page=1')
        ->addStep($loadStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(2);
});
it('resets the finished paginating state after each processed (/paginated) input', function () {
    $crawler = new QueryParamPaginationCrawler();

    $startUrls = [
        'http://localhost:8000/query-param-pagination?page=1',
        'http://localhost:8000/query-param-pagination?page=1&foo=bar',
    ];

    $paginator = QueryParamsPaginator::paramsInUrl(2)
        ->increase('page')
        ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items'));

    $loadStep = Http::get()
        ->paginate($paginator)
        ->keep(['body']);

    $crawler
        ->inputs($startUrls)
        ->addStep($loadStep);

    $results = helper_generatorToArray($crawler->run());

    // 2 inputs, each of them paginated until the max pages limit of 2.
    expect($results)->toHaveCount(4);
});
================================================
FILE: tests/_Integration/Http/RedirectTest.php
================================================
input('http://localhost:8000/redirect?stopAt=5')
->addStep(Http::get())
->addStep((new GetResponseBodyAsString())->keepAs('body'));
$results = helper_generatorToArray($crawler->run());
expect($results)->toHaveCount(1)
->and($results[0]->get('body'))->toBe('success after 5 redirects');
});
it('stops at 10 redirects by default', function () {
    $crawler = new RedirectTestCrawler();

    $keepBody = (new GetResponseBodyAsString())->keepAs('body');

    $crawler
        ->input('http://localhost:8000/redirect?stopAt=11')
        ->addStep(Http::get())
        ->addStep($keepBody);

    $results = helper_generatorToArray($crawler->run());

    // Loading is expected to fail, so there is no result, but a message in the log output.
    expect($results)->toHaveCount(0);

    expect($this->getActualOutputForAssertion())
        ->toContain('Failed to load http://localhost:8000/redirect?stopAt=11: Too many redirects.');
});
test('you can set your own max redirects limit', function () {
    // Crawler raising the max redirects limit of its loader to 15.
    $crawler = new class extends HttpCrawler {
        protected function userAgent(): UserAgentInterface
        {
            return new UserAgent('RedirectBot');
        }

        protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
        {
            $loader = parent::loader($userAgent, $logger);

            if (!$loader instanceof HttpLoader) {
                return $loader;
            }

            $loader->setMaxRedirects(15);

            return $loader;
        }
    };

    $keepBody = (new GetResponseBodyAsString())->keepAs('body');

    $crawler
        ->input('http://localhost:8000/redirect?stopAt=11')
        ->addStep(Http::get())
        ->addStep($keepBody);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    expect($results[0]->get('body'))->toBe('success after 11 redirects');
});
================================================
FILE: tests/_Integration/Http/RequestParamsFromInputTest.php
================================================
'http://localhost:8000/print-headers',
'body' => 'test',
'headers' => [
'header-x' => 'foo',
'header-y' => ['bar'],
],
'header-y' => 'baz',
'header-z' => ['quz'],
];
}
};
$crawler = helper_getFastCrawler();
$crawler
->input('anything')
->addStep($paramsStep)
->addStep(
Http::get()
->useInputKeyAsBody('body')
->useInputKeyAsHeaders('headers')
->useInputKeyAsHeader('header-y', 'header-y')
->useInputKeyAsHeader('header-z', 'header-z'),
)
->addStep(Json::all());
$results = helper_generatorToArray($crawler->run());
expect($results)->toHaveCount(1);
$result = $results[0]->toArray();
expect($result['Content-Length'])->toBe('4');
expect($result['header-x'])->toBe('foo');
expect($result['header-y'])->toBe('bar, baz');
expect($result['header-z'])->toBe('quz');
});
================================================
FILE: tests/_Integration/Http/RetryErrorResponsesTest.php
================================================
input('http://localhost:8000' . $path)
->addStep(Http::get());
$start = microtime(true);
helper_generatorToArray($crawler->run());
$end = microtime(true);
$diff = $end - $start;
expect($diff)->toBeGreaterThan(3.0);
expect($diff)->toBeLessThan(3.5);
})->with(['/too-many-requests', '/service-unavailable']);
it(
    'starts the first retry after the number of seconds returned in the Retry-After HTTP header',
    function (string $path) {
        $crawler = new RetryErrorResponsesCrawler();

        $crawler
            ->input('http://localhost:8000' . $path . '/retry-after')
            ->addStep(Http::get());

        // Measure how long the whole crawler run takes.
        $startedAt = microtime(true);

        helper_generatorToArray($crawler->run());

        $elapsedSeconds = microtime(true) - $startedAt;

        expect($elapsedSeconds)
            ->toBeGreaterThan(4.0)
            ->toBeLessThan(4.5);
    },
)->with(['/too-many-requests', '/service-unavailable']);
it('goes on crawling when a retry receives a successful response', function (string $path) {
    $crawler = new RetryErrorResponsesCrawler();

    $crawler
        ->input('http://localhost:8000' . $path . '/succeed-on-second-attempt')
        ->addStep(Http::get());

    // Measure how long the whole crawler run takes.
    $startedAt = microtime(true);

    $results = helper_generatorToArray($crawler->run());

    $elapsedSeconds = microtime(true) - $startedAt;

    expect($results)->toHaveCount(1);

    expect($elapsedSeconds)
        ->toBeGreaterThan(1.0)
        ->toBeLessThan(1.5);
})->with(['/too-many-requests', '/service-unavailable']);
================================================
FILE: tests/_Integration/Http/RobotsTxtTest.php
================================================
input('http://localhost:8000/hello-world')
->addStep(Http::get())
->addStep(Html::root()->extract('body')->keepAs('body'));
$results = helper_generatorToArray($crawler->run());
expect($results[0]->get('body'))->toBe('Hello World!');
$logger = $crawler->getLogger();
foreach ($logger->messages as $message) {
expect($message['message'])->not->toContain(' was already called in this load call.');
}
});
it('also does not warn about loader hooks being called multiple times when loadOrFail() is used', function () {
    // See comment in the test above.
    $crawler = new RobotsTxtCrawler();

    $extractBody = Html::root()->extract('body')->keepAs('body');

    $crawler
        ->input('http://localhost:8000/hello-world')
        ->addStep(Http::get()->stopOnErrorResponse())
        ->addStep($extractBody);

    $results = helper_generatorToArray($crawler->run());

    expect($results[0]->get('body'))->toBe('Hello World!');

    // None of the logged messages may be the warning about a hook being called repeatedly.
    $loggedMessages = $crawler->getLogger()->messages;

    foreach ($loggedMessages as $logEntry) {
        expect($logEntry['message'])->not->toContain(' was already called in this load call.');
    }
});
================================================
FILE: tests/_Integration/Http/TimeoutTest.php
================================================
1,
'timeout' => 1,
]);
}
};
$crawler->input('http://localhost:8000/sleep')
->addStep(Http::get());
$crawler->runAndTraverse();
expect($this->getActualOutputForAssertion())->toContain('Operation timed out');
});
================================================
FILE: tests/_Integration/ProxyServer.php
================================================
= $stopAt) {
echo 'success after ' . $redirectNo . ' redirects';
return;
} else {
$stopAt = '&stopAt=' . $stopAt;
}
} else {
$stopAt = '';
}
header('Location: http://localhost:8000/redirect?no=' . ($redirectNo + 1) . $stopAt);
}
// Prefix-matched routes: anything starting with the given path is answered by the included script.
if (str_starts_with($route, '/non-utf-8-charset')) {
    return include __DIR__ . '/_Server/NonUtf8.php';
}

if (str_starts_with($route, '/page-init-script')) {
    return include __DIR__ . '/_Server/PageInitScript.php';
}

// Exact-match feed routes: the Content-Type header is sent before the fixture script is included.
if ($route === '/rss-feed') {
    header('Content-Type: text/xml; charset=utf-8');

    return include __DIR__ . '/_Server/RssFeed.php';
}

if ($route === '/broken-mime-type-rss') {
    header('Content-Type: application/rss+xml; charset=UTF-8');

    return include __DIR__ . '/_Server/BrokenMimeTypeRss.php';
}
if ($route === '/robots.txt') {
return <<Prevent Homograph Attacks using the crwlr/url Package - crwlr.software
Prevent Homograph Attacks using the crwlr/url Package
2022-01-19
This post is not crawling/scraping related, but about another
valuable use case for the url package, to prevent so-called
homograph attacks.
About the attack
Homograph attacks are using internationalized domain names (IDN) for
malicious links including domains that look like trusted organizations.
You might know attacks where they want to trick you with typos
like faecbook or things like zeros instead of Os (g00gle).
Using internationalized domain names this kind of attack is even
harder to spot because they are using characters that almost exactly
look like other characters (also depending on the font they're
displayed with).
Can you see the difference between those two As?
a а
No? But in fact they aren't the same. The second one is a Cyrillic
character.
You can check it e.g. by using PHP's ord function.
Browsers already implemented mechanisms to warn users that a page
they're visiting might not be as legitimate as they thought.
But still: if on your website, you are linking to urls originating
from user input, it'd be a good idea to have an eye on urls
containing internationalized domain names.
So you see, it's very easy to identify IDN urls with it. Of course
there are many legitimate IDN domains, so you might not want to
automatically block all of them. I'd suggest you could put some kind
of monitoring in place that notifies you about users posting links
to IDNs.
Maybe you're operating in a country where IDNs are very common. Maybe
in that case you can find a way to automatically sort out legitimate
uses from your area.
================================================
FILE: tests/_Integration/_Server/BrokenMimeTypeRss.php
================================================
Lorem ipsum
https://www.example.com/
Lorem ipsum dolor sit ametFri, 10 Jan 2025 10:48:01 +0000en
hourly
1 Foo
https://www.example.com/some-article
https://www.example.com/some-article#commentsFri, 10 Jan 2025 10:48:01 +0000https://www.example.com/?a=123Lorem ipsum dolor
sit amet
]]>
================================================
FILE: tests/_Integration/_Server/BrowserActions/ClickAndWaitForReload.php
================================================
Hello World
Click here
yes
================================================
FILE: tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReload.php
================================================
Hello World
================================================
FILE: tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReloadReloaded.php
================================================
Hello World
yay
================================================
FILE: tests/_Integration/_Server/BrowserActions/Main.php
================================================
Hello World
mouse wasn't here yet
mouse wasn't here yet
not scrolled up yet
not scrolled down yet
================================================
FILE: tests/_Integration/_Server/BrowserActions/Wait.php
================================================
Hello World
================================================
FILE: tests/_Integration/_Server/Publisher/BookDetailPage.php
================================================
Book