Repository: crwlrsoft/crawler Branch: main Commit: d6680f9e698a Files: 326 Total size: 1.1 MB Directory structure: gitextract_d22hbn5_/ ├── .editorconfig ├── .gitattributes ├── .github/ │ └── workflows/ │ └── ci.yml ├── .gitignore ├── .php-cs-fixer.php ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── bin/ │ └── add-git-hooks ├── composer.json ├── git-hooks/ │ └── pre-commit ├── phpstan.neon ├── phpunit.xml ├── src/ │ ├── Cache/ │ │ ├── CacheItem.php │ │ ├── Exceptions/ │ │ │ ├── MissingZlibExtensionException.php │ │ │ └── ReadingCacheFailedException.php │ │ └── FileCache.php │ ├── Crawler.php │ ├── HttpCrawler/ │ │ └── AnonymousHttpCrawlerBuilder.php │ ├── HttpCrawler.php │ ├── Input.php │ ├── Io.php │ ├── Loader/ │ │ ├── Http/ │ │ │ ├── Browser/ │ │ │ │ ├── Screenshot.php │ │ │ │ └── ScreenshotConfig.php │ │ │ ├── Cache/ │ │ │ │ └── RetryManager.php │ │ │ ├── Cookies/ │ │ │ │ ├── Cookie.php │ │ │ │ ├── CookieJar.php │ │ │ │ ├── Date.php │ │ │ │ └── Exceptions/ │ │ │ │ └── InvalidCookieException.php │ │ │ ├── Exceptions/ │ │ │ │ └── LoadingException.php │ │ │ ├── HeadlessBrowserLoaderHelper.php │ │ │ ├── HttpLoader.php │ │ │ ├── Messages/ │ │ │ │ └── RespondedRequest.php │ │ │ ├── Politeness/ │ │ │ │ ├── RetryErrorResponseHandler.php │ │ │ │ ├── RobotsTxtHandler.php │ │ │ │ ├── Throttler.php │ │ │ │ └── TimingUnits/ │ │ │ │ └── MultipleOf.php │ │ │ └── ProxyManager.php │ │ ├── Loader.php │ │ └── LoaderInterface.php │ ├── Logger/ │ │ ├── CliLogger.php │ │ └── PreStepInvocationLogger.php │ ├── Output.php │ ├── Result.php │ ├── Steps/ │ │ ├── BaseStep.php │ │ ├── Csv.php │ │ ├── Dom/ │ │ │ ├── DomDocument.php │ │ │ ├── HtmlDocument.php │ │ │ ├── HtmlElement.php │ │ │ ├── Node.php │ │ │ ├── NodeList.php │ │ │ ├── XmlDocument.php │ │ │ └── XmlElement.php │ │ ├── Dom.php │ │ ├── Exceptions/ │ │ │ └── PreRunValidationException.php │ │ ├── Filters/ │ │ │ ├── AbstractFilter.php │ │ │ ├── ArrayFilter.php │ │ │ ├── ClosureFilter.php │ │ │ ├── 
ComparisonFilter.php │ │ │ ├── Enums/ │ │ │ │ ├── ComparisonFilterRule.php │ │ │ │ ├── StringFilterRule.php │ │ │ │ ├── StringLengthFilterRule.php │ │ │ │ └── UrlFilterRule.php │ │ │ ├── Filter.php │ │ │ ├── FilterInterface.php │ │ │ ├── Filterable.php │ │ │ ├── NegatedFilter.php │ │ │ ├── StringFilter.php │ │ │ ├── StringLengthFilter.php │ │ │ └── UrlFilter.php │ │ ├── Group.php │ │ ├── Html/ │ │ │ ├── CssSelector.php │ │ │ ├── DomQuery.php │ │ │ ├── Exceptions/ │ │ │ │ └── InvalidDomQueryException.php │ │ │ ├── GetLink.php │ │ │ ├── GetLinks.php │ │ │ ├── MetaData.php │ │ │ ├── SchemaOrg.php │ │ │ ├── SelectorTarget.php │ │ │ └── XPathQuery.php │ │ ├── Html.php │ │ ├── Json.php │ │ ├── Loading/ │ │ │ ├── GetSitemapsFromRobotsTxt.php │ │ │ ├── Http/ │ │ │ │ ├── AbstractPaginator.php │ │ │ │ ├── Browser/ │ │ │ │ │ └── BrowserAction.php │ │ │ │ ├── Document.php │ │ │ │ ├── Paginate.php │ │ │ │ ├── Paginator.php │ │ │ │ └── Paginators/ │ │ │ │ ├── QueryParams/ │ │ │ │ │ ├── AbstractQueryParamManipulator.php │ │ │ │ │ ├── Decrementor.php │ │ │ │ │ ├── Incrementor.php │ │ │ │ │ └── QueryParamManipulator.php │ │ │ │ ├── QueryParamsPaginator.php │ │ │ │ ├── SimpleWebsitePaginator.php │ │ │ │ └── StopRules/ │ │ │ │ ├── Contains.php │ │ │ │ ├── IsEmptyInDom.php │ │ │ │ ├── IsEmptyInHtml.php │ │ │ │ ├── IsEmptyInJson.php │ │ │ │ ├── IsEmptyInXml.php │ │ │ │ ├── IsEmptyResponse.php │ │ │ │ ├── NotContains.php │ │ │ │ ├── PaginatorStopRules.php │ │ │ │ └── StopRule.php │ │ │ ├── Http.php │ │ │ ├── HttpBase.php │ │ │ ├── HttpCrawl.php │ │ │ └── LoadingStep.php │ │ ├── Refiners/ │ │ │ ├── AbstractRefiner.php │ │ │ ├── DateTime/ │ │ │ │ └── DateTimeFormat.php │ │ │ ├── DateTimeRefiner.php │ │ │ ├── Html/ │ │ │ │ └── RemoveFromHtml.php │ │ │ ├── HtmlRefiner.php │ │ │ ├── RefinerInterface.php │ │ │ ├── String/ │ │ │ │ ├── AbstractStringRefiner.php │ │ │ │ ├── StrAfterFirst.php │ │ │ │ ├── StrAfterLast.php │ │ │ │ ├── StrBeforeFirst.php │ │ │ │ ├── StrBeforeLast.php │ │ │ │ ├── 
StrBetweenFirst.php │ │ │ │ ├── StrBetweenLast.php │ │ │ │ └── StrReplace.php │ │ │ ├── StringRefiner.php │ │ │ ├── Url/ │ │ │ │ ├── AbstractUrlRefiner.php │ │ │ │ ├── WithFragment.php │ │ │ │ ├── WithHost.php │ │ │ │ ├── WithPath.php │ │ │ │ ├── WithPort.php │ │ │ │ ├── WithQuery.php │ │ │ │ ├── WithScheme.php │ │ │ │ └── WithoutPort.php │ │ │ └── UrlRefiner.php │ │ ├── Sitemap/ │ │ │ └── GetUrlsFromSitemap.php │ │ ├── Sitemap.php │ │ ├── Step.php │ │ ├── StepInterface.php │ │ ├── StepOutputType.php │ │ └── Xml.php │ ├── Stores/ │ │ ├── JsonFileStore.php │ │ ├── SimpleCsvFileStore.php │ │ ├── Store.php │ │ └── StoreInterface.php │ ├── UserAgents/ │ │ ├── BotUserAgent.php │ │ ├── BotUserAgentInterface.php │ │ ├── UserAgent.php │ │ └── UserAgentInterface.php │ └── Utils/ │ ├── Gzip.php │ ├── HttpHeaders.php │ ├── OutputTypeHelper.php │ ├── RequestKey.php │ └── TemplateString.php └── tests/ ├── Cache/ │ ├── CacheItemTest.php │ ├── FileCacheTest.php │ └── _cachefilecontent ├── CrawlerTest.php ├── HttpCrawler/ │ └── AnonymousHttpCrawlerBuilderTest.php ├── IoTest.php ├── Loader/ │ ├── Http/ │ │ ├── Browser/ │ │ │ └── ScreenshotConfigTest.php │ │ ├── Cache/ │ │ │ └── RetryManagerTest.php │ │ ├── Cookies/ │ │ │ ├── CookieJarTest.php │ │ │ ├── CookieTest.php │ │ │ └── DateTest.php │ │ ├── HeadlessBrowserLoaderHelperTest.php │ │ ├── HttpLoaderPolitenessTest.php │ │ ├── HttpLoaderTest.php │ │ ├── Messages/ │ │ │ └── RespondedRequestTest.php │ │ ├── Politeness/ │ │ │ ├── RobotsTxtHandlerTest.php │ │ │ ├── ThrottlerTest.php │ │ │ └── TimingUnits/ │ │ │ └── MultipleOfTest.php │ │ └── ProxyManagerTest.php │ └── LoaderTest.php ├── Logger/ │ ├── CliLoggerTest.php │ └── PreStepInvocationLoggerTest.php ├── Pest.php ├── ResultTest.php ├── Steps/ │ ├── BaseStepTest.php │ ├── CsvTest.php │ ├── Dom/ │ │ ├── HtmlDocumentTest.php │ │ ├── HtmlElementTest.php │ │ ├── NodeListTest.php │ │ ├── NodeTest.php │ │ ├── XmlDocumentTest.php │ │ ├── XmlElementTest.php │ │ └── _Stubs/ │ │ ├── 
HtmlNodeStub.php │ │ └── XmlNodeStub.php │ ├── DomTest.php │ ├── Filters/ │ │ ├── ArrayFilterTest.php │ │ ├── ClosureFilterTest.php │ │ ├── ComparisonFilterTest.php │ │ ├── Enums/ │ │ │ ├── ComparisonFilterRuleTest.php │ │ │ ├── StringFilterRuleTest.php │ │ │ ├── StringLengthFilterRuleTest.php │ │ │ └── UrlFilterRuleTest.php │ │ ├── FilterTest.php │ │ ├── NegatedFilterTest.php │ │ ├── StringFilterTest.php │ │ ├── StringLengthFilterTest.php │ │ └── UrlFilterTest.php │ ├── GroupTest.php │ ├── Html/ │ │ ├── CssSelectorTest.php │ │ ├── Exceptions/ │ │ │ └── InvalidDomQueryExceptionTest.php │ │ ├── GetLinkTest.php │ │ ├── GetLinksTest.php │ │ ├── MetaDataTest.php │ │ ├── SchemaOrgTest.php │ │ └── XPathQueryTest.php │ ├── HtmlTest.php │ ├── JsonTest.php │ ├── Loading/ │ │ ├── GetSitemapsFromRobotsTxtTest.php │ │ ├── Http/ │ │ │ ├── DocumentTest.php │ │ │ └── Paginators/ │ │ │ ├── AbstractPaginatorTest.php │ │ │ ├── QueryParams/ │ │ │ │ ├── AbstractQueryParamManipulatorTest.php │ │ │ │ ├── DecrementorTest.php │ │ │ │ └── IncrementorTest.php │ │ │ ├── QueryParamsPaginatorTest.php │ │ │ ├── SimpleWebsitePaginatorTest.php │ │ │ └── StopRules/ │ │ │ ├── ContainsTest.php │ │ │ ├── IsEmptyInHtmlTest.php │ │ │ ├── IsEmptyInJsonTest.php │ │ │ ├── IsEmptyInXmlTest.php │ │ │ ├── IsEmptyResponseTest.php │ │ │ └── NotContainsTest.php │ │ ├── HttpTest.php │ │ └── LoadingStepTest.php │ ├── Refiners/ │ │ ├── AbstractRefinerTest.php │ │ ├── DateTime/ │ │ │ └── DateTimeFormatTest.php │ │ ├── Html/ │ │ │ └── RemoveFromHtmlTest.php │ │ ├── String/ │ │ │ ├── AfterFirstTest.php │ │ │ ├── AfterLastTest.php │ │ │ ├── BeforeFirstTest.php │ │ │ ├── BeforeLastTest.php │ │ │ ├── BetweenFirstTest.php │ │ │ ├── BetweenLastTest.php │ │ │ └── ReplaceTest.php │ │ └── Url/ │ │ ├── WithFragmentTest.php │ │ ├── WithHostTest.php │ │ ├── WithPathTest.php │ │ ├── WithPortTest.php │ │ ├── WithQueryTest.php │ │ ├── WithSchemeTest.php │ │ └── WithoutPortTest.php │ ├── Sitemap/ │ │ └── GetUrlsFromSitemapTest.php 
│ ├── StepTest.php │ ├── XmlTest.php │ └── _Files/ │ ├── Csv/ │ │ ├── basic.csv │ │ ├── enclosure.csv │ │ ├── escape.csv │ │ ├── separator.csv │ │ └── with-column-headlines.csv │ ├── Html/ │ │ ├── basic.html │ │ ├── bookstore.html │ │ └── event.html │ └── Xml/ │ ├── bookstore.xml │ ├── events.xml │ └── rss-with-bom.xml ├── Stores/ │ ├── JsonFileStoreTest.php │ ├── SimpleCsvFileStoreTest.php │ └── _files/ │ └── .gitkeep ├── UserAgents/ │ ├── BotUserAgentTest.php │ └── UserAgentTest.php ├── Utils/ │ ├── GzipTest.php │ ├── HttpHeadersTest.php │ ├── OutputTypeHelperTest.php │ ├── RequestKeyTest.php │ └── TemplateStringTest.php ├── _Integration/ │ ├── GroupTest.php │ ├── Http/ │ │ ├── CharsetTest.php │ │ ├── CrawlingTest.php │ │ ├── ErrorResponsesTest.php │ │ ├── GzipTest.php │ │ ├── HeadlessBrowserTest.php │ │ ├── Html/ │ │ │ ├── PaginatedListingTest.php │ │ │ └── SimpleListingTest.php │ │ ├── PaginationTest.php │ │ ├── ProxyingTest.php │ │ ├── PublisherExampleTest.php │ │ ├── QueryParamPaginationTest.php │ │ ├── RedirectTest.php │ │ ├── RequestParamsFromInputTest.php │ │ ├── RetryErrorResponsesTest.php │ │ ├── RobotsTxtTest.php │ │ └── TimeoutTest.php │ ├── ProxyServer.php │ ├── Server.php │ └── _Server/ │ ├── BlogPostWithJsonLd.php │ ├── BrokenMimeTypeRss.php │ ├── BrowserActions/ │ │ ├── ClickAndWaitForReload.php │ │ ├── EvaluateAndWaitForReload.php │ │ ├── EvaluateAndWaitForReloadReloaded.php │ │ ├── Main.php │ │ └── Wait.php │ ├── Crawling.php │ ├── HelloWorld.php │ ├── JsGeneratedContent.php │ ├── NonUtf8.php │ ├── PageInitScript.php │ ├── PaginatedListing/ │ │ └── Detail.php │ ├── PaginatedListing.php │ ├── PrintCookie.php │ ├── PrintCookies.php │ ├── PrintHeaders.php │ ├── Publisher/ │ │ ├── AuthorDetailPage.php │ │ ├── AuthorsListPage.php │ │ ├── BookDetailPage.php │ │ └── EditionDetailPage.php │ ├── QueryParamPagination.php │ ├── RssFeed.php │ ├── ServiceUnavailable.php │ ├── SetCookie.php │ ├── SetCookieJs.php │ ├── SetDelayedCookieJs.php │ ├── 
SetMultipleCookiesJs.php │ ├── SimpleListing/ │ │ └── Detail.php │ ├── SimpleListing.php │ └── TooManyRequests.php ├── _Stubs/ │ ├── AbstractTestPaginator.php │ ├── Crawlers/ │ │ ├── DummyOne.php │ │ ├── DummyTwo/ │ │ │ ├── DummyTwoLoader.php │ │ │ ├── DummyTwoLogger.php │ │ │ └── DummyTwoUserAgent.php │ │ └── DummyTwo.php │ ├── DummyLogger.php │ ├── PhantasyLoader.php │ └── RespondedRequestChild.php └── _Temp/ ├── _cachedir/ │ └── .gitkeep └── _storagedir/ └── .gitkeep ================================================ FILE CONTENTS ================================================ ================================================ FILE: .editorconfig ================================================ # EditorConfig is awesome: http://EditorConfig.org root = true [*] charset = utf-8 end_of_line = lf indent_style = space indent_size = 4 insert_final_newline = true trim_trailing_whitespace = true [*.md] trim_trailing_whitespace = false [*.yml] indent_size = 2 [_cachefilecontent] insert_final_newline = false ================================================ FILE: .gitattributes ================================================ .github export-ignore bin/add-git-hooks export-ignore git-hooks export-ignore tests export-ignore .editorconfig export-ignore .gitattributes export-ignore .gitignore export-ignore .php-cs-fixer.php export-ignore phpstan.neon export-ignore phpunit.xml export-ignore ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: pull_request jobs: tests: name: PestPHP Tests runs-on: ubuntu-latest strategy: matrix: php-versions: ['8.1', '8.2', '8.3', '8.4', '8.5'] steps: - name: Checkout repository uses: actions/checkout@v4 - name: Install PHP uses: shivammathur/setup-php@v2 with: php-version: ${{ matrix.php-versions }} - name: Install dependencies run: composer install --prefer-dist --no-progress - name: Run tests run: composer test - name: Run integration tests run: composer 
test-integration tests84: name: PestPHP Tests Running only on PHP >= 8.4 runs-on: ubuntu-latest strategy: matrix: php-versions: ['8.4', '8.5'] steps: - name: Checkout repository uses: actions/checkout@v4 - name: Install PHP uses: shivammathur/setup-php@v2 with: php-version: ${{ matrix.php-versions }} - name: Install dependencies run: composer install --prefer-dist --no-progress - name: Run tests run: composer test-php84 stanAndCs: name: Static Analysis (phpstan) and Code Style (PHP CS Fixer) runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v4 - name: Install PHP uses: shivammathur/setup-php@v2 with: php-version: '8.1' coverage: none - name: Install dependencies run: composer install --prefer-dist --no-progress - name: Run PHPStan run: composer stan - name: Run PHP CS Fixer run: composer cs ================================================ FILE: .gitignore ================================================ composer.lock vendor .php_cs.cache .php-cs-fixer.cache .phpunit.result.cache .phpunit.cache /cachedir /storedir /tests/_Temp/_cachedir/* !/tests/_Temp/_cachedir/.gitkeep ================================================ FILE: .php-cs-fixer.php ================================================ exclude(['tests/_Integration/_Server', '.github', 'bin', 'git-hooks']) ->in(__DIR__); return (new Config()) ->setFinder($finder) ->setParallelConfig(ParallelConfigFactory::detect()) ->setRules([ '@PER-CS' => true, 'strict_param' => true, 'array_syntax' => ['syntax' => 'short'], 'no_unused_imports' => true, 'operator_linebreak' => ['only_booleans' => true, 'position' => 'end'], ]) ->setRiskyAllowed(true) ->setUsingCache(true); ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ## [3.5.6] - 2026-01-05 ### Fixed * Potential issues found with PHPStan 2 on level 8. ## [3.5.5] - 2025-08-05 ### Fixed * Removed the overriding `validateAndSanitizeInput()` method from the `Paginate` HTTP step to ensure features like `staticUrl()` and `useInputKeyAsUrl()` work correctly. * The `Paginate` HTTP step now also supports receiving an array of URLs, initiating pagination separately for each one. ### Deprecated * The `Crwlr\Crawler\Steps\Loading\Http\Paginate` class. It shall be removed and its behavior implemented in the `Http` class directly, in the next major version. ## [3.5.4] - 2025-07-28 ### Fixed * An issue in the `SimpleWebsitePaginator` when used with stop rules. ## [3.5.3] - 2025-06-10 ### Fixed * Issues with passing cookies from the cookie jar to the headless browser when using the `useBrowser()` method on `Http` steps, in cases where the loader wasn’t globally configured to use the browser for all requests. ## [3.5.2] - 2025-05-16 ### Fixed * The `Result::toArray()` method now converts all objects contained in the Result array (at any level of the array) to arrays. Also, if the only element in a result array has some autogenerated key containing "unnamed", but the value also is an associative array with string keys, the method only returns that child array. ## [3.5.1] - 2025-04-23 ### Fixed * An issue that occurred, when a step uses the `PreStepInvocationLogger`. As refiners also use the logger, a newer logger (replacing the `PreStepInvocationLogger`) is now also passed to all registered refiners of a step. * Enable applying refiners to output properties with array value. E.g. if a step outputs an array of URLs (`['https://...', 'https://...']`), a `UrlRefiner` will be applied to all those URLs. 
## [3.5.0] - 2025-04-10 ### Added * Dynamically building request URLs from extracted data: `Http` steps now have a new `staticUrl()` method, and you can also use variables within that static URL - as well as in request headers and the body - like `https://www.example.com/foo/[crwl:some_extracted_property]`. These variables will be replaced with the corresponding properties from input data (also works with kept data). * New Refiners: * `DateTimeRefiner::reformat('Y-m-d H:i:s')` to reformat a date time string to a different format. Tries to automatically recognize the input format. If this does not work, you can provide an input format to use as the second argument. * `HtmlRefiner::remove('#foo')` to remove nodes matching the given selector from selected HTML. * Steps that produce multiple outputs per input can now group them per input by calling the new `Step::oneOutputPerInput()` method. ## [3.4.5] - 2025-04-09 ### Fixed * When feeding an `Http` step with a string that is not a valid URL (e.g. `https://`), the exception when trying to parse it as a URL is caught, and an error logged. ## [3.4.4] - 2025-04-04 ### Fixed * As sometimes, XML parsing errors occur because of characters that aren't valid within XML documents, the library now catches XML parsing errors, tries to find and replace invalid characters (with transliterates or HTML entities) and retries parsing the document. Works best when you additionally install the `voku/portable-ascii` composer package. ## [3.4.3] - 2025-04-03 ### Fixed * When providing an empty base selector to an `Html` step (`Html::each('')`, `Html::first('')`, `Html::last('')`), it won't fail with an error, but instead log a warning, that it most likely doesn't make sense. * The `Step::keep()` methods now also work when applied to child steps within a group step. ## [3.4.2] - 2025-03-08 ### Fixed * Issue when using `Http::get()->useBrowser()->postBrowserNavigateHook()`. 
Previously in this case, when the loader was configured to use the HTTP client, the post browser navigate hook was not actually set, because of an issue with the order in which things happened internally. ## [3.4.1] - 2025-03-08 ### Fixed * Since, when using the Chrome browser for loading, we can only execute GET requests: * The loader now automatically switches to the HTTP client for POST, PUT, PATCH, and DELETE requests and logs a warning. * A warning is logged when attempting to use "Post Browser Navigate Hooks" with POST, PUT, PATCH, or DELETE requests. * Consequently, the `useBrowser()` method, introduced in v3.4.0, is also limited to GET requests. ## [3.4.0] - 2025-03-06 ### Added * Two new methods to the base class of all `Http` steps: * `skipCache()` – Allows using the cache while skipping it for a specific loading step. * `useBrowser()` – Switches the loader to use a (headless) Chrome browser for loading calls in a specific step and then reverts the loader to its previous setting. * Introduced the new `BrowserAction::screenshot()` post browser navigate hook. It accepts an instance of the new `ScreenshotConfig` class, allowing you to configure various options (see the methods of `ScreenshotConfig`). If successful, the screenshot file paths are included in the `RespondedRequest` output object of the `Http` step. ## [3.3.0] - 2025-03-02 ### Added * New `BrowserAction`s to use with the `postBrowserNavigateHook()` method: * `BrowserAction::clickInsideShadowDom()` * `BrowserAction::moveMouseToElement()` * `BrowserAction::moveMouseToPosition()` * `BrowserAction::scrollDown()` * `BrowserAction::scrollUp()` * `BrowserAction::typeText()` * `BrowserAction::waitForReload()` * A new method in `HeadlessBrowserLoaderHelper` to include the HTML content of shadow DOM elements in the returned HTML. Use it like this: `$crawler->getLoader()->browser()->includeShadowElementsInHtml()`.
### Changed * The `BrowserAction::clickElement()` action now automatically waits for an element matching the selector to be rendered before performing the click. This means you don't need to put a `BrowserAction::waitUntilDocumentContainsElement()` before it. It works the same in the new `BrowserAction::clickInsideShadowDom()` and `BrowserAction::moveMouseToElement()` actions. ### Deprecated * `BrowserAction::clickElementAndWaitForReload()` and `BrowserAction::evaluateAndWaitForReload()`. As a replacement, please use `BrowserAction::clickElement()` or `BrowserAction::evaluate()` and `BrowserAction::waitForReload()` separately. ## [3.2.5] - 2025-02-26 ### Fixed * When a child step is nested in the `extract()` method of an `Html` or `Xml` step, and does not use `each()` as the base, the extracted value is an array with the keys defined in the `extract()` call, rather than an array of such arrays as it would be with `each()` as base. ## [3.2.4] - 2025-02-25 ### Fixed * Trying to load a relative reference URI (no scheme and host/authority, only path) via the `HttpLoader` now immediately logs (or throws when `loadOrFail()` is used) an error instead of trying to actually load it. ## [3.2.3] - 2025-01-28 ### Fixed * Fix deprecation warning triggered in the `DomQuery` class, when trying to get the value of an HTML/XML attribute that does not exist on the element. ## [3.2.2] - 2025-01-17 ### Fixed * Warnings about loader hooks being called multiple times, when using a `BotUserAgent` and therefore loading and respecting the robots.txt file, or when using the `Http::stopOnErrorResponse()` method. ## [3.2.1] - 2025-01-13 ### Fixed * Reuse previously opened page when using the (headless) Chrome browser, instead of opening a new page for each request. ## [3.2.0] - 2025-01-12 ### Added * `RespondedRequest::isServedFromCache()` to determine whether a response was served from cache or actually loaded.
## [3.1.5] - 2025-01-10 ### Fixed * Another improvement for getting XML source when using the browser, in cases where Chrome doesn't identify the response as an XML document (even though a Content-Type header is sent). ## [3.1.4] - 2025-01-10 ### Fixed * `HttpLoader::dontUseCookies()` now also works when using the Chrome browser. Cookies are cleared before every request. ## [3.1.3] - 2025-01-10 ### Fixed * Further improve getting the raw response body from non-HTML documents via Chrome browser. ## [3.1.2] - 2025-01-08 ### Fixed * When loading a non-HTML document (e.g., XML) via the Chrome browser, the library now retrieves the original source. Previously, it returned the outerHTML of the rendered document, which wrapped the content in an HTML structure. ## [3.1.1] - 2025-01-07 ### Fixed * When the `validateAndSanitize()` method of a step throws an `InvalidArgumentException`, the exception is now caught, logged and the step is not invoked with the invalid input. This improves fault tolerance. Feeding a step with one invalid input shouldn't cause the whole crawler run to fail. Exceptions other than `InvalidArgumentException` remain uncaught. ## [3.1.0] - 2025-01-03 ### Added * New method `HeadlessBrowserLoaderHelper::setPageInitScript()` (`$crawler->getLoader()->browser()->setPageInitScript()`) to provide javascript code that is executed on every new browser page before navigating anywhere. * New method `HeadlessBrowserLoaderHelper::useNativeUserAgent()` (`$crawler->getLoader()->browser()->useNativeUserAgent()`) to allow using the native `User-Agent` that your Chrome browser sends by default. ## [3.0.4] - 2024-12-18 ### Fixed * Minor improvement for the `DomQuery` (base for `Dom::cssSelector()` and `Dom::xPath()`): enable providing an empty string as selector, to simply get the node that the selector is applied to. ## [3.0.3] - 2024-12-11 ### Fixed * Improved fix for non UTF-8 characters in HTML documents declared as UTF-8. 
## [3.0.2] - 2024-12-11 ### Fixed * When the new PHP 8.4 DOM API is used, and HTML declared as UTF-8 contains non UTF-8 compatible characters, it does not replace them with a � character, but instead removes them. This behaviour is consistent with the data returned by Symfony DomCrawler. ## [3.0.1] - 2024-12-10 ### Undeprecated * Removed deprecations for all XPath functionality (`Dom::xPath()`, `XPathQuery` class and `Node::queryXPath()`), because it's still available with the new DOM API in PHP 8.4. ## [3.0.0] - 2024-12-08 The primary change in version 3.0.0 is that the library now leverages PHP 8.4’s new DOM API when used in an environment with PHP >= 8.4. To maintain compatibility with PHP < 8.4, an abstraction layer has been implemented. This layer dynamically uses either the Symfony DomCrawler component or the new DOM API, depending on the PHP version. Since no direct interaction with an instance of the Symfony DomCrawler library was required at the step level provided by the library, it is highly likely that you won’t need to make any changes to your code to upgrade to v3. To ensure a smooth transition, please review the points under “Changed.” ### Changed * __BREAKING__: The `DomQuery::innerText()` method (a.k.a. `Dom::cssSelector('...')->innerText()`) has been removed. `innerText` exists only in the Symfony DomCrawler component, and its usefulness is questionable. If you still require this variant of the DOM element text, please let us know or create a pull request yourself. Thank you! * __BREAKING__: The `DomQueryInterface` was removed. As the `DomQuery` class offers a lot more functionality than the interface defines, the purpose of the interface was questionable. Please use the abstract `DomQuery` class instead. This also means that some method signatures, type hinting the interface, have changed. Look for occurrences of `DomQueryInterface` and replace them.
* __BREAKING__: The visibility of the `DomQuery::filter()` method was changed from public to protected. It is still needed in the `DomQuery` class, but outside of it, it is probably better and easier to directly use the new DOM abstraction (see the `src/Steps/Dom` directory). If you are extending the `DomQuery` class (which is not recommended), be aware that the argument now takes a `Node` (from the new DOM abstraction) instead of a Symfony `Crawler`. * __BREAKING__: The `Step::validateAndSanitizeToDomCrawlerInstance()` method was removed. Please use the `Step::validateAndSanitizeToHtmlDocumentInstance()` and `Step::validateAndSanitizeToXmlDocumentInstance()` methods instead. * __BREAKING__: The second argument in `Closure`s passed to the `Http::crawl()->customFilter()` has changed from an instance of Symfony `Crawler` class, to an `HtmlElement` instance from the new DOM abstraction (`Crwlr\Crawler\Steps\Dom\HtmlElement`). * __BREAKING__: The Filter class was split into `AbstractFilter` (base class for actual filter classes) and `Filter` only hosting the static function for easy instantiation, because otherwise each filter class also has all the static methods. * __BREAKING__: Further, the signatures of some methods that are mainly here for internal usage, have changed due to the new DOM abstraction: * The static `GetLink::isSpecialNonHttpLink()` method now needs an instance of `HtmlElement` instead of a Symfony `Crawler`. * `GetUrlsFromSitemap::fixUrlSetTag()` now takes an `XmlDocument` instead of a Symfony `Crawler`. * The `DomQuery::apply()` method now takes a `Node` instead of a Symfony `Crawler`. ### Deprecated * `Dom::xPath()` method and * the `XPathQuery` class as well as * the new `Node::queryXPath()` method. ### Added * New step output filter `Filter::arrayHasElement()`. When a step produces array output with a property being a numeric array, you can now filter outputs by checking if one element of that array property, matches certain filter criteria. 
Example: The outputs look like `['foo' => 'bar', 'baz' => ['one', 'two', 'three']]`. You can filter all outputs where `baz` contains `two` like: `Filter::arrayHasElement()->where('baz', Filter::equal('two'))`. ## [2.1.3] - 2024-11-05 ### Fixed * Improvements for deprecations in PHP 8.4. ## [2.1.2] - 2024-10-22 ### Fixed * Issue when converting cookie objects received from the chrome-php library. ## [2.1.1] - 2024-10-21 ### Fixed * Also add cookies, set during headless browser usage, to the cookie jar. When switching back to the (guzzle) HTTP client the cookies should also be sent. * Don't call `Loader::afterLoad()` when `Loader::beforeLoad()` was not called before. This can potentially happen, when an exception is thrown before the call to the `beforeLoad` hook, but it is caught and the `afterLoad` hook method is called anyway. As this most likely won't make sense to users, the `afterLoad` hook callback functions will just not be called in this case. * The `Throttler` class now has protected methods `_internalTrackStartFor()`, `_requestToUrlWasStarted()` and `_internalTrackEndFor()`. When extending the `Throttler` class (be careful, actually that's not really recommended) they can be used to check if a request to a URL was actually started before. ## [2.1.0] - 2024-10-19 ### Added * The new `postBrowserNavigateHook()` method in the `Http` step classes, which allows you to define callback functions that are triggered after the headless browser navigated to the specified URL. They are called with the chrome-php `Page` object as argument, so you can interact with the page. Also, there is a new class `BrowserAction` providing some simple actions (like wait for element, click element,...) as Closures via static methods. You can use it like `Http::get()->postBrowserNavigateHook(BrowserAction::clickElement('#element'))`. ## [2.0.1] - 2024-10-15 ### Fixed * Issue with the `afterLoad` hook of the `HttpLoader`, introduced in v2.
Calling the hook was commented out, which slipped through because the test case was faulty. ## [2.0.0] - 2024-10-15 ### Changed * __BREAKING__: Removed methods `BaseStep::addToResult()`, `BaseStep::addLaterToResult()`, `BaseStep::addsToOrCreatesResult()`, `BaseStep::createsResult()`, and `BaseStep::keepInputData()`. These methods were deprecated in v1.8.0 and should be replaced with `Step::keep()`, `Step::keepAs()`, `Step::keepFromInput()`, and `Step::keepInputAs()`. * __BREAKING__: Added the following keep methods to the `StepInterface`: `StepInterface::keep()`, `StepInterface::keepAs()`, `StepInterface::keepFromInput()`, `StepInterface::keepInputAs()`, as well as `StepInterface::keepsAnything()`, `StepInterface::keepsAnythingFromInputData()` and `StepInterface::keepsAnythingFromOutputData()`. If you have a class that implements this interface without extending `Step` (or `BaseStep`), you will need to implement these methods yourself. However, it is strongly recommended to extend `Step` instead. * __BREAKING__: With the removal of the `addToResult()` method, the library no longer uses `toArrayForAddToResult()` methods on output objects. Instead, please use `toArrayForResult()`. Consequently, `RespondedRequest::toArrayForAddToResult()` has been renamed to `RespondedRequest::toArrayForResult()`. * __BREAKING__: Removed the `result` and `addLaterToResult` properties from `Io` objects (`Input` and `Output`). These properties were part of the `addToResult` feature and are now removed. Instead, use the `keep` property where kept data is added. * __BREAKING__: The signature of the `Crawler::addStep()` method has changed. You can no longer provide a result key as the first parameter. Previously, this key was passed to the `Step::addToResult()` method internally. Now, please handle this call yourself. * __BREAKING__: The return type of the `Crawler::loader()` method no longer allows `array`. This means it's no longer possible to provide multiple loaders from the crawler. 
Instead, use the new functionality to directly provide a custom loader to a step described below. As part of this change, the `UnknownLoaderKeyException` was also removed as it is now obsolete. If you have any references to this class, please make sure to remove them. * __BREAKING__: Refactored the abstract `LoadingStep` class to a trait and removed the `LoadingStepInterface`. Loading steps should now extend the `Step` class and use the trait. As multiple loaders are no longer supported, the `addLoader` method was renamed to `setLoader`. Similarly, the methods `useLoader()` and `usesLoader()` for selecting loaders by key are removed. Now, you can directly provide a different loader to a single step using the trait's new `withLoader()` method (e.g., `Http::get()->withLoader($loader)`). The trait now also uses phpdoc template tags, for a generic loader type. You can define the loader type by putting `/** @use LoadingStep */` above `use LoadingStep;` in your step class. Then your IDE and static analysis (if supported) will know what type of loader, the trait methods return and accept. * __BREAKING__: Removed the `PaginatorInterface` to allow for better extensibility. The old `Crwlr\Crawler\Steps\Loading\Http\Paginators\AbstractPaginator` class has also been removed. Please use the newer, improved version `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator`. This newer version has also changed: the first argument `UriInterface $url` is removed from the `processLoaded()` method, as the URL also is part of the request (`Psr\Http\Message\RequestInterface`) which is now the first argument. Additionally, the default implementation of the `getNextRequest()` method is removed. Child implementations must define this method themselves. If your custom paginator still has a `getNextUrl()` method, note that it is no longer needed by the library and will not be called. The `getNextRequest()` method now fulfills its original purpose. 
* __BREAKING__: Removed methods from `HttpLoader`: * `$loader->setHeadlessBrowserOptions()` => use `$loader->browser()->setOptions()` instead * `$loader->addHeadlessBrowserOptions()` => use `$loader->browser()->addOptions()` instead * `$loader->setChromeExecutable()` => use `$loader->browser()->setExecutable()` instead * `$loader->browserHelper()` => use `$loader->browser()` instead * __BREAKING__: Removed method `RespondedRequest::cacheKeyFromRequest()`. Use `RequestKey::from()` instead. * __BREAKING__: The `HttpLoader::retryCachedErrorResponses()` method now returns an instance of the new `Crwlr\Crawler\Loader\Http\Cache\RetryManager` class. This class provides the methods `only()` and `except()` to restrict retries to specific HTTP response status codes. Previously, this method returned the `HttpLoader` itself (`$this`), so if you're using it in a chain and calling other loader methods after it, you will need to refactor your code. * __BREAKING__: Removed the `Microseconds` class from this package. It has been moved to the `crwlr/utils` package, which you can use instead. ### Added * New methods `FileCache::prolong()` and `FileCache::prolongAll()` to allow prolonging the time to live for cached responses. ### Fixed * The `maxOutputs()` method is now also available and working on `Group` steps. * Improved warning messages for step validations that are happening before running a crawler. * A `PreRunValidationException` when the crawler finds a problem with the setup, before actually running, is not only logged as an error via the logger, but also rethrown to the user. This way the user won't get the impression, that the crawler ran successfully without looking at the log messages. 
## [1.10.0] - 2024-08-05 ### Added * URL refiners: `UrlRefiner::withScheme()`, `UrlRefiner::withHost()`, `UrlRefiner::withPort()`, `UrlRefiner::withoutPort()`, `UrlRefiner::withPath()`, `UrlRefiner::withQuery()`, `UrlRefiner::withoutQuery()`, `UrlRefiner::withFragment()` and `UrlRefiner::withoutFragment()`. * New paginator stop rules `PaginatorStopRules::contains()` and `PaginatorStopRules::notContains()`. * Static method `UserAgent::mozilla5CompatibleBrowser()` to get a `UserAgent` instance with the user agent string `Mozilla/5.0 (compatible)` and also the new method `withMozilla5CompatibleUserAgent` in the `AnonymousHttpCrawlerBuilder` that you can use like this: `HttpCrawler::make()->withMozilla5CompatibleUserAgent()`. ## [1.9.5] - 2024-07-25 ### Fixed * Prevent PHP warnings when an HTTP response includes a `Content-Type: application/x-gzip` header, but the content is not actually compressed. This issue also occurred with cached responses, because compressed content is decoded during caching. Upon retrieval from the cache, the header indicated compression, but the content was already decoded. ## [1.9.4] - 2024-07-24 ### Fixed * When using `HttpLoader::cacheOnlyWhereUrl()` to restrict caching, the filter rule is not only applied when adding newly loaded responses to the cache, but also for using cached responses. Example: a response for `https://www.example.com/foo` is already available in the cache, but `$loader->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'))` was called, the cached response is not used. ## [1.9.3] - 2024-07-05 ### Fixed * Add `HttpLoader::browser()` as a replacement for `HttpLoader::browserHelper()` and deprecate the `browserHelper()` method. It's an alias and just because it will read a little better: `$loader->browser()->xyz()` vs. `$loader->browserHelper()->xyz()`. `HttpLoader::browserHelper()` will be removed in v2.0. 
* Also deprecate `HttpLoader::setHeadlessBrowserOptions()`, `HttpLoader::addHeadlessBrowserOptions()` and `HttpLoader::setChromeExecutable()`. Use `$loader->browser()->setOptions()`, `$loader->browser()->addOptions()` and `$loader->browser()->setExecutable()` instead. ## [1.9.2] - 2024-06-18 ### Fixed * Issue with setting the headless chrome executable, introduced in 1.9.0. ## [1.9.1] - 2024-06-17 ### Added * Also add `HeadlessBrowserLoaderHelper::getTimeout()` to get the currently configured timeout value. ## [1.9.0] - 2024-06-17 ### Added * New methods `HeadlessBrowserLoaderHelper::setTimeout()` and `HeadlessBrowserLoaderHelper::waitForNavigationEvent()` to allow defining the timeout for the headless chrome in milliseconds (default 30000 = 30 seconds) and the navigation event (`load` (default), `DOMContentLoaded`, `firstMeaningfulPaint`, `networkIdle`, etc.) to wait for when loading a URL. ## [1.8.0] - 2024-06-05 ### Added * New methods `Step::keep()` and `Step::keepAs()`, as well as `Step::keepFromInput()` and `Step::keepInputAs()`, as alternatives to `Step::addToResult()` (or `Step::addLaterToResult()`). The `keep()` method can be called without any argument, to keep all from the output data. It can be called with a string, to keep a certain key or with an array to keep a list of keys. If the step yields scalar value outputs (not an associative array or object with keys) you need to use the `keepAs()` method with the key you want the output value to have in the kept data. The methods `keepFromInput()` and `keepInputAs()` work the same, but uses the input (not the output) that the step receives. Most likely only needed with a first step, to keep data from initial inputs (or in a sub crawler, see below). Kept properties can also be accessed with the `Step::useInputKey()` method, so you can easily reuse properties from multiple steps ago as input. * New method `Step::outputType()` with default implementation returning `StepOutputType::Mixed`. 
Please consider implementing this method yourself in all your custom steps, because it is going to be required in v2 of the library. It allows detecting (potential) problems in crawling procedures immediately when starting a run instead of failing after already running a while. * New method `Step::subCrawlerFor()`, allowing to fill output properties from an actual full child crawling procedure. As the first argument, you give it a key from the step's output, that the child crawler uses as input(s). As the second argument you need to provide a `Closure` that receives a clone of the current `Crawler` without steps and with initial inputs, set from the current output. In the `Closure` you then define the crawling procedure by adding steps as you're used to do it, and return it. This allows to achieve nested output data, scraped from different (sub-)pages, more flexible and less complicated as with the usual linear crawling procedure and `Step::addToResult()`. ### Deprecated * The `Step::addToResult()`, `Step::addLaterToResult()` and `Step::keepInputData()` methods. Instead, please use the new keep methods. This can cause some migration work for v2, because especially the add to result methods are a pretty central functionality, but the new "keep" methodology (plus the new sub crawler feature) will make a lot of things easier, less complex and the library will most likely work more efficiently in v2. ### Fixed * When a cache file was generated with compression, and you're trying to read it with a `FileCache` instance without compression enabled, it also works. When unserializing the file content fails it tries decoding the string first before unserializing it. ## [1.7.2] - 2024-03-19 ### Fixed * When the `useInputKey()` method is used on a step and the defined key does not exist in input, it logs a warning and does not invoke the step instead of throwing an `Exception`. 
## [1.7.1] - 2024-03-11 ### Fixed * A PHP error that happened when the loader returns `null` for the initial request in the `Http::crawl()` step. ## [1.7.0] - 2024-03-04 ### Added * Allow getting the whole decoded JSON as array with the new `Json::all()` and also allow to get the whole decoded JSON, when using `Json::get()`, inside a mapping using either empty string or `*` as target. Example: `Json::get(['all' => '*'])`. `*` only works, when there is no key `*` in the decoded data. ### Fixed * Make it work with responses loaded by a headless browser. If decoding the input string fails, it now checks if it could be HTML. If that's the case, it extracts the text content of the `<body>` and tries to decode this instead. ## [1.6.2] - 2024-02-26 ### Fixed * When using `HttpLoader::cacheOnlyWhereUrl()` and a request was redirected (maybe even multiple times), previously all URLs in the chain had to match the filter rule. As this isn't really practicable, now only one of the URLs has to match the rule. ## [1.6.1] - 2024-02-16 ### Changed * Make method `HttpLoader::addToCache()` public, so steps can update a cached response with an extended version. ## [1.6.0] - 2024-02-13 ### Added * Enable dot notation in `Step::addToResult()`, so you can get data from nested output, like: `$step->addToResult(['url' => 'response.url', 'status' => 'response.status', 'foo' => 'bar'])`. * When a step adds output properties to the result, and the output contains objects, it tries to serialize those objects to arrays, by calling `__serialize()`. If you want an object to be serialized differently for that purpose, you can define a `toArrayForAddToResult()` method in that class. When that method exists, it's preferred to the `__serialize()` method.
* Implemented above-mentioned `toArrayForAddToResult()` method in the `RespondedRequest` class, so on every step that somehow yields a `RespondedRequest` object, you can use the keys `url`, `uri`, `status`, `headers` and `body` with the `addToResult()` method. Previously this only worked for `Http` steps, because it defines output key aliases (`HttpBase::outputKeyAliases()`). Now, in combination with the ability to use dot notation when adding data to the result, if your custom step returns nested output like `['response' => RespondedRequest, 'foo' => 'bar']`, you can add response data to the result like this `$step->addToResult(['url' => 'response.url', 'body' => 'response.body'])`. ### Fixed * Improvement regarding the timing when a store (`Store` class instance) is called by the crawler with a final crawling result. When a crawling step initiates a crawling result (so, `addToResult()` was called on the step instance), the crawler has to wait for all child outputs (resulting from one step-input) until it calls the store, because the child outputs can all add data to the same final result object. But previously this was not only the case for all child outputs starting from a step where `addToResult()` was called, but all children of one initial crawler input. So with this change, in a lot of cases, the store will earlier be called with finished `Result` objects and memory usage will be lowered. ## [1.5.3] - 2024-02-07 ### Fixed * Merge `HttpBaseLoader` back to `HttpLoader`. It's probably not a good idea to have multiple loaders. At least not multiple loaders just for HTTP. It should be enough to publicly expose the `HeadlessBrowserLoaderHelper` via `HttpLoader::browserHelper()` for the extension steps. But keep the `HttpBase` step, to share the general HTTP functionality implemented there. ## [1.5.2] - 2024-02-07 ### Fixed * Issue in `GetUrlsFromSitemap` (`Sitemap::getUrlsFromSitemap()`) step when XML content has no line breaks. 
## [1.5.1] - 2024-02-06 ### Fixed * For being more flexible to build a separate headless browser loader (in an extension package) extract the most basic HTTP loader functionality to a new `HttpBaseLoader` and important functionality for the headless browser loader to a new `HeadlessBrowserLoaderHelper`. Further, also share functionality from the `Http` steps via a new abstract `HttpBase` step. It's considered a fix, because there's no new functionality, just refactoring existing code for better extendability. ## [1.5.0] - 2024-01-29 ### Added * The `DomQuery` class (parent of `CssSelector` (`Dom::cssSelector`) and `XPathQuery` (`Dom::xPath`)) has a new method `formattedText()` that uses the new crwlr/html-2-text package to convert the HTML to formatted plain text. You can also provide a customized instance of the `Html2Text` class to the `formattedText()` method. ### Fixed * The `Http::crawl()` step won't yield a page again if a newly found URL responds with a redirect to a previously loaded URL. ## [1.4.0] - 2024-01-14 ### Added * The `QueryParamsPaginator` can now also increase and decrease non first level query param values like `foo[bar][baz]=5` using dot notation: `QueryParamsPaginator::paramsInUrl()->increaseUsingDotNotation('foo.bar.baz', 5)`. ## [1.3.5] - 2023-12-20 ### Fixed * The `FileCache` can now also read uncompressed cache files when compression is activated. ## [1.3.4] - 2023-12-19 ### Fixed * Reset paginator state after finishing paginating for one base input, to enable paginating multiple listings of the same structure. ## [1.3.3] - 2023-12-01 ### Fixed * Add forgotten getter method to get the DOM query that is attached to an `InvalidDomQueryException` instance. ## [1.3.2] - 2023-12-01 ### Fixed * When creating a `CssSelector` or `XPathQuery` instance with invalid selector/query syntax, an `InvalidDomQueryException` is now immediately thrown. 
This change is considered to be not only non-breaking, but actually a fix, because the `CssSelector` would otherwise throw an exception later when the `apply()` method is called. The `XPathQuery` would silently return no result without notifying you of the invalid query and generate a PHP warning. ## [1.3.1] - 2023-11-30 ### Fixed * Support usage with the new Symfony major version v7. ## [1.3.0] - 2023-10-28 ### Added * New methods `HttpLoader::useProxy()` and `HttpLoader::useRotatingProxies([...])` to define proxies that the loader shall use. They can be used with a guzzle HTTP client instance (default) and when the loader uses the headless Chrome browser. Using them when providing some other PSR-18 implementation will throw an exception. * New `QueryParamsPaginator` to paginate by increasing and/or decreasing one or multiple query params, either in the URL or in the body of requests. Can be created via static method `Crwlr\Crawler\Steps\Loading\Http\Paginator::queryParams()`. * New method `stopWhen` in the new `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator` class (for more info see the deprecation below). You can pass implementations of the new `StopRule` interface or custom closures to that method and then, every time the Paginator receives a loaded response to process, those stop rules are called with the response. If any of the conditions of the stop rules is met, the Paginator stops paginating. Of course also added a few stop rules to use with that new method: `IsEmptyInHtml`, `IsEmptyInJson`, `IsEmptyInXml` and `IsEmptyResponse`, also available via static methods: `PaginatorStopRules::isEmptyInHtml()`, `PaginatorStopRules::isEmptyInJson()`, `PaginatorStopRules::isEmptyInXml()` and `PaginatorStopRules::isEmptyResponse()`. ### Deprecated * Deprecated the `Crwlr\Crawler\Steps\Loading\Http\PaginatorInterface` and the `Crwlr\Crawler\Steps\Loading\Http\Paginators\AbstractPaginator`. 
Instead, added a new version of the `AbstractPaginator` as `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator` that can be used. Usually there shouldn't be a problem switching from the old to the new version. If you want to make your custom paginator implementation ready for v2 of the library, extend the new `AbstractPaginator` class, implement your own `getNextRequest` method (new requirement, with a default implementation in the abstract class, which will be removed in v2) and check if properties and methods of your existing class don't collide with the new properties and methods in the abstract class. ### Fixed * The `HttpLoader::load()` implementation won't throw any exception, because it shouldn't kill a crawler run. When you want any loading error to end the whole crawler execution `HttpLoader::loadOrFail()` should be used. Also adapted the phpdoc in the `LoaderInterface`. ## [1.2.2] - 2023-09-19 ### Fixed * Fix in `HttpCrawl` (`Http::crawl()`) step: when a page contains a broken link, that can't be resolved and throws an `Exception` from the URL library, ignore the link and log a warning message. * Minor fix for merging HTTP headers when an `Http` step gets both, statically defined headers and headers to use from array input. ## [1.2.1] - 2023-08-21 ### Fixed * When a URL redirects, the `trackRequestEndFor()` method of the `HttpLoader`'s `Throttler` instance is called only once at the end and with the original request URL. ## [1.2.0] - 2023-08-18 ### Added * New `onCacheHit` hook in the `Loader` class (in addition to `beforeLoad`, `onSuccess`, `onError` and `afterLoad`) that is called in the `HttpLoader` class when a response for a request was found in the cache. ### Deprecated * Moved the `Microseconds` value object class to the crwlr/utils package, as it is a very useful and universal tool. The class in this package still exists, but just extends the class from the utils package and will be removed in v2. 
So, if you're using this class, please change to use the version from the utils package. ## [1.1.6] - 2023-07-20 ### Fixed * Throttling now also works when using the headless browser. ## [1.1.5] - 2023-07-14 ### Fixed * The `Http::crawl()` step, as well as the `Html::getLink()` and `Html::getLinks()` steps now ignore links, when the `href` attribute starts with `mailto:`, `tel:` or `javascript:`. For the crawl step it obviously makes no sense, but it's also considered a bugfix for the getLink(s) steps, because they are meant to deliver absolute HTTP URLs. If you want to get the values of such links, use the HTML data extraction step. ## [1.1.4] - 2023-07-14 ### Fixed * The `Http::crawl()` step now also works with sitemaps as input URL, where the `<urlset>` tag contains attributes that would cause the symfony DomCrawler to not find any elements. ## [1.1.3] - 2023-06-29 ### Fixed * Improved `Json` step: if the target of the "each" (like `Json::each('target', [...])`) does not exist in the input JSON data, the step yields nothing and logs a warning. ## [1.1.2] - 2023-05-28 ### Fixed * Using the `only()` method of the `MetaData` (`Html::metaData()`) step class, the `title` property was always contained in the output, even if not listed in the `only` properties. This is fixed now. ## [1.1.1] - 2023-05-28 ### Fixed * There was an issue when adding multiple associative arrays with the same key to a `Result` object: let's say you're having a step producing array output like: `['bar' => 'something', 'baz' => 'something else']` and it (the whole array) shall be added to the result property `foo`. When the step produced multiple such array outputs, that led to a result like `['bar' => '...', 'baz' => '...', ['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...']]`. Now it's fixed to result in `[['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...']]`.
## [1.1.0] - 2023-05-21 ### Added * `Http` steps can now receive body and headers from input data (instead of statically defining them via argument like `Http::method(headers: ...)`) using the new methods `useInputKeyAsBody()` and `useInputKeyAsHeader(<key>, <asHeader>)` or `useInputKeyAsHeaders()`. Further, when invoked with associative array input data, the step will by default use the value from `url` or `uri` for the request URL. If the input array contains the URL in a key with a different name, you can use the new `useInputKeyAsUrl()` method. That was basically already possible with the existing `useInputKey()` method, because the URL is the main input argument for the step. But if you want to use it in combination with the other new `useInputKeyAsXyz()` methods, you have to use `useInputKeyAsUrl()`, because using `useInputKey()` would invoke the whole step with that key only. * `Crawler::runAndDump()` as a simple way to just run a crawler and dump all results, each as an array. * `addToResult()` now also works with serializable objects. * If you know certain keys that the output of a step will contain, you can now also define aliases for those keys, to be used with `addToResult()`. The output of an `Http` step (`RespondedRequest`) contains the keys `requestUri` and `effectiveUri`. The aliases `url` and `uri` refer to `effectiveUri`, so `addToResult(['url'])` will add the `effectiveUri` as `url` to the result object. * The `GetLink` (`Html::getLink()`) and `GetLinks` (`Html::getLinks()`) steps, as well as the abstract `DomQuery` (parent of `CssSelector` (/`Dom::cssSelector`) and `XPathQuery` (/`Dom::xPath`)) now have a method `withoutFragment()` to get links respectively URLs without their fragment part. * The `HttpCrawl` step (`Http::crawl()`) has a new method `useCanonicalLinks()`. If you call it, the step will not yield responses if its canonical link URL was already yielded.
And if it discovers a link, and some document pointing to that URL via canonical link was already loaded, it treats it as if it was already loaded. Further this feature also sets the canonical link URL as the `effectiveUri` of the response. * All filters can now be negated by calling the `negate()` method, so the `evaluate()` method will return the opposite bool value when called. The `negate()` method returns an instance of `NegatedFilter` that wraps the original filter. * New method `cacheOnlyWhereUrl()` in the `HttpLoader` class, that takes an instance of the `FilterInterface` as argument. If you define one or multiple filters using this method, the loader will cache only responses for URLs that match all the filters. ### Fixed * The `HttpCrawl` step (`Http::crawl()`) by default now removes the fragment part of URLs to not load the same page multiple times, because in almost any case, servers won't respond with different content based on the fragment. That's why this change is considered non-breaking. For the rare cases when servers respond with different content based on the fragment, you can call the new `keepUrlFragment()` method of the step. * Although the `HttpCrawl` step (`Http::crawl()`) already respected the limit of outputs defined via the `maxOutputs()` method, it actually didn't stop loading pages. The limit had no effect on loading, only on passing on outputs (responses) to the next step. This is fixed in this version. * A so-called byte order mark at the beginning of a file (/string) can cause issues. So just remove it, when a step's input string starts with a UTF-8 BOM. * There seems to be an issue in guzzle when it gets a PSR-7 request object with a header with multiple string values (as array, like: `['accept-encoding' => ['gzip', 'deflate', 'br']]`). When testing it happened that it only sent the last part (in this case `br`). 
Therefore, the `HttpLoader` now prepares headers before sending (in this case to: `['accept-encoding' => ['gzip, deflate, br']]`). * You can now also use the output key aliases when filtering step outputs. You can even use keys that are only present in the serialized version of an output object. ## [1.0.2] - 2023-03-20 ### Fixed * JSON step: another fix for JSON strings having keys without quotes with empty string value. ## [1.0.1] - 2023-03-17 ### Fixed * JSON step: improve attempt to fix JSON string having keys without quotes. ## [1.0.0] - 2023-02-08 ### Added * New method `Step::refineOutput()` to manually refine step output values. It takes either a `Closure` or an instance of the new `RefinerInterface` as argument. If the step produces array output, you can provide a key from the array output, to refine, as first argument and the refiner as second argument. You can call the method multiple times and all the refiners will be applied to the outputs in the order you add them. If you want to refine multiple output array keys with a `Closure`, you can skip providing a key and the `Closure` will receive the full output array for refinement. As mentioned you can provide an instance of the `RefinerInterface`. There are already a few implementations: `StringRefiner::afterFirst()`, `StringRefiner::afterLast()`, `StringRefiner::beforeFirst()`, `StringRefiner::beforeLast()`, `StringRefiner::betweenFirst()`, `StringRefiner::betweenLast()` and `StringRefiner::replace()`. * New method `Step::excludeFromGroupOutput()` to exclude a normal steps output from the combined output of a group that it's part of. * New method `HttpLoader::setMaxRedirects()` to customize the limit of redirects to follow. Works only when using the HTTP client. * New filters to filter by string length, with the same options as the comparison filters (equal, not equal, greater than,...). * New `Filter::custom()` that you can use with a Closure, so you're not limited to the available filters only. 
* New method `DomQuery::link()` as a shortcut for `DomQuery::attribute('href')->toAbsoluteUrl()`. * New static method `HttpCrawler::make()` returning an instance of the new class `AnonymousHttpCrawlerBuilder`. This makes it possible to create your own Crawler instance with a one-liner like: `HttpCrawler::make()->withBotUserAgent('MyCrawler')`. There's also a `withUserAgent()` method to create an instance with a normal (non bot) user agent. ### Changed * __BREAKING__: The `FileCache` now also respects the `ttl` (time to live) argument and by default it is one hour (3600 seconds). If you're using the cache and expect the items to live (basically) forever, please provide a high enough value for the default time to live. When you try to get a cache item that is already expired, it (the file) is immediately deleted. * __BREAKING__: The `TooManyRequestsHandler` (and with that also the constructor argument in the `HttpLoader`) was renamed to `RetryErrorResponseHandler`. It now reacts the same to 503 (Service Unavailable) responses as to the 429 (Too Many Requests) responses. If you're actively passing your own instance to the `HttpLoader`, you need to update it. * You can now have multiple different loaders in a `Crawler`. To use this, return an array containing your loaders from the protected `Crawler::loader()` method with keys to name them. You can then selectively use them by calling the `Step::useLoader()` method on a loading step with the key of the loader it should use. ### Removed * __BREAKING__: The loop feature. The only real world use case should be paginating listings and this should be solved with the Paginator feature. * __BREAKING__: `Step::dontCascade()` and `Step::cascades()` because with the change in v0.7, that groups can only produce combined output, there should be no use case for this anymore. If you want to exclude one steps output from the combined group output, you can use the new `Step::excludeFromGroupOutput()` method.
## [0.7.0] - 2023-01-13 ### Added * New functionality to paginate: There is the new `Paginate` child class of the `Http` step class (easy access via `Http::get()->paginate()`). It takes an instance of the `PaginatorInterface` and uses it to iterate through pagination links. There is one implementation of that interface, the `SimpleWebsitePaginator`. The `Http::get()->paginate()` method uses it by default, when called just with a CSS selector to get pagination links. Paginators receive all loaded pages and implement the logic to find pagination links. The paginator class is also called before sending a request, with the request object that is about to be sent as an argument (`prepareRequest()`). This way, it should even be doable to implement more complex pagination functionality. For example when pagination is built using POST request with query strings in the request body. * New methods `stopOnErrorResponse()` and `yieldErrorResponses()` that can be used with `Http` steps. By calling `stopOnErrorResponse()` the step will throw a `LoadingException` when a response has a 4xx or 5xx status code. By calling `yieldErrorResponses()`, even error responses will be yielded and passed on to the next steps (this was default behaviour until this version. See the breaking change below). * The body of HTTP responses with a `Content-Type` header containing `application/x-gzip` are automatically decoded when `Http::getBodyString()` is used. Therefore, added `ext-zlib` to suggested in `composer.json`. * New methods `addToResult()` and `addLaterToResult()`. `addToResult()` is a single replacement for `setResultKey()` and `addKeysToResult()` (they are removed, see `Changed` below) that can be used for array and non array output. `addLaterToResult()` is a new method that does not create a Result object immediately, but instead adds the output of the current step to all the Results that will later be created originating from the current output.
* New methods `outputKey()` and `keepInputData()` that can be used with any step. Using the `outputKey()` method, the step will convert non array output to an array and use the key provided as an argument to this method as array key for the output value. The `keepInputData()` method allows you to forward data from the step's input to the output. If the input is non array you can define a key using the method's argument. This is useful e.g. if you're having data in the initial inputs that you also want to add to the final crawling results. * New method `createsResult()` that can be used with any step, so you can differentiate if a step creates a Result object, or just keeps data to add to results later (new `addLaterToResult()` method). But primarily relevant for library internal use. * The `FileCache` class can compress the cache data now to save disk space. Use the `useCompression()` method to do so. * New method `retryCachedErrorResponses()` in `HttpLoader`. When called, the loader will only use successful responses (status code < 400) from the cache and therefore retry already cached error responses. * New method `writeOnlyCache()` in `HttpLoader` to only write to, but don't read from the response cache. Can be used to renew cached responses. * `Filter::urlPathMatches()` to filter URL paths using a regex. * Option to provide a chrome executable name to the `chrome-php/chrome` library via `HttpLoader::setChromeExecutable()`. ### Changed * __BREAKING__: Group steps can now only produce combined outputs, as previously done when `combineToSingleOutput()` method was called. The method is removed. * __BREAKING__: `setResultKey()` and `addKeysToResult()` are removed. Calls to those methods can both be replaced with calls to the new `addToResult()` method. * __BREAKING__: `getResultKey()` is also removed with `setResultKey()`. It's removed without replacement, as it doesn't really make sense any longer. 
* __BREAKING__: Error responses (4xx as well as 5xx), by default, won't produce any step outputs any longer. If you want to receive error responses, use the new `yieldErrorResponses()` method. * __BREAKING__: Removed the `httpClient()` method in the `HttpCrawler` class. If you want to provide your own HTTP client, implement a custom `loader` method passing your client to the `HttpLoader` instead. * __Deprecated__ the loop feature (class `Loop` and `Crawler::loop()` method). Probably the only use case is iterating over paginated list pages, which can be done using the new Paginator functionality. It will be removed in v1.0. * In case of a 429 (Too Many Requests) response, the `HttpLoader` now automatically waits and retries. By default, it retries twice and waits 10 seconds for the first retry and a minute for the second one. In case the response also contains a `Retry-After` header with a value in seconds, it complies to that. Exception: by default it waits at max `60` seconds (you can set your own limit if you want), if the `Retry-After` value is higher, it will stop crawling. If all the retries also receive a `429` it also throws an Exception. * Removed logger from `Throttler` as it doesn't log anything. * Fail silently when `robots.txt` can't be parsed. * Default timeout configuration for the default guzzle HTTP client: `connect_timeout` is `10` seconds and `timeout` is `60` seconds. * The `validateAndSanitize...()` methods in the abstract `Step` class, when called with an array with one single element, automatically try to use that array element as input value. * With the `Html` and `Xml` data extraction steps you can now add layers to the data that is being extracted, by just adding further `Html`/`Xml` data extraction steps as values in the mapping array that you pass as argument to the `extract()` method. * The base `Http` step can now also be called with an array of URLs as a single input. Crawl and Paginate steps still require a single URL input. 
### Fixed * The `CookieJar` now also works with `localhost` or other hosts without a registered domain name. * Improve the `Sitemap::getUrlsFromSitemap()` step to also work when the `<urlset>` tag contains attributes that would cause the symfony DomCrawler to not find any elements. * Fixed possibility of infinite redirects in `HttpLoader` by adding a redirects limit of 10. ## [0.6.0] - 2022-10-03 ### Added * New step `Http::crawl()` (class `HttpCrawl` extending the normal `Http` step class) for conventional crawling. It loads all pages of a website (same host or domain) by following links. There's also a lot of options like depth, filtering by paths, and so on. * New steps `Sitemap::getSitemapsFromRobotsTxt()` (`GetSitemapsFromRobotsTxt`) and `Sitemap::getUrlsFromSitemap()` (`GetUrlsFromSitemap`) to get sitemap (URLs) from a robots.txt file and to get all the URLs from those sitemaps. * New step `Html::metaData()` to get data from meta tags (and title tag) in HTML documents. * New step `Html::schemaOrg()` (`SchemaOrg`) to get schema.org structured data in JSON-LD format from HTML documents. * The abstract `DomQuery` class (parent of the `CssSelector` and `XPathQuery` classes) now has some methods to narrow the selected matches further: `first()`, `last()`, `nth(n)`, `even()`, `odd()`. ### Changed * __BREAKING__: Removed `PoliteHttpLoader` and traits `WaitPolitely` and `CheckRobotsTxt`. Converted the traits to classes `Throttler` and `RobotsTxtHandler` which are dependencies of the `HttpLoader`. The `HttpLoader` internally gets default instances of those classes. The `RobotsTxtHandler` will respect robots.txt rules by default if you use a `BotUserAgent` and it won't if you use a normal `UserAgent`. You can access the loader's `RobotsTxtHandler` via `HttpLoader::robotsTxt()`. You can pass your own instance of the `Throttler` to the loader and also access it via `HttpLoader::throttle()` to change settings.
### Fixed * Getting absolute links via the `GetLink` and `GetLinks` steps and the `toAbsoluteUrl()` method of the `CssSelector` and `XPathQuery` classes, now also looks for `<base>` tags in HTML when resolving the URLs. * The `SimpleCsvFileStore` can now also save results with nested data (but only second level). It just concatenates the values separated with a ` | `. ## [0.5.0] - 2022-09-03 ### Added * You can now call the new `useHeadlessBrowser` method on the `HttpLoader` class to use a headless Chrome browser to load pages. This is enough to get HTML after executing javascript in the browser. For more sophisticated tasks a separate Loader and/or Steps should better be created. * With the `maxOutputs()` method of the abstract `Step` class you can now limit how many outputs a certain step should yield at max. That's for example helpful during development, when you want to run the crawler only with a small subset of the data/requests it will actually have to process when you eventually remove the limits. When a step has reached its limit, it won't even call the `invoke()` method any longer until the step is reset after a run. * With the new `outputHook()` method of the abstract `Crawler` class you can set a closure that'll receive all the outputs from all the steps. Should be only for debugging reasons. * The `extract()` method of the `Html` and `Xml` (children of `Dom`) steps now also works with a single selector instead of an array with a mapping. Sometimes you'll want to just get a simple string output e.g. for a next step, instead of an array with mapped extracted data. * In addition to `uniqueOutputs()` there is now also `uniqueInputs()`. It works exactly the same as `uniqueOutputs()`, filtering duplicate input values instead. Optionally also by a key when expected input is an array or an object. * In order to be able to also get absolute links when using the `extract()` method of Dom steps, the abstract `DomQuery` class now has a method `toAbsoluteUrl()`.
The Dom step will automatically provide the `DomQuery` instance with the base url, presumed that the input was an instance of the `RespondedRequest` class and resolve the selected value against that base url. ### Changed * Remove some not so important log messages. * Improve behavior of group step's `combineToSingleOutput()`. When steps yield multiple outputs, don't combine all yielded outputs to one. Instead, combine the first output from the first step with the first output from the second step, and so on. * When results are not explicitly composed, but the outputs of the last step are arrays with string keys, it sets those keys on the Result object instead of setting a key `unnamed` with the whole array as value. ### Fixed * The static methods `Html::getLink()` and `Html::getLinks()` now also work without argument, like the `GetLink` and `GetLinks` classes. * When a `DomQuery` (CSS selector or XPath query) doesn't match anything, its `apply()` method now returns `null` (instead of an empty string). When the `Html(/Xml)::extract()` method is used with a single, not matching selector/query, nothing is yielded. When it's used with an array with a mapping, it yields an array with null values. If the selector for one of the methods `Html(/Xml)::each()`, `Html(/Xml)::first()` or `Html(/Xml)::last()` doesn't match anything, that's not causing an error any longer, it just won't yield anything. * Removed the (unnecessary) second argument from the `Loop::withInput()` method because when `keepLoopingWithoutOutput()` is called and `withInput()` is called after that call, it resets the behavior. * Issue when date format for expires date in cookie doesn't have dashes in `d-M-Y` (so `d M Y`). ## [0.4.1] - 2022-05-10 ### Fixed * The `Json` step now also works with Http responses as input. ## [0.4.0] - 2022-05-06 ### Added * The `BaseStep` class now has `where()` and `orWhere()` methods to filter step outputs. You can set multiple filters that will be applied to all outputs. 
When setting a filter using `orWhere` it's linked to the previously added Filter with "OR". Outputs not matching one of the filters are not yielded. The available filters can be accessed through static methods on the new `Filter` class. Currently available filters are comparison filters (equal, greater/less than,...), a few string filters (contains, starts/ends with) and url filters (scheme, domain, host,...). * The `GetLink` and `GetLinks` steps now have methods `onSameDomain()`, `notOnSameDomain()`, `onDomain()`, `onSameHost()`, `notOnSameHost()`, `onHost()` to restrict which links to find. * Automatically add the crawler's logger to the `Store` so you can also log messages from there. This can be breaking as the `StoreInterface` now also requires the `addLogger` method. The new abstract `Store` class already implements it, so you can just extend it. ### Changed * The `Csv` step can now also be used without defining a column mapping. In that case it will use the values from the first line (so this makes sense when there are column headlines) as output array keys. ## [0.3.0] - 2022-04-27 ### Added * By calling `monitorMemoryUsage()` you can tell the Crawler to add log messages with the current memory usage after every step invocation. You can also set a limit in bytes when to start monitoring and below the limit it won't log memory usage. ### Fixed * Previously the __use of Generators__ actually didn't make a lot of sense, because the outputs of one step were only iterated and passed on to the next step, after the current step was invoked with all its inputs. That makes steps with a lot of inputs bottlenecks and causes bigger memory consumption. So, changed the crawler to immediately pass on outputs of one step to the next step if there is one. ## [0.2.0] - 2022-04-25 ### Added * `uniqueOutputs()` method to Steps to get only unique output values. If outputs are array or object, you can provide a key that will be used as identifier to check for uniqueness.
Otherwise, the arrays or objects will be serialized for comparison which will probably be slower. * `runAndTraverse()` method to Crawler, so you don't need to manually traverse the Generator, if you don't need the results where you're calling the crawler. * Implement the behaviour for when a `Group` step should add something to the Result using `setResultKey()` or `addKeysToResult()`, which was still missing. For groups this will only work when using `combineToSingleOutput`. ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to this Package That you're reading this must mean you consider contributing to this package. So first off: Awesome! 👍🤘 ## Bugs In case you encounter any bugs please [file an issue](https://github.com/crwlrsoft/crawler/issues/new). Describe the issue as well as you can and provide an example to reproduce it. Maybe you're not 100 percent sure whether what you've discovered is a bug or the intended behavior. You can still file an issue and tell us which results you'd expect. If you know how to fix the issue you're welcome to send a pull request. 💪 ## New Features If you have ideas for new features you can tell us about it on [Twitter](https://twitter.com/crwlrsoft) or via [crwlr.software](https://www.crwlr.software/contact) or just send a pull request. Please keep in mind that there is no guarantee that your feature will be merged. ## Conventions ### Coding Style This package follows the [PSR-12](https://www.php-fig.org/psr/psr-12/) coding standard. You can run PHP CS Fixer via `composer cs` for a dry run or `composer cs-fix` to automatically fix code style issues. ### Code quality tools When you're making changes to this package please always run tests and linting. Commands: `composer test` `composer test-integration` `composer cs` `composer stan` Ideally you add the pre-commit git hook that is shipped with this repo that will run tests and linting. 
Add it to your local clone by running: `composer add-git-hooks` The integration tests start a simple PHP web server for the testing purpose on port 8000. If you have anything else running on that port, the integration tests won't work. Also, please don't forget to add new test cases if necessary. ### Documentation For any code change that changes/adds something for users of the package, please don't forget to add an entry to the `CHANGELOG.md` file. ## Appreciation When your pull request is merged I will show some love and tweet about it. Also, if you meet me in person I will be glad to buy you a beer. ================================================ FILE: LICENSE ================================================ Copyright (c) 2026 Christian Olear Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

crwlr.software logo

# Library for Rapid (Web) Crawler and Scraper Development This library provides kind of a framework and a lot of ready to use, so-called __steps__, that you can use as building blocks, to build your own crawlers and scrapers with. To give you an overview, here's a list of things that it helps you with: * [Crawler __Politeness__](https://www.crwlr.software/packages/crawler/the-crawler/politeness) 😇 (respecting robots.txt, throttling,...) * Load URLs using * [a __(PSR-18) HTTP client__](https://www.crwlr.software/packages/crawler/the-crawler/loaders) (default is of course Guzzle) * or a [__headless browser__](https://www.crwlr.software/packages/crawler/the-crawler/loaders#using-a-headless-browser) (chrome) to get source after Javascript execution * [Get __absolute links__ from HTML documents](https://www.crwlr.software/packages/crawler/included-steps/html#html-get-link) 🔗 * [Get __sitemaps__ from robots.txt and get all URLs from those sitemaps](https://www.crwlr.software/packages/crawler/included-steps/sitemap) * [__Crawl__ (load) all pages of a website](https://www.crwlr.software/packages/crawler/included-steps/http#crawling) 🕷 * [Use __cookies__ (or don't)](https://www.crwlr.software/packages/crawler/the-crawler/loaders#http-loader) 🍪 * [Use any __HTTP methods__ (GET, POST,...) 
and send any headers or body](https://www.crwlr.software/packages/crawler/included-steps/http#http-requests) * [Easily iterate over __paginated__ list pages](https://www.crwlr.software/packages/crawler/included-steps/http#paginating) 🔁 * Extract data from: * [__HTML__](https://www.crwlr.software/packages/crawler/included-steps/html#extracting-data) and also [__XML__](https://www.crwlr.software/packages/crawler/included-steps/xml) (using CSS selectors or XPath queries) * [__JSON__](https://www.crwlr.software/packages/crawler/included-steps/json) (using dot notation) * [__CSV__](https://www.crwlr.software/packages/crawler/included-steps/csv) (map columns) * [Extract __schema.org__ structured data](https://www.crwlr.software/packages/crawler/included-steps/html#schema-org) in __JSON-LD__ format from HTML documents * [Keep memory usage low](https://www.crwlr.software/packages/crawler/crawling-procedure#memory-usage) by using PHP __Generators__ 💪 * [__Cache__ HTTP responses](https://www.crwlr.software/packages/crawler/response-cache) during development, so you don't have to load pages again and again after every code change * [Get __logs__](https://www.crwlr.software/packages/crawler/the-crawler#loggers) about what your crawler is doing (accepts any PSR-3 LoggerInterface) * And a lot more... ## Documentation You can find the documentation at [crwlr.software](https://www.crwlr.software/packages/crawler/getting-started). ## Contributing If you consider contributing something to this package, read the [contribution guide (CONTRIBUTING.md)](CONTRIBUTING.md). ================================================ FILE: bin/add-git-hooks ================================================ #!/usr/bin/env php 0) { $lastLine = array_pop($output); if (trim($lastLine) !== '') { printLine($lastLine); return; } } } function printLine(string $string) { echo $string . PHP_EOL; } function printLines(array $lines) { echo implode(PHP_EOL, $lines) . 
PHP_EOL; } function printBlankLine() { printLine(''); } function red(string $string): string { return color('0;31', $string); } function green(string $string): string { return color('0;32', $string); } function blue(string $string): string { return color('0;34', $string); } function color(string $colorCode, string $string): string { return "\e[" . $colorCode . "m" . $string . "\e[0m"; } ================================================ FILE: phpstan.neon ================================================ parameters: level: 8 paths: - src - tests excludePaths: analyse: - tests/_Integration/_Server reportUnmatchedIgnoredErrors: false ignoreErrors: - "#^Call to an undefined method Pest\\\\PendingCalls\\\\TestCall\\|Pest\\\\Support\\\\HigherOrderTapProxy\\:\\:(with|throws)\\(\\).$#" - "#^Access to an undefined property Spatie\\\\Invade\\\\Invader#" - "#^Call to an undefined method Spatie\\\\Invade\\\\Invader#" - "#^Call to protected method [a-zA-Z]{5,30}\\(\\) of class PHPUnit\\\\Framework\\\\TestCase.#" - "#^(?:Parameter|Method) .+ has invalid (return )?type Dom\\\\.+\\.#" - "#^Call to .+ on an unknown class Dom\\\\.+\\.#" - "#^Property .+ has unknown class Dom\\\\.+ as its type\\.#" - "#^Class Dom\\\\.+ not found.#" - "#^Access to property .+ on an unknown class Dom\\\\.+\\.#" - "#^PHPDoc tag .+ contains unknown class Dom\\\\.+\\.#" - "#^Call to an undefined (static )?method Dom\\\\.+::.+\\(\\)\\.#" - "#^Access to an undefined property Dom\\\\.+::\\$.+\\.#" - "#^Function .+ has invalid return type Dom\\\\.+\\.#" - "#^(?:Used )?(?:C|c)onstant DOM\\\\.+ not found\\.#" - "#^Instantiated class Dom\\\\.+ not found.#" ================================================ FILE: phpunit.xml ================================================ ./tests ./app ./src ================================================ FILE: src/Cache/CacheItem.php ================================================ value) && method_exists($this->value, 'cacheKey')) { $this->key = $this->value->cacheKey(); } else { 
$this->key = md5(serialize($this->value)); } } else { $this->key = $key; } } public function key(): string { return $this->key; } public function value(): mixed { return $this->value; } /** * @throws Exception */ public function isExpired(): bool { $ttl = $this->ttl instanceof DateInterval ? $this->ttl : new DateInterval('PT' . $this->ttl . 'S'); return time() > $this->createdAt->add($ttl)->getTimestamp(); } /** * Get a new instance with same data but a different time to live. */ public function withTtl(DateInterval|int $ttl): CacheItem { return new CacheItem($this->value, $this->key, $ttl, $this->createdAt); } /** * @return mixed[] */ public function __serialize(): array { return [ 'value' => $this->value, 'key' => $this->key, 'ttl' => $this->ttl, 'createdAt' => $this->createdAt, ]; } /** * @param mixed[] $data */ public function __unserialize(array $data): void { $this->value = $data['value']; $this->key = $data['key']; $this->ttl = $data['ttl']; $this->createdAt = $data['createdAt']; } } ================================================ FILE: src/Cache/Exceptions/MissingZlibExtensionException.php ================================================ useCompression = true; return $this; } public function ttl(DateInterval|int $ttl): static { $this->ttl = $ttl; return $this; } /** * @throws MissingZlibExtensionException|ReadingCacheFailedException|Exception|InvalidArgumentException */ public function has(string $key): bool { if (file_exists($this->basePath . '/' . $key)) { $cacheItem = $this->getCacheItem($key); if (!$cacheItem->isExpired()) { return true; } $this->delete($key); } return false; } /** * @throws ReadingCacheFailedException|MissingZlibExtensionException|Exception|InvalidArgumentException */ public function get(string $key, mixed $default = null): mixed { if (file_exists($this->basePath . '/' . 
$key)) { $cacheItem = $this->getCacheItem($key); if (!$cacheItem->isExpired()) { return $cacheItem->value(); } $this->delete($key); } return $default; } /** * @throws MissingZlibExtensionException */ public function set(string $key, mixed $value, DateInterval|int|null $ttl = null): bool { if (!$value instanceof CacheItem) { $value = new CacheItem($value, $key, $ttl ?? $this->ttl); } elseif ($value->key() !== $key) { $value = new CacheItem($value->value(), $key, $ttl ?? $value->ttl); } return $this->saveCacheItem($value); } public function delete(string $key): bool { return unlink($this->basePath . '/' . $key); } public function prolong(string $key, DateInterval|int $ttl): bool { try { $item = $this->getCacheItem($key); return $this->saveCacheItem($item->withTtl($ttl)); } catch (Throwable) { return false; } } /** * @throws InvalidArgumentException */ public function clear(): bool { $allFiles = scandir($this->basePath); if (is_array($allFiles)) { foreach ($allFiles as $file) { if ($file !== '.' && $file !== '..' && $file !== '.gitkeep' && !$this->delete($file)) { return false; } } } return true; } public function prolongAll(DateInterval|int $ttl): bool { $allFiles = scandir($this->basePath); if (is_array($allFiles)) { foreach ($allFiles as $file) { if ($file !== '.' && $file !== '..' 
&& $file !== '.gitkeep' && !$this->prolong($file, $ttl)) { return false; } } } return true; } /** * @return iterable * @throws MissingZlibExtensionException|ReadingCacheFailedException|InvalidArgumentException */ public function getMultiple(iterable $keys, mixed $default = null): iterable { $items = []; foreach ($keys as $key) { $items[$key] = $this->get($key, $default); } return $items; } /** * @param iterable $values * @throws MissingZlibExtensionException */ public function setMultiple(iterable $values, DateInterval|int|null $ttl = null): bool { foreach ($values as $key => $value) { if (!$this->set($key, $value, $ttl)) { return false; } } return true; } public function deleteMultiple(iterable $keys): bool { foreach ($keys as $key) { if (!$this->delete($key)) { return false; } } return true; } /** * @throws MissingZlibExtensionException * @throws ReadingCacheFailedException */ protected function getCacheItem(string $key): CacheItem { $fileContent = $this->getFileContents($key); if ($this->useCompression) { $fileContent = $this->decode($fileContent); } $unserialized = $this->unserialize($fileContent); if (!$unserialized instanceof CacheItem) { $unserialized = new CacheItem($unserialized, $key); } return $unserialized; } /** * @throws MissingZlibExtensionException */ protected function saveCacheItem(CacheItem $item): bool { $content = serialize($item); if ($this->useCompression) { $content = $this->encode($content); } return file_put_contents($this->basePath . '/' . $item->key(), $content) !== false; } protected function unserialize(string $content): mixed { // Temporarily set a new error handler, so unserializing a compressed string does not result in a PHP warning. set_error_handler(function ($errno, $errstr) { return $errno === E_WARNING && str_starts_with($errstr, 'unserialize(): Error at offset 0 of '); }); $unserialized = unserialize($content); if ($unserialized === false) { // if unserializing fails, try if the string is compressed. 
try { $content = $this->decode($content); $unserialized = unserialize($content); } catch (Throwable) { } } restore_error_handler(); return $unserialized; } /** * @throws ReadingCacheFailedException */ protected function getFileContents(string $key): string { $fileContent = file_get_contents($this->basePath . '/' . $key); if ($fileContent === false) { throw new ReadingCacheFailedException('Failed to read cache file.'); } return $fileContent; } /** * @throws MissingZlibExtensionException */ protected function encode(string $content): string { try { return Gzip::encode($content, true); } catch (MissingZlibExtensionException) { throw new MissingZlibExtensionException( 'Can\'t compress response cache data. Compression needs PHP ext-zlib installed.', ); } } /** * @throws MissingZlibExtensionException */ protected function decode(string $content): string { try { return Gzip::decode($content, true); } catch (MissingZlibExtensionException) { throw new MissingZlibExtensionException('FileCache compression needs PHP ext-zlib installed.'); } } } ================================================ FILE: src/Crawler.php ================================================ */ protected array $steps = []; protected ?StoreInterface $store = null; protected bool|int $monitorMemoryUsage = false; protected ?Closure $outputHook = null; public function __construct() { $this->userAgent = $this->userAgent(); $this->logger = $this->logger(); $this->loader = $this->loader($this->userAgent, $this->logger); } public function __clone(): void { $this->inputs = []; $this->steps = []; $this->store = null; $this->outputHook = null; } abstract protected function userAgent(): UserAgentInterface; /** * @param UserAgentInterface $userAgent * @param LoggerInterface $logger * @return LoaderInterface */ abstract protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface; public static function group(): Group { return new Group(); } public static function 
setMemoryLimit(string $memoryLimit): false|string { return ini_set('memory_limit', $memoryLimit); } public static function getMemoryLimit(): false|string { return ini_get('memory_limit'); } public function getSubCrawler(): Crawler { return clone $this; } public function getUserAgent(): UserAgentInterface { return $this->userAgent; } public function setUserAgent(UserAgentInterface $userAgent): static { $this->userAgent = $userAgent; $this->loader = $this->loader($userAgent, $this->logger); return $this; } public function getLogger(): LoggerInterface { return $this->logger; } /** * @return LoaderInterface|array */ public function getLoader(): LoaderInterface|array { return $this->loader; } public function setStore(StoreInterface $store): static { $store->addLogger($this->logger); $this->store = $store; return $this; } public function input(mixed $input): static { $this->inputs[] = $input; return $this; } /** * @param mixed[] $inputs */ public function inputs(array $inputs): static { $this->inputs = array_merge($this->inputs, $inputs); return $this; } /** * @param StepInterface $step * @return $this * @throws InvalidArgumentException */ public function addStep(StepInterface $step): static { $step->addLogger($this->logger); if (method_exists($step, 'setLoader')) { $step->setLoader($this->loader); } if ($step instanceof BaseStep) { $step->setParentCrawler($this); } $this->steps[] = $step; return $this; } /** * Run the crawler and traverse results * * When you've set a store, or you just don't need the results for any other reason (e.g. you use the crawler for * cache warming) where you're calling the crawler, use this method. 
* * @throws Exception */ public function runAndTraverse(): void { foreach ($this->run() as $result) { } } /** * Easy way to just crawl and dump the results * * @throws Exception */ public function runAndDump(): void { foreach ($this->run() as $result) { var_dump($result->toArray()); } } /** * Run the Crawler * * Handles calling all the steps and cascading the data from step to step. * It returns a Generator, so when using this method directly, you need to traverse the Generator, otherwise nothing * happens. Alternatively you can use runAndTraverse(). * * @return Generator * @throws Exception|PreRunValidationException */ public function run(): Generator { $this->validateSteps(); $inputs = $this->prepareInput(); if ($this->firstStep()) { foreach ($inputs as $input) { $results = $this->invokeStepsRecursive($input, $this->firstStep(), 0); /** @var Generator $results */ yield from $results; } } $this->reset(); } /** * Use this method if you want the crawler to add log messages with the current memory usage after every step * invocation. * * @param int|null $ifAboveXBytes You can provide an int of bytes as a limit above which the crawler should log * the usage. */ public function monitorMemoryUsage(?int $ifAboveXBytes = null): static { $this->monitorMemoryUsage = $ifAboveXBytes ?? 
true; return $this; } public function outputHook(Closure $callback): static { $this->outputHook = $callback; return $this; } protected function logger(): LoggerInterface { return new CliLogger(); } /** * @return Generator */ protected function invokeStepsRecursive(Input $input, StepInterface $step, int $stepIndex): Generator { $outputs = $step->invokeStep($input); $nextStep = $this->nextStep($stepIndex); if (!$nextStep) { yield from $this->storeAndReturnOutputsAsResults($outputs); return; } foreach ($outputs as $output) { if ($this->monitorMemoryUsage !== false) { $this->logMemoryUsage(); } $this->outputHook?->call($this, $output, $stepIndex, $step); yield from $this->invokeStepsRecursive( new Input($output), $nextStep, $stepIndex + 1, ); } } /** * @param Generator $outputs * @return Generator */ protected function storeAndReturnOutputsAsResults(Generator $outputs): Generator { foreach ($outputs as $output) { $this->outputHook?->call($this, $output, count($this->steps) - 1, end($this->steps)); $result = new Result(); foreach ($output->keep as $key => $value) { $result->set($key, $value); } if (!$this->lastStep()?->keepsAnything()) { if ($output->isArrayWithStringKeys()) { foreach ($output->get() as $key => $value) { $result->set($key, $value); } } else { $result->set('unnamed', $output->get()); } } $this->store?->store($result); yield $result; } } /** * @throws PreRunValidationException */ protected function validateSteps(): void { $previousStep = null; foreach ($this->steps as $index => $step) { if ($index > 0) { $previousStep = $this->steps[$index - 1]; } if (method_exists($step, 'validateBeforeRun')) { try { $step->validateBeforeRun($previousStep ?? $this->inputs); } catch (PreRunValidationException $exception) { $this->logger->error( 'Pre-Run validation error in step number ' . ($index + 1) . ': ' . 
$exception->getMessage(), ); throw $exception; } } } } /** * @return Input[] * @throws Exception */ protected function prepareInput(): array { return array_map(function ($input) { return new Input($input); }, $this->inputs); } protected function logMemoryUsage(): void { $memoryUsage = memory_get_usage(); if (!is_int($this->monitorMemoryUsage) || $memoryUsage > $this->monitorMemoryUsage) { $this->logger->info('memory usage: ' . $memoryUsage); } } protected function firstStep(): ?StepInterface { return $this->steps[0] ?? null; } protected function lastStep(): ?BaseStep { $lastStep = end($this->steps); if (!$lastStep instanceof BaseStep) { return null; } return $lastStep; } protected function nextStep(int $afterIndex): ?StepInterface { return $this->steps[$afterIndex + 1] ?? null; } protected function reset(): void { $this->inputs = []; foreach ($this->steps as $step) { $step->resetAfterRun(); } } } ================================================ FILE: src/HttpCrawler/AnonymousHttpCrawlerBuilder.php ================================================ setUserAgent(new BotUserAgent($productToken)); return $instance; } public function withUserAgent(string|UserAgentInterface $userAgent): HttpCrawler { $instance = new class extends HttpCrawler { protected function userAgent(): UserAgentInterface { return new UserAgent('temp'); } }; $userAgent = $userAgent instanceof UserAgentInterface ? 
$userAgent : new UserAgent($userAgent); $instance->setUserAgent($userAgent); return $instance; } public function withMozilla5CompatibleUserAgent(): HttpCrawler { return $this->withUserAgent(UserAgent::mozilla5CompatibleBrowser()); } } ================================================ FILE: src/HttpCrawler.php ================================================ value = $value->value; $this->keep = $value->keep; } } public function withValue(mixed $value): static { return new static($value, $this->keep); } public function withPropertyValue(string $key, mixed $value): static { if (!$this->isArrayWithStringKeys()) { return new static($this); } $newValue = $this->value; $newValue[$key] = $value; return $this->withValue($newValue); } public function get(): mixed { return $this->value; } public function getProperty(string $key, mixed $fallbackValue = null): mixed { if (is_array($this->value)) { return $this->value[$key] ?? $fallbackValue; } elseif (is_object($this->value)) { $array = OutputTypeHelper::objectToArray($this->value); return $array[$key] ?? $fallbackValue; } return $fallbackValue; } /** * Sets and returns a key to use as identifier * * To only get unique results from a step use the key this method creates for comparison. * In case the output values are arrays or objects and contain a unique identifier that can be used, provide that * key name, so it doesn't need to create a key from the whole array/object. 
*/ public function setKey(?string $useFromValue = null): string { if ($useFromValue && is_array($this->value) && array_key_exists($useFromValue, $this->value)) { $this->key = $this->valueToString($this->value[$useFromValue]); } elseif ($useFromValue && is_object($this->value) && property_exists($this->value, $useFromValue)) { $this->key = $this->valueToString($this->value->{$useFromValue}); } else { $this->key = $this->valueToString($this->value); } return $this->key; } public function getKey(): string|int|float|bool|null { if ($this->key === null) { $this->setKey(); } return $this->key; } /** * @param mixed[] $data */ public function keep(array $data): static { $this->keep = array_merge_recursive($this->keep, $data); return $this; } public function isArrayWithStringKeys(): bool { if (!is_array($this->value)) { return false; } foreach ($this->value as $key => $value) { if (!is_string($key)) { return false; } } return true; } /** * Build a string identifier for the given value. * * Arrays and objects are identified by the md5 hash of their serialized form, ints and floats are cast to string, * booleans become 'true'/'false' and null becomes 'null'. Any other value is returned as is. * * The hash is built from the $value argument (not from the whole $this->value), so a key created via * setKey('someKey') depends only on the selected element, even when that element is an array or object. */ protected function valueToString(mixed $value): string { if (is_array($value) || is_object($value)) { return md5(serialize($value)); } elseif (is_int($value) || is_float($value)) { return (string) $value; } elseif (is_bool($value)) { return $value ? 'true' : 'false'; } elseif (is_null($value)) { return 'null'; } return $value; } } ================================================ FILE: src/Loader/Http/Browser/Screenshot.php ================================================ getCurrentUrl()) . '-' . Microseconds::now()->value . '.' . $this->fileType; return $this->storePath . (!str_ends_with($this->storePath, '/') ? '/' : '') . 
/**
 * Tells if a cached response with the given status code shall be loaded again.
 *
 * Only error responses (status code 400 and above) are candidates. A configured "except" list
 * excludes the listed codes, a configured "only" list restricts retries to the listed codes.
 */
public function shallBeRetried(int $statusCode): bool
{
    if ($statusCode < 400) {
        return false;
    }

    if ($this->except !== null && in_array($statusCode, $this->except, true)) {
        return false;
    }

    if ($this->only !== null && !in_array($statusCode, $this->only, true)) {
        return false;
    }

    return true;
}
/**
 * Decides whether this cookie has to be sent along with a request to the given URL.
 *
 * All of the following must hold:
 * - the URL's host domain-matches the cookie domain (identical, or a subdomain of it),
 * - for __Host- prefixed cookies, the host is exactly the host the cookie was received from,
 * - Secure cookies are only sent via https (or to localhost / 127.0.0.1),
 * - if the cookie has a path, the URL's path matches it,
 * - the cookie is not expired.
 *
 * @throws Exception
 */
public function shouldBeSentTo(string|UriInterface|Url $url): bool
{
    $url = $url instanceof Url ? $url : Url::parse($url);

    $urlHost = $url->host() ?? '';

    // An empty Domain attribute value carries no information, fall back to the origin host.
    $cookieDomain = $this->domain() !== '' ? $this->domain() : $this->receivedFromHost;

    // Domain matching as described in RFC 6265, section 5.1.3: the host has to be identical to the
    // cookie domain or end with "." + cookie domain. A plain substring check would also match
    // foreign hosts like "notexample.com" or "example.com.evil.org" for the domain "example.com".
    if ($urlHost !== $cookieDomain && !str_ends_with($urlHost, '.' . $cookieDomain)) {
        return false;
    }

    return (!$this->hasHostPrefix() || $urlHost === $this->receivedFromHost) &&
        (!$this->secure() || $url->scheme() === 'https' || in_array($urlHost, ['localhost', '127.0.0.1'], true)) &&
        (!$this->path() || $this->pathMatches($url)) &&
        !$this->isExpired();
}
/**
 * Parses the value of a Set-Cookie header.
 *
 * The part before the first semicolon must be a "name=value" pair, every further
 * semicolon-separated part is handled as a cookie attribute. Finally, the rules for the
 * __Secure- and __Host- name prefixes are checked.
 *
 * @throws InvalidCookieException
 */
protected function parseSetCookieHeader(string $setCookieHeader): void
{
    $attributes = explode(';', $setCookieHeader);

    $nameValuePair = trim(array_shift($attributes));

    if (!str_contains($nameValuePair, '=')) {
        throw new InvalidCookieException('Invalid cookie string');
    }

    [$name, $value] = explode('=', $nameValuePair, 2);

    $this->cookieName = $name;

    $this->cookieValue = $value;

    foreach ($attributes as $rawAttribute) {
        $this->parseAttribute($rawAttribute);
    }

    $this->checkPrefixes();
}
/**
 * Validates the requirements that come with the __Secure- and __Host- cookie name prefixes.
 *
 * Both prefixes require that the cookie was received via https and has the Secure flag.
 * __Host- additionally forbids a Domain attribute and requires the Path attribute to be "/".
 *
 * @see https://datatracker.ietf.org/doc/html/draft-west-cookie-prefixes#section-3
 * @throws InvalidCookieException
 * @throws Exception
 */
protected function checkPrefixes(): void
{
    $hasHostPrefix = $this->hasHostPrefix();

    if (!$hasHostPrefix && !$this->hasSecurePrefix()) {
        return;
    }

    if (!$this->isReceivedSecure()) {
        throw new InvalidCookieException(
            'Cookie is prefixed with __Secure- or __Host- but was not sent via https',
        );
    }

    if (!$this->secure()) {
        throw new InvalidCookieException(
            'Cookie is prefixed with __Secure- or __Host- but Secure flag was not sent',
        );
    }

    if (!$hasHostPrefix) {
        return;
    }

    if ($this->domainSetViaAttribute) {
        throw new InvalidCookieException('Cookie with __Host- prefix must not contain a Domain attribute');
    }

    if ($this->path !== '/') {
        throw new InvalidCookieException('Cookie with __Host- prefix must have a Path attribute with value /');
    }
}
/**
 * Checks if the path of the given URL matches the cookie's path (RFC 6265, section 5.1.4).
 *
 * The request path matches when
 * - it is identical to the cookie path, or
 * - the cookie path is a prefix of it and either the cookie path ends with "/", or the first
 *   character of the request path after the prefix is "/".
 *
 * @throws Exception
 */
protected function pathMatches(Url $url): bool
{
    $cookiePath = $this->path() ?? '';

    $requestPath = $url->path() ?? '';

    // A URL without a path is requested as "/".
    if ($requestPath === '') {
        $requestPath = '/';
    }

    if ($requestPath === $cookiePath) {
        return true;
    }

    if (!str_starts_with($requestPath, $cookiePath)) {
        return false;
    }

    // Checking for any trailing slash (not only for the path being exactly "/") makes a cookie
    // path like "/foo/" match "/foo/bar". The second condition keeps "/foo" from matching "/foobar".
    return str_ends_with($cookiePath, '/') || str_starts_with($requestPath, $cookiePath . '/');
}
/**
 * Returns all cookies from the jar that should be sent with a request to the given URL.
 *
 * Cookies are looked up by the key that getForDomainFromUrl() returns for the URL and then
 * filtered using Cookie::shouldBeSentTo().
 *
 * @return Cookie[]
 * @throws Exception
 */
public function getFor(string|UriInterface $url): array
{
    $domainKey = $this->getForDomainFromUrl($url);

    if (!$domainKey || !array_key_exists($domainKey, $this->jar)) {
        return [];
    }

    return array_values(
        array_filter(
            $this->jar[$domainKey],
            static fn ($storedCookie) => $storedCookie->shouldBeSentTo($url),
        ),
    );
}
/**
 * Turns the "expires" value of a browser cookie into a date string for a Set-Cookie header.
 *
 * Numeric values are treated as unix timestamps, optionally with a fractional part (up to three
 * decimals are parsed as milliseconds, more as microseconds). Values that aren't numeric or can't
 * be parsed are returned as string without any change.
 */
private function formatExpiresValue(mixed $value): string
{
    if (!is_numeric($value)) {
        return (string) $value;
    }

    $timestamp = (string) $value;

    $fraction = explode('.', $timestamp, 2)[1] ?? null;

    $format = match (true) {
        $fraction === null => 'U',
        strlen($fraction) <= 3 => 'U.v',
        default => 'U.u',
    };

    $dateTime = DateTime::createFromFormat($format, $timestamp);

    return $dateTime === false ? $timestamp : $dateTime->format('l, d M Y H:i:s T');
}
/**
 * Creates an exception for a URI that failed to load.
 *
 * When a status code is provided, it is appended to the message in parentheses and also
 * stored in the httpStatusCode property of the created instance.
 */
public static function make(string|UriInterface $uri, ?int $httpStatusCode = null): self
{
    $target = $uri instanceof UriInterface ? (string) $uri : $uri;

    if ($httpStatusCode === null) {
        return new self('Failed to load ' . $target . '.');
    }

    $exception = new self('Failed to load ' . $target . ' (' . $httpStatusCode . ').');

    $exception->httpStatusCode = $httpStatusCode;

    return $exception;
}
* * @param Closure[] $hooks */ public function setTempPostNavigateHooks(array $hooks): static { $this->tempPostNavigateHooks = $hooks; return $this; } /** * @throws OperationTimedOut * @throws CommunicationException * @throws NoResponseAvailable * @throws NavigationExpired * @throws InvalidResponse * @throws CannotReadResponse * @throws ResponseHasError * @throws JavascriptException * @throws Exception */ public function navigateToPageAndGetRespondedRequest( RequestInterface $request, Throttler $throttler, ?string $proxy = null, ?CookieJar $cookieJar = null, ): RespondedRequest { if (!$this->page || $this->shouldRenewBrowser($proxy)) { $this->page = $this->getBrowser($request, $proxy)->createPage(); } else { try { $this->page->assertNotClosed(); } catch (TargetDestroyed) { $this->page = $this->getBrowser($request, $proxy)->createPage(); } } if ($cookieJar === null) { $this->page->getSession()->sendMessageSync(new Message('Network.clearBrowserCookies')); } $statusCode = 200; $responseHeaders = []; $requestId = null; $this->page->getSession()->once( "method:Network.responseReceived", function ($params) use (&$statusCode, &$responseHeaders, &$requestId) { $statusCode = $params['response']['status']; $responseHeaders = $this->sanitizeResponseHeaders($params['response']['headers']); $requestId = $params['requestId'] ?? null; }, ); $throttler->trackRequestStartFor($request->getUri()); $this->navigate($request->getUri()->__toString()); $throttler->trackRequestEndFor($request->getUri()); $hookActionData = $this->callPostNavigateHooks(); if (is_string($requestId) && $this->page && !$this->responseIsHtmlDocument($this->page)) { $html = $this->tryToGetRawResponseBody($this->page, $requestId) ?? $this->getHtmlFromPage(); } else { $html = $this->getHtmlFromPage(); } $this->addCookiesToJar($cookieJar, $request->getUri()); return new RespondedRequest( $request, new Response($statusCode, $responseHeaders, $html), $hookActionData['screenshots'] ?? 
/**
 * Reduces every response header value to its first line.
 *
 * NOTE(review): the headless browser appears to report repeated headers as one string with the
 * values separated by line feeds – confirm against the DevTools protocol.
 *
 * @param string[] $headers
 * @return string[]
 */
public function sanitizeResponseHeaders(array $headers): array
{
    foreach ($headers as $name => $value) {
        // Split on "\n" instead of PHP_EOL: PHP_EOL depends on the platform PHP runs on ("\r\n" on
        // Windows), the header value doesn't. A carriage return left over from a "\r\n" separator
        // is removed as well.
        $headers[$name] = rtrim(explode("\n", $value, 2)[0], "\r");
    }

    return $headers;
}
/**
 * Runs the temporary post navigate hooks and forgets them afterward.
 *
 * Each hook is invoked bound to this helper instance, with the current page and the logger as
 * arguments. Screenshot instances returned by hooks are collected under the "screenshots" key
 * of the returned array.
 *
 * @return array<string, mixed>
 */
protected function callPostNavigateHooks(): array
{
    $returnData = [];

    try {
        foreach ($this->tempPostNavigateHooks as $hook) {
            $returnValue = $hook->call($this, $this->page, $this->logger);

            if ($returnValue instanceof Screenshot) {
                $returnData['screenshots'][] = $returnValue;
            }
        }
    } finally {
        // The hooks are meant for one navigation only. Clear them even when a hook throws, so
        // they can't leak into the next navigation.
        $this->tempPostNavigateHooks = [];
    }

    return $returnData;
}
/**
 * Drops request headers that must not be handed over to the headless browser.
 *
 * Currently that's only the Host header. Header names are compared case-insensitively,
 * all other headers are kept with their original names and values.
 *
 * @param mixed[] $headers
 * @return mixed[]
 */
protected function removeHeadersCausingErrorWithHeadlessBrowser(array $headers = []): array
{
    $blockedNames = ['host'];

    return array_filter(
        $headers,
        static fn ($name) => !in_array(strtolower($name), $blockedNames, true),
        ARRAY_FILTER_USE_KEY,
    );
}
/**
 * Tries to get the raw response body for a request from the browser.
 *
 * In production, retrieving the raw response body using the Network.getResponseBody message sometimes failed.
 * Waiting briefly before sending the message appeared to resolve the issue.
 * So, this method tries up to three times with a brief (increasing) wait between the attempts.
 *
 * When the browser reports the body as base64 encoded, it is decoded before it is returned.
 *
 * @return string|null the response body, or null when it could not be retrieved (or was empty)
 */
protected function tryToGetRawResponseBody(Page $page, string $requestId): ?string
{
    $maxAttempts = 3;

    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        try {
            $message = $page->getSession()->sendMessageSync(new Message('Network.getResponseBody', [
                'requestId' => $requestId,
            ]));

            $result = $message->isSuccessful() ? ($message->getData()['result'] ?? null) : null;

            if (is_array($result) && !empty($result['body'])) {
                if (($result['base64Encoded'] ?? false) === true) {
                    $decodedBody = base64_decode($result['body'], true);

                    if ($decodedBody !== false) {
                        return $decodedBody;
                    }
                }

                return $result['body'];
            }
        } catch (Throwable) {
            // Deliberately ignored, the next attempt (if any) follows after a short wait.
        }

        // There is nothing to wait for after the last attempt.
        if ($attempt < $maxAttempts) {
            usleep($attempt * 100000);
        }
    }

    return null;
}
new Client($this->mergeClientConfigWithDefaults($defaultGuzzleClientConfig)); $this->onSuccess(function (RequestInterface $request, ResponseInterface $response, LoggerInterface $logger) { $logger->info('Loaded ' . $request->getUri()->__toString()); }); $this->onError(function (RequestInterface $request, Exception|Error|ResponseInterface $exceptionOrResponse, $logger) { $logMessage = 'Failed to load ' . $request->getUri()->__toString() . ': '; if ($exceptionOrResponse instanceof ResponseInterface) { $logMessage .= 'got response ' . $exceptionOrResponse->getStatusCode() . ' - ' . $exceptionOrResponse->getReasonPhrase(); } else { $logMessage .= $exceptionOrResponse->getMessage(); } $logger->error($logMessage); }); $this->cookieJar = new CookieJar(); $this->throttler = $throttler ?? new Throttler(); } /** * @param mixed $subject * @return RespondedRequest|null */ public function load(mixed $subject): ?RespondedRequest { $this->_resetCalledHooks(); try { $request = $this->validateSubjectType($subject); } catch (InvalidArgumentException|Exception $exception) { $url = $subject instanceof RequestInterface ? (string) $subject->getUri() : (string) $subject; $this->logger->error('Invalid input URL: ' . $url . ' - ' . $exception->getMessage()); return null; } try { if (!$this->isAllowedToBeLoaded($request->getUri())) { return null; } $isFromCache = false; $respondedRequest = $this->tryLoading($request, $isFromCache); if ($respondedRequest->response->getStatusCode() < 400) { $this->callHook('onSuccess', $request, $respondedRequest->response); } else { $this->callHook('onError', $request, $respondedRequest->response); } if (!$isFromCache) { $this->addToCache($respondedRequest); } return $respondedRequest; } catch (Throwable $exception) { // Don't move to finally so hooks don't run before it. 
/**
 * Loads the given subject (URL or request) and throws when loading fails.
 *
 * Other than load(), this method doesn't swallow problems: when the URL is not allowed by the
 * robots.txt file, when the response is an error response (status code 400 and above) or when
 * anything else goes wrong while loading, a LoadingException is thrown. An invalid subject makes
 * validateSubjectType() throw before anything is loaded.
 *
 * On success the onSuccess and afterLoad hooks are called and the response is added to the
 * cache, if it wasn't served from there.
 *
 * @throws LoadingException|InvalidArgumentException|Exception
 */
public function loadOrFail(mixed $subject): RespondedRequest
{
    $this->_resetCalledHooks();

    $request = $this->validateSubjectType($subject);

    try {
        $this->isAllowedToBeLoaded($request->getUri(), true);

        $isFromCache = false;

        $respondedRequest = $this->tryLoading($request, $isFromCache);

        if ($respondedRequest->response->getStatusCode() >= 400) {
            throw LoadingException::make($request->getUri(), $respondedRequest->response->getStatusCode());
        }

        $this->callHook('onSuccess', $request, $respondedRequest->response);

        $this->callHook('afterLoad', $request);

        if (!$isFromCache) {
            $this->addToCache($respondedRequest);
        }

        return $respondedRequest;
    } catch (Throwable $exception) {
        // Like load() does it: tell the throttler that the request ended, so a failed request
        // doesn't remain tracked as started for this URL.
        $this->throttler->trackRequestEndFor($request->getUri());

        $this->_resetCalledHooks();

        throw LoadingException::from($exception);
    }
}
/**
 * Returns the helper used for loading with the headless browser.
 *
 * The helper is created on first access (with the loader's logger) and reused afterward.
 */
public function browser(): HeadlessBrowserLoaderHelper
{
    return $this->browserHelper ??= new HeadlessBrowserLoaderHelper(logger: $this->logger);
}
/**
 * Waits until the throttler gives its go for the request's URL and then loads it.
 *
 * When the retry error response handler decides that it should wait for the received response,
 * the retries are delegated to it. Each retry prepares the request again before loading.
 *
 * @throws ClientExceptionInterface
 * @throws GuzzleException
 * @throws LoadingException
 * @throws CommunicationException
 * @throws CannotReadResponse
 * @throws InvalidResponse
 * @throws ResponseHasError
 * @throws JavascriptException
 * @throws NavigationExpired
 * @throws NoResponseAvailable
 * @throws OperationTimedOut
 * @throws Exception
 */
protected function waitForGoAndLoad(RequestInterface $request): RespondedRequest
{
    $this->throttler->waitForGo($request->getUri());

    $firstAttempt = $this->loadViaClientOrHeadlessBrowser($request);

    if (!$this->retryErrorResponseHandler->shouldWait($firstAttempt)) {
        return $firstAttempt;
    }

    $retry = fn() => $this->loadViaClientOrHeadlessBrowser($this->prepareRequest($request));

    return $this->retryErrorResponseHandler->handleRetries($firstAttempt, $retry);
}
$this->cookieJar : null, ); } return $this->handleRedirects($request); } /** * @throws ClientExceptionInterface * @throws LoadingException * @throws GuzzleException * @throws Exception */ protected function handleRedirects( RequestInterface $request, ?RespondedRequest $respondedRequest = null, int $redirectNumber = 0, ): RespondedRequest { if ($redirectNumber >= $this->maxRedirects) { throw new LoadingException('Too many redirects.'); } if (!$respondedRequest) { $this->throttler->trackRequestStartFor($request->getUri()); } if ($this->proxies && $this->httpClient instanceof Client) { $response = $this->sendProxiedRequestUsingGuzzle($request, $this->httpClient); } else { $response = $this->httpClient->sendRequest($request); } if (!$respondedRequest) { $respondedRequest = new RespondedRequest($request, $response); } else { $respondedRequest->setResponse($response); } $this->addCookiesToJar($respondedRequest); if ($respondedRequest->isRedirect()) { $this->logger()->info('Load redirect to: ' . 
/**
 * Merges a custom guzzle client config into the loader's default config.
 *
 * Values from the custom config replace default values with the same key,
 * keys unknown to the defaults are appended.
 *
 * @param mixed[] $config
 * @return mixed[]
 */
protected function mergeClientConfigWithDefaults(array $config): array
{
    return array_replace($this->defaultGuzzleClientConfig, $config);
}
' according to robots.txt file.'; $this->logger->warning($message); if ($throwsException) { throw new LoadingException($message); } return false; } return true; } /** * @throws \Psr\SimpleCache\InvalidArgumentException * @throws Exception */ protected function getFromCache(RequestInterface $request): ?RespondedRequest { if (!$this->cache || $this->writeOnlyCache) { return null; } $key = RequestKey::from($request); if ($this->cache->has($key)) { $this->logger->info('Found ' . $request->getUri()->__toString() . ' in cache.'); $respondedRequest = $this->cache->get($key); // Previously, until v0.7 just used serialized arrays. Leave this for backwards compatibility. if (is_array($respondedRequest)) { $respondedRequest = RespondedRequest::fromArray($respondedRequest); } if ($this->retryCachedErrorResponses?->shallBeRetried($respondedRequest->response->getStatusCode())) { $this->logger->info('Cached response was an error response, retry.'); return null; } return $respondedRequest; } return null; } protected function shouldResponseBeCached(RespondedRequest $respondedRequest): bool { if (!empty($this->cacheUrlFilters)) { foreach ($this->cacheUrlFilters as $filter) { $noUrlMatched = true; foreach ($respondedRequest->allUris() as $url) { if ($filter->evaluate($url)) { $noUrlMatched = false; } } if ($noUrlMatched) { return false; } } } return true; } protected function shouldRequestBeServedFromCache(RequestInterface $request): bool { if ($this->skipCacheForNextRequest === true) { return false; } if (!empty($this->cacheUrlFilters)) { foreach ($this->cacheUrlFilters as $filter) { if (!$filter->evaluate((string) $request->getUri())) { return false; } } } return true; } /** * @throws InvalidArgumentException|Exception */ protected function validateSubjectType(RequestInterface|string $requestOrUri): RequestInterface { if (is_string($requestOrUri)) { try { $url = Url::parse($requestOrUri); if ($url->isRelativeReference()) { throw new InvalidArgumentException( 'The URI is a relative 
reference and therefore can\'t be loaded.', ); } return new Request('GET', $url->toPsr7()); } catch (InvalidUrlException) { throw new InvalidArgumentException('Invalid URL.'); } } elseif ( empty(trim($requestOrUri->getUri()->getScheme())) && Url::parse($requestOrUri->getUri())->isRelativeReference() ) { throw new InvalidArgumentException('The URI is a relative reference and therefore can\'t be loaded.'); } return $requestOrUri; } /** * @throws Exception */ protected function prepareRequest(RequestInterface $request): RequestInterface { $request = $request->withHeader('User-Agent', $this->userAgent->__toString()); // When writing tests I found that guzzle somehow messed up headers with multiple strings as value in the PSR-7 // request object. It sent only the last part of the array, instead of concatenating the array of strings to a // comma separated string. Don't know if that happens with all handlers (curl, stream), will investigate // further. But until this is fixed, we just prepare the headers ourselves. foreach ($request->getHeaders() as $headerName => $headerValues) { $request = $request->withHeader($headerName, $request->getHeaderLine($headerName)); } return $this->addCookiesToRequest($request); } protected function addCookiesToJar(RespondedRequest $respondedRequest): void { if ($this->useCookies) { try { $this->cookieJar->addFrom($respondedRequest->effectiveUri(), $respondedRequest->response); } catch (Exception $exception) { $this->logger->warning('Problem when adding cookies to the Jar: ' . 
$exception->getMessage()); } } } /** * @throws Exception */ protected function addCookiesToRequest(RequestInterface $request): RequestInterface { if (!$this->useCookies) { return $request; } foreach ($this->cookieJar->getFor($request->getUri()) as $cookie) { $request = $request->withAddedHeader('Cookie', $cookie->__toString()); } return $request; } } ================================================ FILE: src/Loader/Http/Messages/RespondedRequest.php ================================================ setResponse($this->response); } /** * @param mixed[] $data * @return RespondedRequest * @throws Exception */ public static function fromArray(array $data): RespondedRequest { $respondedRequest = new RespondedRequest( self::requestFromArray($data), self::responseFromArray($data), self::screenshotsFromArray($data), ); if ($data['effectiveUri'] && $data['effectiveUri'] !== $data['requestUri']) { $respondedRequest->addRedirectUri($data['effectiveUri']); } return $respondedRequest; } /** * @return mixed[] * @throws MissingZlibExtensionException */ public function __serialize(): array { return [ 'requestMethod' => $this->request->getMethod(), 'requestUri' => $this->request->getUri()->__toString(), 'requestHeaders' => $this->request->getHeaders(), 'requestBody' => Http::getBodyString($this->request), 'effectiveUri' => $this->effectiveUri(), 'responseStatusCode' => $this->response->getStatusCode(), 'responseHeaders' => $this->response->getHeaders(), 'responseBody' => Http::getBodyString($this->response), 'screenshots' => array_map(fn(Screenshot $screenshot) => $screenshot->path, $this->screenshots), ]; } /** * @return mixed[] * @throws MissingZlibExtensionException */ public function toArrayForResult(): array { $serialized = $this->__serialize(); $mapping = [ 'url' => 'effectiveUri', 'uri' => 'effectiveUri', 'status' => 'responseStatusCode', 'headers' => 'responseHeaders', 'body' => 'responseBody', ]; foreach ($mapping as $newKey => $originalKey) { $serialized[$newKey] = 
$serialized[$originalKey];
        }

        return $serialized;
    }

    /**
     * Rebuilds request, response, redirect target and screenshots from the given data array.
     *
     * @param mixed[] $data
     * @throws Exception
     */
    public function __unserialize(array $data): void
    {
        $this->request = self::requestFromArray($data);

        $this->response = self::responseFromArray($data);

        // Only register a redirect when the effective URI actually differs from the requested one.
        if ($data['effectiveUri'] && $data['effectiveUri'] !== $data['requestUri']) {
            $this->addRedirectUri($data['effectiveUri']);
        }

        $this->screenshots = self::screenshotsFromArray($data);
    }

    /**
     * The last redirect URI, or the requested URI when no redirect was registered.
     */
    public function effectiveUri(): string
    {
        return empty($this->redirects) ? $this->requestedUri() : end($this->redirects);
    }

    /**
     * The URI of the request as a string.
     */
    public function requestedUri(): string
    {
        // Convert explicitly (like __serialize() does) instead of relying on the implicit conversion of the
        // URI object to the declared string return type, which fails with a TypeError under strict types.
        return $this->request->getUri()->__toString();
    }

    /**
     * The requested URI followed by all redirect URIs, without duplicates.
     *
     * @return string[]
     */
    public function allUris(): array
    {
        // Keyed by the URI itself, so the same URI is only contained once.
        $uris = [$this->requestedUri() => $this->requestedUri()];

        foreach ($this->redirects as $redirect) {
            $uris[$redirect] = $redirect;
        }

        return array_values($uris);
    }

    /**
     * True when the current response has a 3xx status code.
     */
    public function isRedirect(): bool
    {
        return $this->response->getStatusCode() >= 300 && $this->response->getStatusCode() < 400;
    }

    /**
     * @return string[]
     */
    public function redirects(): array
    {
        return $this->redirects;
    }

    /**
     * Replaces the response and, if it is a redirect, registers the redirect target from its Location header.
     *
     * @throws Exception
     */
    public function setResponse(ResponseInterface $response): void
    {
        $this->response = $response;

        if ($this->isRedirect()) {
            $this->addRedirectUri();
        }
    }

    /**
     * Resolves the given URI (or the Location header of the current response) against the current effective URI
     * and appends it to the list of redirects.
     *
     * @throws Exception
     */
    public function addRedirectUri(?string $redirectUri = null): void
    {
        $redirectUri = Url::parse($this->effectiveUri())
            ->resolve($redirectUri ?? $this->response->getHeaderLine('Location'))
            ->__toString();

        // Add it only if different from the previous one.
if ($redirectUri !== end($this->redirects)) { $this->redirects[] = $redirectUri; } } public function cacheKey(): string { return RequestKey::from($this->request); } public function isServedFromCache(): bool { return $this->isServedFromCache; } public function setIsServedFromCache(bool $value = true): void { $this->isServedFromCache = $value; } /** * @param mixed[] $data */ protected static function requestFromArray(array $data): Request { return new Request( $data['requestMethod'], $data['requestUri'], $data['requestHeaders'], $data['requestBody'], ); } /** * @param mixed[] $data */ protected static function responseFromArray(array $data): Response { return new Response( $data['responseStatusCode'], $data['responseHeaders'], $data['responseBody'], ); } /** * @param mixed[] $data * @return Screenshot[] */ protected static function screenshotsFromArray(array $data): array { $screenshots = []; if (array_key_exists('screenshots', $data)) { foreach ($data['screenshots'] as $screenshot) { if (file_exists($screenshot)) { $screenshots[] = new Screenshot($screenshot); } } } return $screenshots; } } ================================================ FILE: src/Loader/Http/Politeness/RetryErrorResponseHandler.php ================================================ */ protected array $waitErrors = [ 429 => 'Too many Requests', 503 => 'Service Unavailable', ]; /** * @param int[] $wait */ public function __construct( protected int $retries = 2, protected array $wait = [10, 60], protected int $maxWait = 60, ) {} public function shouldWait(RespondedRequest $respondedRequest): bool { if (array_key_exists($respondedRequest->response->getStatusCode(), $this->waitErrors)) { return true; } return false; } public function setLogger(LoggerInterface $logger): void { $this->logger = $logger; } /** * @throws LoadingException */ public function handleRetries( RespondedRequest $respondedRequest, Closure $retryCallback, ): RespondedRequest { $this->logReceivedErrorResponseMessage($respondedRequest); 
$retries = 0;

        // Remember the configured wait times. A Retry-After value sent with this response shall only be used
        // for this call and must not permanently replace the first configured wait time.
        $configuredWait = $this->wait;

        try {
            $this->wait[0] = $this->getWaitTimeFromResponse($respondedRequest->response) ?? $this->wait[0];

            while ($retries < $this->retries) {
                $this->logWaitForRetryMessage($retries);

                sleep($this->wait[$retries]);

                $respondedRequest = $retryCallback();

                if ($respondedRequest instanceof RespondedRequest && !$this->shouldWait($respondedRequest)) {
                    return $respondedRequest;
                } elseif ($respondedRequest) {
                    $this->logRepeatedErrorMessage($respondedRequest);
                }

                $retries++;
            }
        } finally {
            // Runs on return as well as on exceptions thrown by the retry callback.
            $this->wait = $configuredWait;
        }

        $this->logger?->error('Stop crawling');

        throw new LoadingException('Stopped crawling because of repeated error responses.');
    }

    /**
     * Reads a numeric Retry-After header from the response.
     *
     * Returns null when the header is missing or not numeric. Negative values are treated as 0, because
     * sleep() does not accept negative values.
     *
     * @throws LoadingException when the value exceeds the configured max wait time
     */
    protected function getWaitTimeFromResponse(ResponseInterface $response): ?int
    {
        $retryAfterHeader = $response->getHeader('Retry-After');

        if (!empty($retryAfterHeader)) {
            // Only the first header value is considered.
            $retryAfterHeader = reset($retryAfterHeader);

            if (is_numeric($retryAfterHeader)) {
                $waitFor = max(0, (int) $retryAfterHeader);

                if ($waitFor > $this->maxWait) {
                    $this->retryAfterExceedsLimitMessage($response);
                }

                return $waitFor;
            }
        }

        return null;
    }

    /**
     * Builds a "<status code> (<reason phrase>)" string for the status codes listed in $waitErrors,
     * or '?' for any other status code.
     */
    protected function getResponseCodeAndReasonPhrase(RespondedRequest|ResponseInterface $respondedRequest): string
    {
        $response = $respondedRequest instanceof RespondedRequest ? $respondedRequest->response : $respondedRequest;

        $statusCode = $response->getStatusCode();

        if (array_key_exists($statusCode, $this->waitErrors)) {
            return $statusCode . ' (' . $this->waitErrors[$statusCode] . ')';
        }

        return '?';
    }

    /**
     * Logs a warning with the requested URI and the received error status.
     */
    protected function logReceivedErrorResponseMessage(RespondedRequest $respondedRequest): void
    {
        $statusCodeAndReasonPhrase = $this->getResponseCodeAndReasonPhrase($respondedRequest);

        $this->logger?->warning(
            'Request to ' . $respondedRequest->requestedUri() . ' returned ' . $statusCodeAndReasonPhrase,
        );
    }

    protected function logWaitForRetryMessage(int $retryNumber): void
    {
        $this->logger?->warning('Will wait for ' . $this->wait[$retryNumber] .
' seconds and then retry'); } protected function logRepeatedErrorMessage(RespondedRequest $respondedRequest): void { $statusCodeAndReasonPhrase = $this->getResponseCodeAndReasonPhrase($respondedRequest); $this->logger?->warning('Retry again received an error response: ' . $statusCodeAndReasonPhrase); } /** * @throws LoadingException */ protected function retryAfterExceedsLimitMessage(ResponseInterface $response): string { $statusCodeAndReasonPhrase = $this->getResponseCodeAndReasonPhrase($response); $message = 'Retry-After header in ' . $statusCodeAndReasonPhrase . ' response, requires to wait longer ' . 'than the defined max wait time for this case. If you want to increase this limit, set it ' . 'in the ErrorResponseHandler of your HttpLoader instance.'; $this->logger?->error($message); throw new LoadingException($message); } } ================================================ FILE: src/Loader/Http/Politeness/RobotsTxtHandler.php ================================================ */ protected array $robotsTxts = []; protected bool $ignoreWildcardRules = false; public function __construct( protected Loader $loader, protected ?LoggerInterface $logger = null, ) { $this->userAgent = $this->loader->userAgent(); } public function ignoreWildcardRules(): void { $this->ignoreWildcardRules = true; } /** * @throws Exception */ public function isAllowed(string|UriInterface|Url $url): bool { if (!$this->userAgent instanceof BotUserAgent) { return true; } $url = $this->getUrlInstance($url); if ($url->path() === '/robots.txt') { return true; } $robotsTxt = $this->getRobotsTxtFor($url); if ($this->ignoreWildcardRules) { return !$robotsTxt->isExplicitlyNotAllowedFor($url, $this->userAgent->productToken()); } return $robotsTxt->isAllowed($url, $this->userAgent->productToken()); } /** * @return string[] * @throws InvalidRobotsTxtFileException */ public function getSitemaps(string|UriInterface|Url $url): array { return $this->getRobotsTxtFor($url)->sitemaps(); } /** * @throws 
InvalidRobotsTxtFileException|Exception
     */
    protected function getRobotsTxtFor(string|UriInterface|Url $url): RobotsTxt
    {
        $url = $this->getUrlInstance($url);

        $root = $url->root();

        // Parsed robots.txt files are kept per root, so each one is loaded only once.
        if (isset($this->robotsTxts[$root])) {
            return $this->robotsTxts[$root];
        }

        $robotsTxtContent = $this->loadRobotsTxtContent($root . '/robots.txt');

        try {
            $this->robotsTxts[$root] = RobotsTxt::parse($robotsTxtContent);
        } catch (Exception $exception) {
            $this->logger?->warning('Failed to parse robots.txt: ' . $exception->getMessage());

            // Fall back to an empty robots.txt when the content can't be parsed.
            $this->robotsTxts[$root] = RobotsTxt::parse('');
        }

        return $this->robotsTxts[$root];
    }

    /**
     * Loads the robots.txt file from the given URL and returns its body, or an empty string when the loader
     * returned no response.
     *
     * An HttpLoader is always switched to its HTTP client for this request. When it was using the headless
     * browser before, it is switched back afterwards, also when loading throws.
     */
    protected function loadRobotsTxtContent(string $robotsTxtUrl): string
    {
        $usedHeadlessBrowser = false;

        if ($this->loader instanceof HttpLoader) {
            // If loader is set to use headless browser, temporary switch to using PSR-18 HTTP Client.
            $usedHeadlessBrowser = $this->loader->usesHeadlessBrowser();

            $this->loader->useHttpClient();
        }

        try {
            $response = $this->loader->load($robotsTxtUrl);
        } finally {
            // Restore the previous loading mode even if load() throws, so the loader isn't left
            // using the HTTP client for all following requests.
            if ($this->loader instanceof HttpLoader && $usedHeadlessBrowser) {
                $this->loader->useHeadlessBrowser();
            }
        }

        return $response ?
Http::getBodyString($response) : ''; } protected function getUrlInstance(string|UriInterface|Url $url): Url { if (is_string($url) || $url instanceof UriInterface) { return Url::parse($url); } return $url; } } ================================================ FILE: src/Loader/Http/Politeness/Throttler.php ================================================ */ protected array $latestRequestTimes = []; /** * @var array */ protected array $latestResponseTimes = []; /** * @var array */ protected array $latestDurations = []; protected Microseconds|MultipleOf $from; protected Microseconds|MultipleOf $to; protected Microseconds $min; /** * @var string[] */ private array $_currentRequestUrls = []; /** * @throws InvalidArgumentException */ public function __construct( Microseconds|MultipleOf|null $from = null, Microseconds|MultipleOf|null $to = null, ?Microseconds $min = null, protected ?Microseconds $max = null, ) { $this->from = $from ?? new MultipleOf(1.0); $this->to = $to ?? new MultipleOf(2.0); $this->validateFromAndTo(); $this->min = $min ?? 
Microseconds::fromSeconds(0.25); } /** * @throws InvalidArgumentException */ public function waitBetween(Microseconds|MultipleOf $from, Microseconds|MultipleOf $to): static { $this->from = $from; $this->to = $to; $this->validateFromAndTo(); return $this; } public function waitAtLeast(Microseconds $seconds): static { $this->min = $seconds; return $this; } public function waitAtMax(Microseconds $seconds): static { $this->max = $seconds; return $this; } /** * @throws Exception */ public function trackRequestStartFor(UriInterface $url): void { $domain = $this->getDomain($url); $this->latestRequestTimes[$domain] = $this->time(); $this->_internalTrackStartFor($url); } /** * @throws Exception */ public function trackRequestEndFor(UriInterface $url): void { if (!$this->_requestToUrlWasStarted($url)) { return; } $domain = $this->getDomain($url); if (!isset($this->latestRequestTimes[$domain])) { return; } $this->latestResponseTimes[$domain] = $responseTime = $this->time(); $this->latestDurations[$domain] = $responseTime->subtract($this->latestRequestTimes[$domain]); unset($this->latestRequestTimes[$domain]); $this->_internalTrackEndFor($url); } /** * @throws Exception */ public function waitForGo(UriInterface $url): void { $domain = $this->getDomain($url); if (!isset($this->latestDurations[$domain])) { return; } $waitUntil = $this->calcWaitUntil($this->latestDurations[$domain], $this->latestResponseTimes[$domain]); $now = $this->time(); if ($now->isGreaterThanOrEqual($waitUntil)) { return; } $wait = $waitUntil->subtract($now); usleep($wait->value); } protected function time(): Microseconds { return Microseconds::fromSeconds(microtime(true)); } /** * @throws Exception */ protected function getDomain(UriInterface $url): string { $domain = Url::parse($url)->domain(); if (!$domain) { $domain = $url->getHost(); } if (!is_string($domain)) { $domain = '*'; } return $domain; } protected function calcWaitUntil( Microseconds $latestResponseDuration, Microseconds $latestResponseTime, 
): Microseconds { $from = $this->from instanceof MultipleOf ? $this->from->calc($latestResponseDuration) : $this->from; $to = $this->to instanceof MultipleOf ? $this->to->calc($latestResponseDuration) : $this->to; $waitValue = $this->getRandBetween($from, $to); if ($this->min->isGreaterThan($waitValue)) { $waitValue = $this->min; } if ($this->max && $this->max->isLessThan($waitValue)) { $waitValue = $this->max; } return $latestResponseTime->add($waitValue); } protected function getRandBetween(Microseconds $from, Microseconds $to): Microseconds { if ($from->equals($to)) { return $from; } return new Microseconds(rand($from->value, $to->value)); } /** * @internal */ protected function _internalTrackStartFor(UriInterface $url): void { $urlString = (string) $url; $this->_currentRequestUrls[$urlString] = $urlString; } /** * @internal */ protected function _internalTrackEndFor(UriInterface $url): void { unset($this->_currentRequestUrls[(string) $url]); } protected function _requestToUrlWasStarted(UriInterface $url): bool { $urlString = (string) $url; if (array_key_exists($urlString, $this->_currentRequestUrls)) { return true; } return false; } protected function validateFromAndTo(): void { if (!$this->fromAndToAreOfSameType()) { throw new InvalidArgumentException('From and to values must be of the same type (Seconds or MultipleOf).'); } if ($this->fromIsGreaterThanTo()) { throw new InvalidArgumentException('From value can\'t be greater than to value.'); } } protected function fromAndToAreOfSameType(): bool { return ($this->from instanceof Microseconds && $this->to instanceof Microseconds) || ($this->from instanceof MultipleOf && $this->to instanceof MultipleOf); } protected function fromIsGreaterThanTo(): bool { if ($this->from instanceof Microseconds && $this->to instanceof Microseconds) { return $this->from->isGreaterThan($this->to); } if ($this->from instanceof MultipleOf && $this->to instanceof MultipleOf) { return $this->from->factorIsGreaterThan($this->to); } return 
false; } } ================================================ FILE: src/Loader/Http/Politeness/TimingUnits/MultipleOf.php ================================================ factor, 2) * 100); $result = (int) round(($microseconds->value * $factorTwoDecimalsAsInt) / 100); return new Microseconds($result); } public function factorIsGreaterThan(MultipleOf $multipleOf): bool { return $this->factor > $multipleOf->factor; } } ================================================ FILE: src/Loader/Http/ProxyManager.php ================================================ proxies = array_values($this->proxies); } public function singleProxy(): bool { return count($this->proxies) === 1; } public function hasOnlySingleProxy(): bool { return count($this->proxies) === 1; } public function hasMultipleProxies(): bool { return count($this->proxies) > 1; } public function getProxy(): string { if ($this->hasOnlySingleProxy()) { return $this->proxies[0]; } if ($this->lastUsedProxy === null || !isset($this->proxies[$this->lastUsedProxy + 1])) { $this->lastUsedProxy = 0; } else { $this->lastUsedProxy += 1; } return $this->proxies[$this->lastUsedProxy]; } } ================================================ FILE: src/Loader/Loader.php ================================================ */ protected array $hooks = [ 'beforeLoad' => [], 'onCacheHit' => [], 'onSuccess' => [], 'onError' => [], 'afterLoad' => [], ]; /** * @var array */ private array $_hooksCalledInCurrentLoadCall = []; public function __construct( protected UserAgentInterface $userAgent, ?LoggerInterface $logger = null, ) { $this->logger = $logger ?? 
new CliLogger(); } public function beforeLoad(callable $callback): void { $this->addHookCallback('beforeLoad', $callback); } public function onCacheHit(callable $callback): void { $this->addHookCallback('onCacheHit', $callback); } public function onSuccess(callable $callback): void { $this->addHookCallback('onSuccess', $callback); } public function onError(callable $callback): void { $this->addHookCallback('onError', $callback); } public function afterLoad(callable $callback): void { $this->addHookCallback('afterLoad', $callback); } public function setCache(CacheInterface $cache): static { $this->cache = $cache; return $this; } public function userAgent(): UserAgentInterface { return $this->userAgent; } /** * Can be implemented in a child class to check if it is allowed to load a certain uri (e.g. check robots.txt) * Throw a LoadingException when it's not allowed and $throwsException is set to true. */ protected function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException = false): bool { return true; } protected function callHook(string $hook, mixed ...$arguments): void { if (!array_key_exists($hook, $this->hooks)) { return; } if (array_key_exists($hook, $this->_hooksCalledInCurrentLoadCall)) { $this->logger->warning( $hook . ' was already called in this load call. Probably a problem in the loader implementation.', ); } if ( $hook === 'afterLoad' && !empty($this->hooks[$hook]) && !array_key_exists('beforeLoad', $this->_hooksCalledInCurrentLoadCall) ) { $this->logger->warning( 'The afterLoad hook was called without a preceding call to the beforeLoad hook. Therefore don\'t ' . 'run the hook callbacks. 
Most likely an exception/error occurred before the beforeLoad hook call.', ); return; } $arguments[] = $this->logger; foreach ($this->hooks[$hook] as $callback) { call_user_func($callback, ...$arguments); } $this->_hooksCalledInCurrentLoadCall[$hook] = true; } protected function logger(): LoggerInterface { return $this->logger; } protected function addHookCallback(string $hook, callable $callback): void { $this->hooks[$hook][] = $callback; } /** * @internal * @return void */ protected function _resetCalledHooks(): void { $this->_hooksCalledInCurrentLoadCall = []; } } ================================================ FILE: src/Loader/LoaderInterface.php ================================================ log('emergency', $message, $context); } public function alert(string|Stringable $message, array $context = []): void { $this->log('alert', $message, $context); } public function critical(string|Stringable $message, array $context = []): void { $this->log('critical', $message, $context); } public function error(string|Stringable $message, array $context = []): void { $this->log('error', $message, $context); } public function warning(string|Stringable $message, array $context = []): void { $this->log('warning', $message, $context); } public function notice(string|Stringable $message, array $context = []): void { $this->log('notice', $message, $context); } public function info(string|Stringable $message, array $context = []): void { $this->log('info', $message, $context); } public function debug(string|Stringable $message, array $context = []): void { $this->log('debug', $message, $context); } /** * @param mixed $level * @param mixed[] $context */ public function log($level, string|Stringable $message, array $context = []): void { if (!is_string($level)) { throw new InvalidArgumentException('Level must be string.'); } if (!in_array($level, ['emergency', 'alert', 'critical', 'error', 'warning', 'notice', 'info', 'debug'], true)) { throw new UnexpectedValueException('Unknown 
log level.'); } $this->printTimeAndLevel($level); echo $message . "\n"; } protected function printTimeAndLevel(string $level): void { echo $this->time() . " \033[0;" . $this->levelColor($level) . "m[" . strtoupper($level) . "]\033[0m "; } protected function time(): string { return (new DateTime())->format('H:i:s:u'); } protected function levelColor(string $level): string { $levelColors = [ 'emergency' => '91', // bright red 'alert' => '91', 'critical' => '91', 'error' => '31', // red 'warning' => '36', // cyan 'notice' => '34', // blue 'info' => '32', // green 'debug' => '33', // yellow ]; return $levelColors[$level]; } } ================================================ FILE: src/Logger/PreStepInvocationLogger.php ================================================ > */ public array $messages = []; public function emergency(string|Stringable $message, array $context = []): void { $this->log('emergency', $message, $context); } public function alert(string|Stringable $message, array $context = []): void { $this->log('alert', $message, $context); } public function critical(string|Stringable $message, array $context = []): void { $this->log('critical', $message, $context); } public function error(string|Stringable $message, array $context = []): void { $this->log('error', $message, $context); } public function warning(string|Stringable $message, array $context = []): void { $this->log('warning', $message, $context); } public function notice(string|Stringable $message, array $context = []): void { $this->log('notice', $message, $context); } public function info(string|Stringable $message, array $context = []): void { $this->log('info', $message, $context); } public function debug(string|Stringable $message, array $context = []): void { $this->log('debug', $message, $context); } /** * @param mixed $level * @param mixed[] $context */ public function log($level, string|Stringable $message, array $context = []): void { if (!is_string($level)) { throw new 
InvalidArgumentException('Level must be string.');
        }

        if (!in_array($level, ['emergency', 'alert', 'critical', 'error', 'warning', 'notice', 'info', 'debug'], true)) {
            throw new UnexpectedValueException('Unknown log level.');
        }

        // Messages are only collected here, see passToOtherLogger().
        $this->messages[] = ['level' => $level, 'message' => $message];
    }

    /**
     * Replays all collected messages on the given logger, using the method named after each message's level.
     */
    public function passToOtherLogger(LoggerInterface $logger): void
    {
        foreach ($this->messages as $message) {
            $logger->{$message['level']}($message['message']);
        }
    }
}

================================================ FILE: src/Output.php ================================================ data = $result->data; } }

    /**
     * Sets a value for a key. An empty key is replaced by a generated "unnamed<N>" key.
     *
     * When the key already exists, the values are collected in a list: a non-array or associative array value
     * is wrapped in a new list together with the new value, a list value gets the new value appended.
     */
    public function set(string $key, mixed $value): self
    {
        if ($key === '') {
            $key = $this->getUnnamedKey();
        }

        if (array_key_exists($key, $this->data)) {
            if (!is_array($this->data[$key]) || $this->isAssociativeArray($this->data[$key])) {
                $this->data[$key] = [$this->data[$key], $value];
            } else {
                $this->data[$key][] = $value;
            }
        } else {
            $this->data[$key] = $value;
        }

        return $this;
    }

    /**
     * True when the key exists, also when its value is null.
     */
    public function has(string $key): bool
    {
        return array_key_exists($key, $this->data);
    }

    /**
     * Returns the value for the key, or $default when the key does not exist.
     */
    public function get(string $key, mixed $default = null): mixed
    {
        if ($this->has($key)) {
            return $this->data[$key];
        }

        return $default;
    }

    /**
     * Returns the data as array. When the only element is an associative array stored under a generated
     * "unnamed" key, that inner array is returned directly.
     *
     * @return mixed[]
     */
    public function toArray(): array
    {
        $data = OutputTypeHelper::recursiveChildObjectsToArray($this->data);

        // str_contains() takes the haystack first: look for 'unnamed' in the key, not the other way round.
        // The key is cast to string, because PHP converts numeric string keys to integers.
        if (
            count($data) === 1 &&
            str_contains((string) array_key_first($data), 'unnamed') &&
            OutputTypeHelper::isAssociativeArray($data[array_key_first($data)])
        ) {
            return $data[array_key_first($data)];
        }

        return $data;
    }

    /**
     * Finds the first "unnamed<N>" key (N starting at 1) that is not used yet.
     */
    private function getUnnamedKey(): string
    {
        $i = 1;

        // Check for key existence, so a key that holds a null value is not handed out a second time.
        while ($this->has('unnamed' . $i)) {
            $i++;
        }

        return 'unnamed' .
$i; } /** * @param mixed[] $array */ private function isAssociativeArray(array $array): bool { foreach ($array as $key => $value) { return is_string($key); } return false; } } ================================================ FILE: src/Steps/BaseStep.php ================================================ */ protected array $subCrawlers = []; protected ?LoggerInterface $logger = null; protected ?string $useInputKey = null; protected bool|string $uniqueInput = false; /** * @var array */ protected array $uniqueInputKeys = []; protected bool|string $uniqueOutput = false; /** * @var array */ protected array $uniqueOutputKeys = []; /** * @var array */ protected array $refiners = []; protected ?string $outputKey = null; protected ?int $maxOutputs = null; protected int $currentOutputCount = 0; private ?Input $fullOriginalInput = null; /** * @param Input $input * @return Generator */ abstract public function invokeStep(Input $input): Generator; public function addLogger(LoggerInterface $logger): static { if ($this->logger instanceof PreStepInvocationLogger) { $this->logger->passToOtherLogger($logger); } $this->logger = $logger; if (!empty($this->refiners)) { foreach ($this->refiners as $refiner) { if ($refiner instanceof RefinerInterface) { $refiner->addLogger($logger); } elseif (is_array($refiner) && $refiner['refiner'] instanceof RefinerInterface) { $refiner['refiner']->addLogger($logger); } } } return $this; } public function setParentCrawler(Crawler $crawler): static { $this->parentCrawler = $crawler; return $this; } /** * @param string|string[]|null $keys */ public function keep(string|array|null $keys = null): static { if ($keys === null) { $this->keep = true; } else { $this->keep = $keys; } return $this; } public function keepAs(string $key): static { $this->keepAs = $key; return $this; } /** * @param string|string[]|null $keys */ public function keepFromInput(string|array|null $keys = null): static { if ($keys === null) { $this->keepFromInput = true; } else { 
$this->keepFromInput = $keys; } return $this; } public function keepInputAs(string $key): static { $this->keepInputAs = $key; return $this; } public function keepsAnything(): bool { return $this->keepsAnythingFromOutputData() || $this->keepsAnythingFromInputData(); } public function keepsAnythingFromInputData(): bool { return $this->keepFromInput !== false || $this->keepInputAs !== null; } public function keepsAnythingFromOutputData(): bool { return $this->keep !== false || $this->keepAs !== null; } public function useInputKey(string $key): static { $this->useInputKey = $key; return $this; } public function uniqueInputs(?string $key = null): static { $this->uniqueInput = $key ?? true; return $this; } public function uniqueOutputs(?string $key = null): static { $this->uniqueOutput = $key ?? true; return $this; } public function refineOutput( string|Closure|RefinerInterface $keyOrRefiner, Closure|RefinerInterface|null $refiner = null, ): static { if ($refiner instanceof RefinerInterface && $this->logger) { $refiner->addLogger($this->logger); } elseif ($keyOrRefiner instanceof RefinerInterface && $this->logger) { $keyOrRefiner->addLogger($this->logger); } if (is_string($keyOrRefiner) && $refiner === null) { throw new InvalidArgumentException( 'You have to provide a Refiner (Closure or instance of RefinerInterface)', ); } elseif (is_string($keyOrRefiner)) { $this->refiners[] = ['key' => $keyOrRefiner, 'refiner' => $refiner]; } else { $this->refiners[] = $keyOrRefiner; } return $this; } public function outputKey(string $key): static { $this->outputKey = $key; return $this; } public function maxOutputs(int $maxOutputs): static { $this->maxOutputs = $maxOutputs; return $this; } public function resetAfterRun(): void { $this->uniqueOutputKeys = $this->uniqueInputKeys = []; $this->currentOutputCount = 0; } /** * Define what type of outputs the step will yield * * Defining this in any step, helps to identify potential errors upfront when a crawler run is started. 
* If the step will only yield associative array (or object) outputs,
     * return StepOutputType::AssociativeArrayOrObject.
     * If it will only yield scalar (string, int, float, bool) outputs, return StepOutputType::Scalar.
     *
     * If it can potentially yield both types, but you can determine what it will yield, based on the state of the
     * class, please implement this. Only if it can't be defined upfront, because it depends on the input, return
     * StepOutputType::Mixed.
     *
     * @return StepOutputType
     */
    public function outputType(): StepOutputType
    {
        return StepOutputType::Mixed;
    }

    /**
     * Check the step's keep configuration before the crawler run starts.
     *
     * Throws when data shall be kept by key (keep() / keepFromInput()) although the relevant value is known
     * to be scalar, and logs a warning when that can't be ruled out (StepOutputType::Mixed).
     * For the first step of a crawler, the initial inputs are passed instead of a previous step.
     *
     * @param BaseStep|mixed[] $previousStepOrInitialInputs
     * @throws PreRunValidationException
     */
    public function validateBeforeRun(BaseStep|array $previousStepOrInitialInputs): void
    {
        if (!$previousStepOrInitialInputs instanceof BaseStep) {
            $this->validateFirstStepBeforeRun($previousStepOrInitialInputs);
        }

        // Output side: keep() without keepAs() (and without an output key) needs keyed outputs.
        if ($this->keep !== false && $this->keepAs === null && $this->outputKey === null) {
            $outputType = $this->outputType();

            if ($outputType === StepOutputType::Scalar) {
                throw new PreRunValidationException(
                    'Keeping data from a step that yields scalar value outputs (= single string/int/bool/float with ' .
                    'no key like in an associative array or object) requires to define a key, by using keepAs() ' .
                    'instead of keep()',
                );
            } elseif ($outputType === StepOutputType::Mixed) {
                $this->logger?->warning(
                    $this->getPreValidationRunMessageStartWithStepClassName() . ' potentially yields scalar value ' .
                    'outputs (= single string/int/bool/float with no key like in an associative array or object). ' .
                    'If it does (yield a scalar value output), it can not keep that output value, because it needs ' .
                    'a key for that. To avoid this, define a key for scalar outputs by using the keepAs() method.',
                );
            }
        }

        // Input side: keepFromInput() without keepInputAs() needs keyed outputs from the previous step.
        if (
            $this->keepFromInput !== false &&
            $previousStepOrInitialInputs instanceof BaseStep &&
            $this->keepInputAs === null
        ) {
            $previousStepOutputType = $previousStepOrInitialInputs->outputType();

            if ($previousStepOutputType === StepOutputType::Scalar) {
                // The method to name a kept input is keepInputAs(); keepAs() only applies to outputs.
                throw new PreRunValidationException(
                    'You are trying to keep data from a step\'s input with keepFromInput(), but the step before it ' .
                    'returns scalar value outputs (= single string/int/bool/float with no key like in an associative ' .
                    'array or object). Please define a key for the input data to keep, by using keepInputAs() instead.',
                );
            } elseif ($previousStepOutputType === StepOutputType::Mixed) {
                $this->logger?->warning(
                    $this->getPreValidationRunMessageStartWithStepClassName($previousStepOrInitialInputs) .
                    ' potentially yields scalar value outputs (= single string/int/bool/float with no key like in ' .
                    'an associative array or object). If it does (yield a scalar value output) the next step can not ' .
                    'keep it by using keepFromInput(). To avoid this, define a key for scalar inputs by using the ' .
                    'keepInputAs() method.',
                );
            }
        }
    }

    /**
     * Register a crawler builder closure for the output property $for (see runSubCrawlersFor()).
     */
    public function subCrawlerFor(string $for, Closure $crawlerBuilder): static
    {
        $this->subCrawlers[$for] = $crawlerBuilder;

        return $this;
    }

    /**
     * In case useInputKey() was used, use this method to store the original input so you can still
     * access it later.
     */
    protected function storeOriginalInput(Input $input): void
    {
        $this->fullOriginalInput = $input;
    }

    /**
     * In case useInputKey() was used, this method shall still provide access to the full input object,
     * that the step was last called with.
*/
    protected function getFullOriginalInput(): ?Input
    {
        return $this->fullOriginalInput;
    }

    /**
     * Run the registered sub crawlers on the matching properties of an output.
     *
     * For every registered key with a non-null value in the output, a sub crawler is built from the parent
     * crawler, fed with that value and run. The property is then replaced with null (no results),
     * the single result as array, or a list of result arrays.
     */
    protected function runSubCrawlersFor(Output $output): Output
    {
        if (empty($this->subCrawlers)) {
            return $output;
        }

        if (!$output->isArrayWithStringKeys()) {
            $this->logger?->error(
                'The sub crawler feature works only with outputs that are associative arrays (arrays with ' .
                'string keys). The feature was called with an output of type ' . gettype($output->get()) . '.',
            );

            return $output;
        }

        if (!$this->parentCrawler) {
            $this->logger?->error('Can\'t make sub crawler, because the step has no reference to the parent crawler.');

            return $output;
        }

        foreach ($this->subCrawlers as $forKey => $crawlerBuilder) {
            $outputValue = $output->getProperty($forKey);

            if ($outputValue === null) {
                continue;
            }

            $crawler = $crawlerBuilder($this->parentCrawler->getSubCrawler());

            if (is_array($outputValue)) {
                $crawler->inputs($outputValue);
            } else {
                $crawler->input($outputValue);
            }

            $results = [];

            foreach ($crawler->run() as $result) {
                $results[] = $result;
            }

            $replacement = match (count($results)) {
                0 => null,
                1 => $results[0]->toArray(),
                default => array_map(static fn(Result $result) => $result->toArray(), $results),
            };

            $output = $output->withPropertyValue($forKey, $replacement);
        }

        return $output;
    }

    /**
     * If you want to define aliases for certain output keys that can be used with keep(),
     * define this method in the child class and return the mappings.
     *
     * @return array alias => output key
     */
    protected function outputKeyAliases(): array
    {
        return [];
    }

    /**
     * Validation for the first step of a crawler, which receives the initial inputs.
     *
     * Logs an error when there are no initial inputs at all. When keepFromInput() is used, every
     * initial input has to be an associative array or object.
     *
     * @param mixed[] $initialInputs
     * @throws PreRunValidationException
     */
    protected function validateFirstStepBeforeRun(array $initialInputs): void
    {
        if ($initialInputs === []) {
            $this->logger?->error('You did not provide any initial inputs for your crawler.');

            return;
        }

        if ($this->keepFromInput === false) {
            return;
        }

        foreach ($initialInputs as $input) {
            if (OutputTypeHelper::isAssociativeArrayOrObject($input)) {
                continue;
            }

            throw new PreRunValidationException(
                'The initial inputs contain scalar values (without keys) and you are calling keepFromInput() ' .
                'on the first step (if not the first step in your whole crawler, check sub crawlers). Please ' .
                'use keepInputAs() instead with a key, that the input value should have in the kept data.',
            );
        }
    }

    /**
     * Build the beginning of a pre-run validation message, naming the step (or $this when none is given).
     *
     * Anonymous class steps are described via their parent class, unless that parent is one of the
     * library's own base classes.
     */
    protected function getPreValidationRunMessageStartWithStepClassName(?BaseStep $step = null): string
    {
        $stepClassName = $this->getStepClassName($step);

        if ($stepClassName) {
            return 'The ' . $stepClassName . ' step';
        }

        $parentClassName = $this->getParentStepClassName($step);

        $isLibraryBaseClass = in_array(
            $parentClassName,
            ['Crwlr\\Crawler\\Steps\\Step', 'Crwlr\\Crawler\\Steps\\BaseStep'],
            true,
        );

        if ($parentClassName && !$isLibraryBaseClass) {
            return 'An anonymous class step, that is extending the ' . $parentClassName . ' step';
        }

        return 'An anonymous class step';
    }

    /**
     * Class name of the step (or of $this), or null when it is an anonymous class.
     */
    protected function getStepClassName(?BaseStep $step = null): ?string
    {
        $stepClassName = get_class($step ?? $this);

        return str_contains($stepClassName, '@anonymous') ? null : $stepClassName;
    }

    protected function getParentStepClassName(?BaseStep $step = null): ?string
    {
        $parents = class_parents($step ??
$this);
        $firstLevelParent = reset($parents);

        if ($firstLevelParent && !str_contains($firstLevelParent, '@anonymous')) {
            return $firstLevelParent;
        }

        return null;
    }

    /**
     * Apply the useInputKey() setting to an input.
     *
     * Returns the input unchanged when no key is configured. Otherwise the value is taken from the
     * input array, or, if it isn't found there, from the data kept from previous steps. When the key
     * can't be found in either, a warning is logged and null is returned.
     */
    protected function getInputKeyToUse(Input $input): ?Input
    {
        if ($this->useInputKey === null) {
            return $input;
        }

        $inputValue = $input->get();
        $inputIsArray = is_array($inputValue);

        if ($inputIsArray && array_key_exists($this->useInputKey, $inputValue)) {
            return $input->withValue($inputValue[$this->useInputKey]);
        }

        if (array_key_exists($this->useInputKey, $input->keep)) {
            return $input->withValue($input->keep[$this->useInputKey]);
        }

        $warningMessage = $inputIsArray
            ? 'Can\'t get key from input, because it does not exist.'
            : 'Can\'t get key from input, because input is of type ' . gettype($inputValue) . ' instead of array.';

        if (!empty($input->keep)) {
            $warningMessage .= ' Key also is not present in data kept from previous steps.';
        }

        $this->logger?->warning($warningMessage);

        return null;
    }

    /**
     * Check whether an input or output was seen before in this run, and remember it if not.
     */
    protected function inputOrOutputIsUnique(Io $io): bool
    {
        $isInput = $io instanceof Input;
        $uniquenessSetting = $isInput ? $this->uniqueInput : $this->uniqueOutput;
        $key = is_string($uniquenessSetting) ? $io->setKey($uniquenessSetting) : $io->setKey();

        if ($isInput) {
            if (isset($this->uniqueInputKeys[$key])) {
                return false;
            }

            $this->uniqueInputKeys[$key] = true; // Don't keep value, just the key, to keep memory usage low.

            return true;
        }

        if (isset($this->uniqueOutputKeys[$key])) {
            return false;
        }

        $this->uniqueOutputKeys[$key] = true;

        return true;
    }

    /**
     * Run all registered refiners on an output value, in the order they were registered.
     *
     * A refiner registered for a key only receives (and replaces) that key's value, if it is set in the
     * output. Otherwise it works on the whole output value. Closures are bound to the step and
     * additionally receive the input value.
     */
    protected function applyRefiners(mixed $outputValue, mixed $inputValue): mixed
    {
        foreach ($this->refiners as $refiner) {
            $isKeyed = is_array($refiner);
            $valueToRefine = $isKeyed && isset($outputValue[$refiner['key']])
                ? $outputValue[$refiner['key']]
                : $outputValue;
            $actualRefiner = $isKeyed ? $refiner['refiner'] : $refiner;

            $refinedValue = $actualRefiner instanceof Closure
                ? $actualRefiner->call($this, $valueToRefine, $inputValue)
                : $actualRefiner->refine($valueToRefine);

            if ($isKeyed && isset($outputValue[$refiner['key']])) {
                $outputValue[$refiner['key']] = $refinedValue;
            } else {
                $outputValue = $refinedValue;
            }
        }

        return $outputValue;
    }

    /**
     * Create the Output for a yielded value: carries over kept data, runs sub crawlers, keeps data.
     */
    protected function makeOutput(mixed $outputData, Input $input): Output
    {
        $output = $this->runSubCrawlersFor(new Output($outputData, $input->keep));

        $this->keepData($output, $input);

        return $output;
    }

    /**
     * Add the configured input and output data to the output's kept data (input data first).
     */
    protected function keepData(Output $output, Input $input): void
    {
        if (!$this->keepsAnything()) {
            return;
        }

        if ($this->keepsAnythingFromInputData()) {
            $fromInput = $this->getInputDataToKeep($input, $output->keep);

            if (!empty($fromInput)) {
                $output->keep($fromInput);
            }
        }

        if ($this->keepsAnythingFromOutputData()) {
            $fromOutput = $this->getOutputDataToKeep($output, $output->keep);

            if (!empty($fromOutput)) {
                $output->keep($fromOutput);
            }
        }
    }

    /**
     * @param array $alreadyKept
     * @return mixed[]|null
     */
    protected function getOutputDataToKeep(Output $output, array $alreadyKept): ?array
    {
        return $this->getInputOrOutputDataToKeep($output, $alreadyKept);
    }

    /**
     * @param array $alreadyKept
     * @return mixed[]|null
     */
    protected function
getInputDataToKeep(Input $input, array $alreadyKept): ?array
    {
        return $this->getInputOrOutputDataToKeep($input, $alreadyKept);
    }

    /**
     * Determine the data to keep from an input or output, based on the step's keep settings.
     *
     * - With a keepAs()/keepInputAs() key and either a scalar value or no keep()/keepFromInput() setting,
     *   the whole value is kept under that key.
     * - With keep()/keepFromInput() and a scalar value, an error is logged and the value is kept with
     *   the next free 'unnamedN' key.
     * - Otherwise the value is treated as array (objects are converted) and either kept completely
     *   (true), or reduced to the requested key(s).
     *
     * @param array $alreadyKept
     * @return mixed[]|null null when nothing is configured to be kept for this kind of Io
     */
    protected function getInputOrOutputDataToKeep(Io $io, array $alreadyKept): ?array
    {
        $isOutput = $io instanceof Output;
        $keepProperty = $isOutput ? $this->keep : $this->keepFromInput;
        $keepAsProperty = $isOutput ? $this->keepAs : $this->keepInputAs;

        $data = $io->get();
        $isScalarValue = OutputTypeHelper::isScalar($data);

        if ($keepAsProperty !== null && ($isScalarValue || $keepProperty === false)) {
            return [$keepAsProperty => $data];
        } elseif ($keepProperty !== false) {
            if ($isScalarValue) {
                // Name the methods that actually apply: keepAs()/keep() for outputs,
                // keepInputAs()/keepFromInput() for inputs.
                [$variableMessagePart, $subject, $advice] = $isOutput
                    ? ['yielded an output', 'output', 'keepAs() instead of keep()']
                    : ['received an input', 'input', 'keepInputAs() instead of keepFromInput()'];

                $this->logger?->error(
                    'A ' . get_class($this) . ' step ' . $variableMessagePart . ' that is neither an associative ' .
                    'array, nor an object, so there is no key for the value to keep. Please define a key for the ' .
                    $subject . ' by using ' . $advice . '. The value is now kept with an \'unnamed\' key.',
                );

                return [$this->nextUnnamedKey($alreadyKept) => $data];
            }

            $data = !is_array($data) ? OutputTypeHelper::objectToArray($data) : $data;

            if ($keepProperty === true) {
                return $data;
            } elseif (is_string($keepProperty)) {
                return [$keepProperty => $this->getOutputPropertyFromArray($keepProperty, $data)];
            }

            return $this->mapKeepProperties($data, $keepProperty);
        }

        return null;
    }

    /**
     * Find the first 'unnamedN' key (N starting at 1) that is not used in $data yet.
     *
     * @param array $data
     * @return string
     */
    protected function nextUnnamedKey(array $data): string
    {
        $i = 1;

        // array_key_exists() instead of isset(): a key holding null is taken as well.
        while (array_key_exists('unnamed' . $i, $data)) {
            $i++;
        }

        return 'unnamed' . $i;
    }

    /**
     * Reduce $data to the properties listed in $keep.
     *
     * Entries with an int key keep the property under its own name, entries with a string key keep the
     * property named by the value under the name given as key.
     *
     * @param mixed[] $data
     * @param array $keep
     * @return mixed[]
     */
    protected function mapKeepProperties(array $data, array $keep): array
    {
        $keepData = [];

        foreach ($keep as $key => $value) {
            if (is_int($key)) {
                $keepData[$value] = $this->getOutputPropertyFromArray($value, $data);
            } elseif (is_string($key)) {
                $keepData[$key] = $this->getOutputPropertyFromArray($value, $data);
            }
        }

        return $keepData;
    }

    /**
     * Get a property from an output array by key, by key alias, or by dot notation path.
     *
     * @param mixed[] $data
     */
    protected function getOutputPropertyFromArray(string $key, array $data): mixed
    {
        if (array_key_exists($key, $data)) {
            return $data[$key];
        } elseif ($this->isOutputKeyAlias($key)) {
            // The aliased key may be missing from this particular output; treat that as null
            // instead of reading an undefined array key.
            return $data[$this->getOutputKeyAliasRealKey($key)] ?? null;
        }

        $data = OutputTypeHelper::recursiveChildObjectsToArray($data);

        $dot = new Dot($data);

        return $dot->get($key);
    }

    /**
     * Whether the key is one of the aliases defined in outputKeyAliases().
     */
    protected function isOutputKeyAlias(string $key): bool
    {
        return array_key_exists($key, $this->outputKeyAliases());
    }

    /**
     * The real output key behind an alias. Only call it after isOutputKeyAlias() returned true.
     */
    protected function getOutputKeyAliasRealKey(string $key): string
    {
        $mapping = $this->outputKeyAliases();

        return $mapping[$key];
    }

    /**
     * Whether the limit set via maxOutputs() is reached.
     */
    protected function maxOutputsExceeded(): bool
    {
        return $this->maxOutputs !== null && $this->currentOutputCount >= $this->maxOutputs;
    }

    /**
     * Count a yielded output. Counting only happens when a limit is set.
     */
    protected function trackYieldedOutput(): void
    {
        if ($this->maxOutputs !== null) {
            $this->currentOutputCount += 1;
        }
    }
}

================================================
FILE: src/Steps/Csv.php
================================================
$columnMapping */
    public function __construct(protected array $columnMapping = [], protected bool $skipFirstLine = false) {}

    /**
     * @param array $columnMapping
     */
    public static function parseString(array $columnMapping = [], bool $skipFirstLine = false): self
    {
        return new self($columnMapping, $skipFirstLine);
    }

    /**
     * @param array $columnMapping
     */
    public static function parseFile(array $columnMapping = [], bool $skipFirstLine = false): self
    {
        $instance = new self($columnMapping, $skipFirstLine);

        $instance->method = 'file';

        return $instance;
    }

    public function skipFirstLine(): static
{
        $this->skipFirstLine = true;

        return $this;
    }

    /**
     * Set the field separator character.
     *
     * @throws InvalidArgumentException when the separator is not exactly one character (byte) long
     */
    public function separator(string $separator): static
    {
        // An empty separator is as invalid as a multi character one; reject it here instead of
        // failing later inside the CSV parsing functions.
        if (strlen($separator) !== 1) {
            throw new InvalidArgumentException('CSV separator must be single character');
        }

        $this->separator = $separator;

        return $this;
    }

    /**
     * Set the field enclosure character.
     */
    public function enclosure(string $enclosure): static
    {
        $this->enclosure = $enclosure;

        return $this;
    }

    /**
     * Set the escape character.
     */
    public function escape(string $escape): static
    {
        $this->escape = $escape;

        return $this;
    }

    public function outputType(): StepOutputType
    {
        return StepOutputType::AssociativeArrayOrObject;
    }

    /**
     * Validate the input depending on the parse method: CSV content (string or HTTP response) for
     * 'string', a file path (string or stringable) for 'file'.
     *
     * @throws InvalidArgumentException when the parse method is neither 'string' nor 'file'
     */
    protected function validateAndSanitizeInput(mixed $input): string
    {
        return match ($this->method) {
            'string' => $this->validateAndSanitizeStringOrHttpResponse($input),
            'file' => $this->validateAndSanitizeStringOrStringable($input),
            default => throw new InvalidArgumentException('Parse CSV method must be string or file'),
        };
    }

    /**
     * Yield one mapped row per CSV line, read either from a file or from the input string.
     *
     * @param string $input
     * @throws Exception when the CSV file does not exist
     */
    protected function invoke(mixed $input): Generator
    {
        if ($this->method === 'file') {
            if (!file_exists($input)) {
                throw new Exception('CSV file not found');
            }

            yield from $this->readFile($input);
        } elseif ($this->method === 'string') {
            // Split on any line ending instead of PHP_EOL, so the result does not depend on the
            // platform and "\r\n" content does not leave a "\r" at the end of every last column.
            $lines = preg_split('/\r\n|\r|\n/', $input);

            yield from $this->mapLines($lines === false ? [$input] : $lines);
        }
    }

    /**
     * Read a CSV file row by row.
     *
     * If no column mapping is defined, the first row is used as the mapping. The first row is only
     * left out of the outputs when skipFirstLine is set.
     */
    protected function readFile(string $filePath): Generator
    {
        $handle = fopen($filePath, 'r');

        if ($handle === false) {
            return;
        }

        // Close the handle in a finally block, so it is also released when the consumer stops
        // iterating the generator early.
        try {
            $isFirstLine = true;

            while (($row = fgetcsv($handle, 0, $this->separator, $this->enclosure, $this->escape)) !== false) {
                if ($isFirstLine) {
                    if (empty($this->columnMapping)) {
                        $this->columnMapping = $row;
                    }

                    $isFirstLine = false;

                    if ($this->skipFirstLine) {
                        continue;
                    }
                }

                yield $this->mapRow($row);
            }
        } finally {
            fclose($handle);
        }
    }

    /**
     * Map the lines of a CSV string.
     *
     * When skipFirstLine is set, the first line is not yielded, and is used as column mapping if
     * none is defined. Empty lines are ignored.
     *
     * @param string[] $lines
     * @return Generator
     */
    protected function mapLines(array $lines): Generator
    {
        foreach ($lines as $key => $line) {
            if ($key === 0 && $this->skipFirstLine) {
                if (empty($this->columnMapping)) {
                    $this->columnMapping = str_getcsv($line, $this->separator, $this->enclosure, $this->escape);
                }

                continue;
            }

            // Compare against '' explicitly: empty() would also drop a line that is just "0".
            if ($line !== '') {
                yield $this->mapRow(str_getcsv($line, $this->separator, $this->enclosure, $this->escape));
            }
        }
    }

    /**
     * Map the columns of a row to the names from the column mapping, by position.
     * Columns without a (non-empty) name in the mapping are left out.
     *
     * @param mixed[] $row
     * @return mixed[]
     */
    protected function mapRow(array $row): array
    {
        $count = 0;

        $mapped = [];

        foreach ($row as $column) {
            if (!empty($this->columnMapping[$count])) {
                $mapped[$this->columnMapping[$count]] = $column;
            }

            $count++;
        }

        return $mapped;
    }
}

================================================
FILE: src/Steps/Dom/DomDocument.php
================================================
makeDocumentInstance($source)); // @phpstan-ignore-line
    }

    /**
     * @param string $source
     * @return Document|Crawler
     */
    abstract protected function makeDocumentInstance(string $source): object;
}

================================================
FILE: src/Steps/Dom/HtmlDocument.php
================================================
querySelectorAll(string $selector)
 * @method NodeList queryXPath(string $selector)
 */

class HtmlDocument extends DomDocument
{
    /**
     * Gets the href attribute of a  tag in the document
     *
     * In case there are multiple base elements in the document:
     * https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
     * "If multiple  elements are used, only the first href and first target are obeyed..."
*/
    public function getBaseHref(): ?string
    {
        return $this->querySelector('base')?->getAttribute('href');
    }

    /**
     * The HTML source of the whole document.
     */
    public function outerHtml(): string
    {
        return $this->outerSource();
    }

    /**
     * @param \Dom\Node|DOMNode|Crawler $node
     */
    protected function makeChildNodeInstance(object $node): Node
    {
        return new HtmlElement($node);
    }

    /**
     * Parse the source with the PHP 8.4 DOM API when available, otherwise with the symfony DomCrawler.
     *
     * @return \Dom\HTMLDocument|Crawler
     */
    protected function makeDocumentInstance(string $source): object
    {
        $fixedSource = $this->fixInvalidCharactersInSource($source);

        return PhpVersion::isAtLeast(8, 4)
            ? \Dom\HTMLDocument::createFromString($fixedSource, HTML_NO_DEFAULT_NS | LIBXML_NOERROR)
            : new Crawler($fixedSource);
    }

    /**
     * Repairs sources that declare UTF-8 as charset, but aren't valid UTF-8.
     *
     * Such a source is assumed to be ISO-8859-1 and converted to UTF-8 using iconv. Without the iconv
     * extension, or when the conversion fails, the source is returned unchanged.
     */
    private function fixInvalidCharactersInSource(string $source): string
    {
        if (!function_exists('iconv')) {
            return $source;
        }

        // A pattern with the u modifier only matches when the subject is valid UTF-8.
        $charset = preg_match('//u', $source) ? 'UTF-8' : 'ISO-8859-1';

        preg_match('/(charset *= *["\']?)([a-zA-Z\-0-9_:.]+)/i', $source, $matches);

        $declaredCharset = $matches && !empty($matches[2]) ? strtoupper($matches[2]) : null;

        if ($charset !== 'ISO-8859-1' || $declaredCharset !== 'UTF-8') {
            return $source;
        }

        $converted = iconv("ISO-8859-1", "UTF-8//TRANSLIT", $source);

        return $converted !== false ? $converted : $source;
    }
}

================================================
FILE: src/Steps/Dom/HtmlElement.php
================================================
querySelectorAll(string $selector)
 * @method NodeList queryXPath(string $selector)
 */

class HtmlElement extends Node
{
    /**
     * The HTML source of the element including the element itself.
     */
    public function outerHtml(): string
    {
        return $this->outerSource();
    }

    /**
     * The HTML source of the element's content.
     */
    public function innerHtml(): string
    {
        return $this->innerSource();
    }

    /**
     * Alias for innerHtml().
     */
    public function html(): string
    {
        return $this->innerHtml();
    }

    /**
     * @param \Dom\Node|DOMNode|Crawler $node
     */
    protected function makeChildNodeInstance(object $node): Node
    {
        return new HtmlElement($node);
    }
}

================================================
FILE: src/Steps/Dom/Node.php
================================================
node = $node;
    }

    /**
     * First node matching the CSS selector below this node, or null when nothing matches.
     */
    public function querySelector(string $selector): ?Node
    {
        if ($this->node instanceof Crawler) {
            $matches = $this->node->filter($selector);

            if ($matches->count() === 0) {
                return null;
            }

            return $this->makeChildNodeInstance($matches->first());
        }

        $match = $this->node->querySelector($selector);

        if ($match === null) {
            return null;
        }

        return $this->makeChildNodeInstance($match);
    }

    /**
     * All nodes matching the CSS selector below this node.
     */
    public function querySelectorAll(string $selector): NodeList
    {
        $matches = $this->node instanceof Crawler
            ? $this->node->filter($selector)
            : $this->node->querySelectorAll($selector);

        return $this->makeNodeListInstance($matches);
    }

    /**
     * All nodes matching the XPath query.
     *
     * XPath queries always run via the symfony DomCrawler. When this node wraps a native DOM node,
     * a Crawler is created from its outer source first.
     */
    public function queryXPath(string $query): NodeList
    {
        $crawler = $this->node instanceof Crawler ? $this->node : new Crawler($this->outerSource());

        return $this->makeNodeListInstance($crawler->filterXPath($query));
    }

    /**
     * Remove all nodes matching the CSS selector from their parents.
     */
    public function removeNodesMatchingSelector(string $selector): void
    {
        foreach ($this->querySelectorAll($selector) as $match) {
            $domNode = $match->node instanceof Crawler ? $match->node->getNode(0) : $match->node;

            if ($domNode) {
                $domNode->parentNode?->removeChild($domNode);
            }
        }
    }

    /**
     * Remove all nodes matching the XPath query from their parents.
     *
     * For native DOM nodes the query is run on the document the node belongs to.
     */
    public function removeNodesMatchingXPath(string $query): void
    {
        if ($this->node instanceof Crawler) {
            foreach ($this->node->filterXPath($query) as $match) {
                $match->parentNode?->removeChild($match);
            }

            return;
        }

        $document = $this->getParentDocumentOfNode($this->node);

        if (!$document) {
            return;
        }

        $xpath = new XPath($document);

        foreach ($xpath->query($query) as $match) {
            $match->parentNode?->removeChild($match);
        }
    }

    /**
     * The lower-cased node name.
     */
    public function nodeName(): string
    {
        $nodeName = $this->node instanceof Crawler ? $this->node->nodeName() : ($this->node->nodeName ?? '');

        return strtolower($nodeName);
    }

    /**
     * The text content of the node: whitespace runs and single line breaks/tabs/form feeds are replaced
     * with one space and the result is trimmed.
     */
    public function text(): string
    {
        if ($this->node instanceof Crawler) {
            $text = $this->node->text();
        } elseif (is_string($this->node->textContent)) {
            $text = $this->node->textContent;
        } else {
            $text = '';
        }

        $normalized = preg_replace("/(?:[ \n\r\t\x0C]{2,}+|[\n\r\t\x0C])/", ' ', $text) ?? $text;

        return trim($normalized, " \n\r\t\x0C");
    }

    /**
     * The value of an attribute of the node.
     */
    public function getAttribute(string $attributeName): ?string
    {
        return $this->node instanceof Crawler
            ? $this->node->attr($attributeName)
            : $this->node->getAttribute($attributeName);
    }

    /**
     * @param \Dom\Node|DOMNode|Crawler $node
     */
    abstract protected function makeChildNodeInstance(object $node): Node;

    /**
     * The source of the node including the node itself.
     *
     * Native DOM nodes are serialized through the document they belong to (for a document: its
     * document element). When that is not possible, the node's innerHTML is returned.
     */
    protected function outerSource(): string
    {
        if ($this->node instanceof Crawler) {
            return $this->node->count() > 0 ? $this->node->outerHtml() : '';
        }

        $document = $this->getParentDocumentOfNode($this->node);
        $nodeToSerialize = $this->node instanceof Document ? $this->node->documentElement : $this->node;

        if ($document instanceof \Dom\HTMLDocument) {
            return $document->saveHTML($nodeToSerialize);
        }

        if ($document instanceof \Dom\XMLDocument) {
            $source = $document->saveXML($nodeToSerialize);

            return $source !== false ? $source : '';
        }

        return $this->node->innerHTML;
    }

    /**
     * The source of the node's content.
     */
    protected function innerSource(): string
    {
        return $this->node instanceof Crawler ? $this->node->html() : $this->node->innerHTML;
    }

    /**
     * Wrap a list of nodes in a NodeList, that creates the wrapper objects via makeChildNodeInstance().
     *
     * @param \Dom\NodeList<\Dom\Node>|Crawler $nodeList
     */
    protected function makeNodeListInstance(object $nodeList): NodeList
    {
        return new NodeList(
            $nodeList,
            function (object $node): Node {
                /** @var DOMNode|\Dom\Node $node */
                return $this->makeChildNodeInstance($node);
            },
        );
    }

    /**
     * Walk up from the node until a Document is found. The node itself counts, too.
     *
     * @param \Dom\Node|Element $node
     * @return Document|null
     */
    private function getParentDocumentOfNode(object $node): ?object
    {
        for ($current = $node; $current; $current = $current->parentNode) {
            if ($current instanceof Document) {
                return $current;
            }
        }

        return null;
    }
}

================================================
FILE: src/Steps/Dom/NodeList.php
================================================
*/

class NodeList implements IteratorAggregate, Countable
{
    /**
     * @param \Dom\NodeList<\Dom\Node>|\Dom\NodeList|Crawler|array $nodeList
     */
    public function __construct(
        private readonly object|array $nodeList,
        private readonly ?Closure $makeNodeInstance = null,
    ) {}

    /**
     * The first node of the list, if any.
     *
     * @throws Exception
     */
    public function first(): ?Node
    {
        $iterator = $this->getIterator();

        $iterator->rewind();

        return $iterator->current();
    }

    /**
     * @throws Exception
     */
    public function last(): ?Node
    {
        $iterator = $this->getIterator();

        foreach ($iterator as $node) {
        }

        return $node ??
null;
    }

    /**
     * The node at the given position. Positions are 1-based, so nth(1) is the first node.
     *
     * @throws Exception
     */
    public function nth(int $index): ?Node
    {
        $position = 1;

        foreach ($this->getIterator() as $node) {
            if ($position === $index) {
                return $node;
            }

            $position++;
        }

        return null;
    }

    /**
     * Call the callback with every node and its key, and collect the return values.
     *
     * @return mixed[]
     * @throws Exception
     */
    public function each(Closure $callback): array
    {
        $collected = [];

        foreach ($this->getIterator() as $key => $node) {
            $collected[] = $callback($node, $key);
        }

        return $collected;
    }

    /**
     * @return int<0, max>
     */
    public function count(): int
    {
        return is_array($this->nodeList) ? count($this->nodeList) : max(0, $this->nodeList->count());
    }

    /**
     * Iterator over the nodes of the list.
     *
     * Arrays are iterated as they are. For other lists, the underlying iterator is wrapped, so every
     * element is converted using the makeNodeInstance callback.
     */
    public function getIterator(): Iterator
    {
        if (is_array($this->nodeList)) {
            return new ArrayIterator($this->nodeList);
        }

        $iterator = $this->nodeList->getIterator();

        /** @var Iterator $iterator */

        return new class ($iterator, $this->makeNodeInstance) implements Iterator {
            /**
             * @param Iterator $iterator
             */
            public function __construct(
                private readonly Iterator $iterator,
                private readonly ?Closure $makeNodeInstanceCallback = null,
            ) {}

            public function current(): ?Node
            {
                return $this->makeNodeInstance($this->iterator->current());
            }

            public function next(): void
            {
                $this->iterator->next();
            }

            public function key(): mixed
            {
                return $this->iterator->key();
            }

            public function valid(): bool
            {
                return $this->iterator->valid();
            }

            public function rewind(): void
            {
                $this->iterator->rewind();
            }

            /**
             * @param \Dom\Node|DOMNode|Crawler $node
             */
            private function makeNodeInstance(mixed $node): ?Node
            {
                if (!is_object($node)) { // @phpstan-ignore-line change when min. required PHP version is 8.4.
                    return null;
                }

                return $this->makeNodeInstanceCallback?->__invoke($node) ?? null;
            }
        };
    }
}

================================================
FILE: src/Steps/Dom/XmlDocument.php
================================================
querySelectorAll(string $selector)
 * @method NodeList queryXPath(string $selector)
 */

class XmlDocument extends DomDocument
{
    /**
     * The XML source of the whole document.
     */
    public function outerXml(): string
    {
        return $this->outerSource();
    }

    /**
     * @param \Dom\Node|DOMNode|Crawler $node
     */
    protected function makeChildNodeInstance(object $node): Node
    {
        return new XmlElement($node);
    }

    /**
     * Parse the source, retrying with invalid XML characters replaced when parsing fails.
     *
     * @return \Dom\XMLDocument|Crawler
     */
    protected function makeDocumentInstance(string $source): object
    {
        if (PhpVersion::isAtLeast(8, 4)) {
            try {
                return \Dom\XMLDocument::createFromString($source, LIBXML_NOERROR | LIBXML_NONET);
            } catch (Throwable) {
                $source = $this->replaceInvalidXmlCharacters($source);
            }

            try {
                return \Dom\XMLDocument::createFromString($source, LIBXML_NOERROR | LIBXML_NONET);
            } catch (Throwable) {
                // If it fails again, try it with symfony DOM Crawler as fallback.
            }
        }

        $crawler = new Crawler($source);

        if ($crawler->count() > 0) {
            return $crawler;
        }

        return new Crawler($this->replaceInvalidXmlCharacters($source));
    }

    /**
     * Replace characters that aren't valid within XML documents
     *
     * Sometimes XML parsing errors occur because of characters that aren't valid within XML documents.
     * Therefore, this method finds and replaces them with valid alternatives or HTML entities.
     * For best results in those cases, please install the voku/portable-ascii composer package.
     *
     * @param string $value
     * @return string
     */
    private function replaceInvalidXmlCharacters(string $value): string
    {
        return preg_replace_callback('/[^\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}]/u', function ($match) {
            $replacement = class_exists('voku\helper\ASCII') ? ASCII::to_transliterate($match[0]) : '?';

            if ($replacement === '?') {
                return '&#' . mb_ord($match[0]) . ';';
            }

            return $replacement;
        }, $value) ??
$value;
    }
}

================================================
FILE: src/Steps/Dom/XmlElement.php
================================================
querySelectorAll(string $selector)
 * @method NodeList queryXPath(string $selector)
 */

class XmlElement extends Node
{
    /**
     * The XML source of the element including the element itself.
     */
    public function outerXml(): string
    {
        return $this->outerSource();
    }

    /**
     * The XML source of the element's content.
     */
    public function innerXml(): string
    {
        return $this->innerSource();
    }

    /**
     * @param \Dom\Node|DOMNode|Crawler $node
     */
    protected function makeChildNodeInstance(object $node): Node
    {
        return new XmlElement($node);
    }
}

================================================
FILE: src/Steps/Dom.php
================================================
*/
    protected array $mapping = [];

    protected string|DomQuery|null $singleSelector = null;

    protected ?string $baseUrl = null;

    /**
     * @param string|DomQuery|array $selectorOrMapping
     */
    final public function __construct(string|DomQuery|array $selectorOrMapping = [])
    {
        $this->addLogger(new PreStepInvocationLogger());

        $this->extract($selectorOrMapping);
    }

    /**
     * Create a step that extracts data from the root of the document.
     */
    public static function root(): static
    {
        $step = new static();

        $step->root = true;

        return $step;
    }

    /**
     * Create a step that extracts data from every element matching the query.
     * A string is turned into the step's default DomQuery type. An empty query triggers a warning.
     */
    public static function each(string|DomQuery $domQuery): static
    {
        $step = new static();

        $step->each = $domQuery instanceof DomQuery ? $domQuery : $step->makeDefaultDomQueryInstance($domQuery);

        if (trim($step->each->query) !== '') {
            return $step;
        }

        $step->logger?->warning(
            'The selector you provided for the ‘each’ option is empty. This option is intended to allow ' .
            'extracting multiple output objects from a single page, so an empty selector most likely doesn’t ' .
            'make sense, as it will definitely result in only one output object.',
        );

        return $step;
    }

    /**
     * Create a step that extracts data from the first element matching the query.
     * A string is turned into the step's default DomQuery type. An empty query triggers a warning.
     */
    public static function first(string|DomQuery $domQuery): static
    {
        $step = new static();

        $step->first = $domQuery instanceof DomQuery ? $domQuery : $step->makeDefaultDomQueryInstance($domQuery);

        if (trim($step->first->query) !== '') {
            return $step;
        }

        $step->logger?->warning(
            'The selector you provided for the ‘first’ option is empty. This option is meant to restrict your ' .
            'extraction to a specific parent element, so an empty selector most likely doesn’t make sense. ' .
            'Either define the desired selector or use the root() method instead.',
        );

        return $step;
    }

    /**
     * Create a step that extracts data from the last element matching the query.
     * A string is turned into the step's default DomQuery type. An empty query triggers a warning.
     */
    public static function last(string|DomQuery $domQuery): static
    {
        $step = new static();

        $step->last = $domQuery instanceof DomQuery ? $domQuery : $step->makeDefaultDomQueryInstance($domQuery);

        if (trim($step->last->query) !== '') {
            return $step;
        }

        $step->logger?->warning(
            'The selector you provided for the ‘last’ option is empty. This option is meant to restrict your ' .
            'extraction to a specific parent element, so an empty selector most likely doesn’t make sense. ' .
            'Either define the desired selector or use the root() method instead.',
        );

        return $step;
    }

    /**
     * @throws InvalidDomQueryException
     */
    public static function cssSelector(string $selector): CssSelector
    {
        return new CssSelector($selector);
    }

    /**
     * @throws InvalidDomQueryException
     */
    public static function xPath(string $query): XPathQuery
    {
        return new XPathQuery($query);
    }

    abstract protected function makeDefaultDomQueryInstance(string $query): DomQuery;

    /**
     * Define what to extract: an array is used as property mapping, anything else as single selector.
     *
     * @param string|DomQuery|array $selectorOrMapping
     */
    public function extract(string|DomQuery|array $selectorOrMapping): static
    {
        if (is_array($selectorOrMapping)) {
            $this->mapping = $selectorOrMapping;

            return $this;
        }

        $this->singleSelector = $selectorOrMapping;

        return $this;
    }

    /**
     * Scalar outputs when only a single selector is used, associative arrays when a mapping is used.
     */
    public function outputType(): StepOutputType
    {
        if (empty($this->mapping) && $this->singleSelector) {
            return StepOutputType::Scalar;
        }

        return StepOutputType::AssociativeArrayOrObject;
    }

    /**
     * Resolve the base node(s) and yield the extracted data: the single selector's values, or one
     * mapped array per base element.
     *
     * @param HtmlDocument|Node $input
     * @throws Exception
     */
    protected function invoke(mixed $input): Generator
    {
        $base = $this->getBase($input);

        if (!$base || ($base instanceof NodeList && $base->count() === 0)) {
            return;
        }

        if (empty($this->mapping) && $this->singleSelector) {
            yield from $this->singleSelector($base);

            return;
        }

        if ($this->each) {
            if ($base instanceof NodeList) {
                foreach ($base as $element) {
                    yield $this->mapProperties($element);
                }
            }

            return;
        }

        if ($base instanceof Node) {
            yield $this->mapProperties($base);
        }
    }

    /**
     * Remember the effective URI of responses as base URL and turn the input into an HtmlDocument.
     *
     * @throws InvalidArgumentException|MissingZlibExtensionException
     */
    protected function validateAndSanitizeInput(mixed $input): HtmlDocument|XmlDocument
    {
        if ($input instanceof RespondedRequest) {
            $this->baseUrl = $input->effectiveUri();
        }

        return new HtmlDocument($this->validateAndSanitizeStringOrHttpResponse($input));
    }

    /**
     * @throws InvalidHtmlException
     * @throws Exception
     */
    protected function singleSelector(Node|NodeList $nodeOrNodeList): Generator
    {
        if ($this->singleSelector === null) {
            return;
        }

        $domQuery = is_string($this->singleSelector) ?
$this->makeDefaultDomQueryInstance($this->singleSelector) :
            $this->singleSelector;

        if ($this->baseUrl !== null) {
            $domQuery->setBaseUrl($this->baseUrl);
        }

        if ($nodeOrNodeList instanceof NodeList) {
            $outputs = [];

            foreach ($nodeOrNodeList as $node) {
                $outputs[] = $domQuery->apply($node);
            }
        } else {
            $outputs = $domQuery->apply($nodeOrNodeList);
        }

        if (is_array($outputs)) {
            foreach ($outputs as $output) {
                yield $output;
            }
        } elseif ($outputs !== null) {
            yield $outputs;
        }
    }

    /**
     * Apply every entry of the mapping to the node and return the results keyed like the mapping.
     *
     * Entries can be selector strings, DomQuery instances, or child Dom steps. The step's base URL is
     * passed on to DomQuery instances and child steps.
     *
     * @return mixed[]
     * @throws Exception
     */
    protected function mapProperties(Node $node): array
    {
        $mappedProperties = [];

        foreach ($this->mapping as $key => $domQuery) {
            if ($domQuery instanceof Dom) {
                $domQuery->baseUrl = $this->baseUrl;

                $mappedProperties[$key] = $this->getDataFromChildDomStep($domQuery, $node);
            } else {
                if (is_string($domQuery)) {
                    $domQuery = $this->makeDefaultDomQueryInstance($domQuery);
                }

                if ($this->baseUrl !== null) {
                    $domQuery->setBaseUrl($this->baseUrl);
                }

                $mappedProperties[$key] = $domQuery->apply($node);
            }
        }

        return $mappedProperties;
    }

    /**
     * Get the node(s) to extract data from, depending on how the step was created
     * (root(), each(), first() or last()).
     *
     * @throws Exception when none of the base options is set
     */
    protected function getBase(DomDocument|Node $document): Node|NodeList|null
    {
        if ($this->root) {
            return $document;
        } elseif ($this->each) {
            return $this->getBaseFromDomNode($document, $this->each, each: true);
        } elseif ($this->first) {
            return $this->getBaseFromDomNode($document, $this->first, first: true);
        } elseif ($this->last) {
            return $this->getBaseFromDomNode($document, $this->last, last: true);
        }

        throw new Exception('Invalid state: no base selector');
    }

    /**
     * Run the base query on the document/node.
     *
     * An empty query selects the document/node itself (wrapped in a NodeList for each). CSS selectors
     * are run via querySelector()/querySelectorAll(), all other queries via queryXPath().
     *
     * @throws Exception
     */
    private function getBaseFromDomNode(
        DomDocument|Node $document,
        DomQuery $query,
        bool $each = false,
        bool $first = false,
        bool $last = false,
    ): Node|NodeList|null {
        if (trim($query->query) === '') {
            return $each ? new NodeList([$document]) : $document;
        }

        // Decide by the query that was actually passed in, not by the step's properties, so all
        // three branches treat the $query argument the same way.
        $isCssSelector = $query instanceof CssSelector;

        if ($each) {
            return $isCssSelector ?
                $document->querySelectorAll($query->query) :
                $document->queryXPath($query->query);
        } elseif ($first) {
            return $isCssSelector ?
                $document->querySelector($query->query) :
                $document->queryXPath($query->query)->first();
        } elseif ($last) {
            return $isCssSelector ?
                $document->querySelectorAll($query->query)->last() :
                $document->queryXPath($query->query)->last();
        }

        return $document;
    }

    /**
     * Invoke a child Dom step (used as a mapping entry) with the node and collect its outputs.
     *
     * @return mixed[]
     * @throws Exception
     */
    protected function getDataFromChildDomStep(Dom $step, Node $node): array
    {
        // Collect without preserving the generator's keys: keys yielded by generators (especially
        // via `yield from`) may repeat, and preserved duplicate keys would overwrite earlier outputs.
        $childValue = iterator_to_array($step->invoke($node), false);

        // When the child step was not used with each() as base and the result is an array with one
        // element (index/key "0") being an array, use that child array.
        if (!$step->each && count($childValue) === 1 && isset($childValue[0]) && is_array($childValue[0])) {
            return $childValue[0];
        }

        return $childValue;
    }
}

================================================
FILE: src/Steps/Exceptions/PreRunValidationException.php
================================================
useKey = $key;

        return $this;
    }

    /**
     * Step::orWhere() uses this method to link further Filters with OR to this filter.
     * The Step then takes care of checking if one of the ORs evaluates to true.
     */
    public function addOr(FilterInterface $filter): void
    {
        if ($this->or instanceof FilterInterface) {
            $or = $this->or;

            while ($or->getOr()) {
                $or = $or->getOr();
            }

            $or->addOr($filter);
        } else {
            $this->or = $filter;
        }
    }

    /**
     * Get the Filter linked to this Filter as OR.
     */
    public function getOr(): ?FilterInterface
    {
        return $this->or instanceof FilterInterface ?
/**
 * Get the value to filter by from the given output value.
 *
 * Without a key set via useKey(), the value itself is returned. Otherwise, the value must be
 * an array or an object and the element/property with that key is returned. For objects not
 * having the property, but providing a __serialize() method, the serialized array is used
 * when it contains the key.
 *
 * @throws Exception when the value is neither array nor object, or the key does not exist in it.
 */
protected function getKey(mixed $value): mixed
{
    $key = $this->useKey;

    if ($key === null) {
        return $value;
    }

    if (!(is_array($value) || is_object($value))) {
        throw new InvalidArgumentException('Can only filter by key with array or object output.');
    }

    if (is_object($value) && !property_exists($value, $key) && method_exists($value, '__serialize')) {
        $serialized = $value->__serialize();

        if (array_key_exists($key, $serialized)) {
            $value = $serialized;
        }
    }

    if (is_array($value)) {
        if (!array_key_exists($key, $value)) {
            throw new Exception('Key to filter by does not exist in output.');
        }

        return $value[$key];
    }

    if (!property_exists($value, $key)) {
        throw new Exception('Key to filter by does not exist in output.');
    }

    return $value->{$key};
}
$compareTo), self::LessThanOrEqual => ($value <= $compareTo), }; } } ================================================ FILE: src/Steps/Filters/Enums/StringFilterRule.php ================================================ str_contains($haystack, $needle), self::StartsWith => str_starts_with($haystack, $needle), self::EndsWith => str_ends_with($haystack, $needle), }; } } ================================================ FILE: src/Steps/Filters/Enums/StringLengthFilterRule.php ================================================ ($actualStringLength === $compareTo), self::NotEqual => ($actualStringLength !== $compareTo), self::GreaterThan => ($actualStringLength > $compareTo), self::GreaterThanOrEqual => ($actualStringLength >= $compareTo), self::LessThan => ($actualStringLength < $compareTo), self::LessThanOrEqual => ($actualStringLength <= $compareTo), }; } } ================================================ FILE: src/Steps/Filters/Enums/UrlFilterRule.php ================================================ Url::parse($url)->scheme() === $needle, self::Host => Url::parse($url)->host() === $needle, self::Domain => Url::parse($url)->domain() === $needle, self::Path => Url::parse($url)->path() === $needle, self::PathStartsWith => str_starts_with(Url::parse($url)->path() ?? '', $needle), self::PathMatches => preg_match($this->prepareRegex($needle), Url::parse($url)->path() ?? '') === 1, }; } catch (InvalidUrlException|Exception $exception) { return false; } } protected function prepareRegex(string $regex): string { return '~' . $regex . 
/**
 * Link a further filter with OR to the filter that was added last (via where()).
 *
 * Can be called either with a filter only, or with a key and a filter. Like in where(), a key
 * that is an output key alias is translated to the real output key before it is set on the
 * filter, so aliases behave the same in where() and orWhere().
 *
 * @throws Exception when there is no previous filter to link to.
 * @throws InvalidArgumentException when a key, but no filter is provided.
 */
public function orWhere(string|FilterInterface $keyOrFilter, ?FilterInterface $filter = null): static
{
    if (empty($this->filters)) {
        throw new Exception('No where before orWhere');
    } elseif (is_string($keyOrFilter) && $filter === null) {
        throw new InvalidArgumentException('You have to provide a Filter (instance of FilterInterface)');
    } elseif (is_string($keyOrFilter)) {
        // Only resolve aliases when the class using this code actually knows output key aliases.
        if (
            method_exists($this, 'isOutputKeyAlias') &&
            method_exists($this, 'getOutputKeyAliasRealKey') &&
            $this->isOutputKeyAlias($keyOrFilter)
        ) {
            $keyOrFilter = $this->getOutputKeyAliasRealKey($keyOrFilter);
        }

        $filter->useKey($keyOrFilter);
    } else {
        $filter = $keyOrFilter;
    }

    $lastFilter = end($this->filters);

    $lastFilter->addOr($filter);

    return $this;
}
/**
 * Run every child step of the group and yield the combined outputs.
 *
 * Nothing is yielded when the input is not unique (and unique inputs are required), or when
 * there is no input to invoke the child steps with.
 *
 * @return Generator<Output>
 * @throws Exception
 */
public function invokeStep(Input $input): Generator
{
    if ($this->uniqueInput && !$this->inputOrOutputIsUnique($input)) {
        return;
    }

    $this->storeOriginalInput($input);

    // When input is array and useInputKey() was used, invoke the steps only with that input array element,
    // but keep the original input, because we want to use it e.g. for the keepInputData() functionality.
    $stepInput = $this->getInputKeyToUse($input);

    if (!$stepInput) {
        return;
    }

    $combinedOutput = [];

    $combinedKeptData = [];

    foreach ($this->steps as $step) {
        foreach ($step->invokeStep($stepInput) as $nthOutput => $output) {
            if (method_exists($step, 'callUpdateInputUsingOutput')) {
                $stepInput = $step->callUpdateInputUsingOutput($stepInput, $output);
            }

            if ($this->includeOutput($step)) {
                $combinedOutput = $this->addToCombinedOutputData($output->get(), $combinedOutput, $nthOutput);
            }

            if ($output->keep === $stepInput->keep) {
                continue;
            }

            // Also transfer data, kept in group child steps, to the kept data of the final group output.
            $combinedKeptData = $this->addToCombinedOutputData(
                $this->getNewlyKeptData($output, $stepInput),
                $combinedKeptData,
                $nthOutput,
            );
        }
    }

    yield from $this->prepareCombinedOutputs($combinedOutput, $combinedKeptData, $input);
}
/**
 * Turn the combined output data into Output objects and yield them.
 *
 * For each combined output: normalizes it, applies the refiners and yields it only when it
 * passes all filters (and is unique, if unique outputs are required). Kept data with the same
 * key is attached to the output. Stops as soon as the max outputs limit is exceeded.
 *
 * @param mixed[] $combinedOutputs
 * @param mixed[] $combinedKeptData
 * @param Input $input
 * @return Generator
 * @throws Exception
 */
private function prepareCombinedOutputs(array $combinedOutputs, array $combinedKeptData, Input $input): Generator
{
    foreach ($combinedOutputs as $key => $combinedOutput) {
        if ($this->maxOutputsExceeded()) {
            break;
        }

        $outputData = $this->applyRefiners(
            $this->normalizeCombinedOutputs($combinedOutput),
            $input->get(),
        );

        if (!$this->passesAllFilters($outputData)) {
            continue;
        }

        $output = $this->makeOutput($outputData, $input);

        if (array_key_exists($key, $combinedKeptData)) {
            $keptData = $this->normalizeCombinedOutputs($combinedKeptData[$key]);

            $output->keep($keptData);
        }

        $isDuplicate = $this->uniqueOutput !== false && !$this->inputOrOutputIsUnique($output);

        if ($isDuplicate) {
            continue;
        }

        yield $output;

        $this->trackYieldedOutput();
    }
}
/**
 * Restrict the query to only the nth match (counting starts at 1).
 *
 * @throws InvalidArgumentException when $n is lower than 1.
 */
public function nth(int $n): self
{
    if ($n >= 1) {
        $this->onlyNthMatch = $n;

        return $this;
    }

    throw new InvalidArgumentException('Argument $n must be greater than 0');
}
/**
 * Set the base URL of this query.
 *
 * When a base URL is already set, the given one is resolved against it and the resulting
 * URL becomes the new base URL. Otherwise, the given URL is used as is.
 *
 * Automatically called when used in a Dom step.
 *
 * @throws Exception
 */
public function setBaseUrl(string $baseUrl): static
{
    $this->baseUrl = empty($this->baseUrl) ?
        $baseUrl :
        Url::parse($this->baseUrl)->resolve($baseUrl)->__toString();

    return $this;
}
/**
 * Keep only the nodes at even and/or odd positions (positions are counted starting from 1).
 *
 * @param NodeList $domCrawler
 * @return NodeList
 */
protected function filterEvenOrOdd(NodeList $domCrawler): NodeList
{
    $selected = [];

    $position = 0;

    foreach ($domCrawler as $node) {
        $position++;

        $isEvenPosition = $position % 2 === 0;

        $keep = $isEvenPosition ? $this->onlyEvenMatches : $this->onlyOddMatches;

        if ($keep) {
            $selected[] = $node;
        }
    }

    return new NodeList($selected);
}
/**
 * Check if the href of a link element uses one of the non-HTTP schemes mailto, tel or javascript.
 *
 * URI schemes are case-insensitive and href attribute values may be surrounded by whitespace,
 * so the href is compared lower-cased and without leading whitespace. This way, values like
 * "MAILTO:foo@example.com" or " tel:+123" are also recognized.
 */
public static function isSpecialNonHttpLink(HtmlElement $linkElement): bool
{
    $href = strtolower(ltrim($linkElement->getAttribute('href') ?? ''));

    foreach (['mailto:', 'tel:', 'javascript:'] as $schemePrefix) {
        if (str_starts_with($href, $schemePrefix)) {
            return true;
        }
    }

    return false;
}
/**
 * Only accept links to the given host(s). Can be called multiple times to add further hosts.
 *
 * @param string|string[] $hosts
 * @throws InvalidArgumentException when an array containing non-string values is provided.
 */
public function onHost(string|array $hosts): static
{
    if (is_string($hosts)) {
        $hosts = [$hosts];
    } elseif (!$this->isArrayWithOnlyStrings($hosts)) {
        throw new InvalidArgumentException('You can only set hosts from string values');
    }

    if ($this->onHost) {
        $this->onHost = array_merge($this->onHost, $hosts);
    } else {
        $this->onHost = $hosts;
    }

    return $this;
}
/**
 * Check the link against all configured domain and host restrictions.
 *
 * A restriction that was not configured (null) is ignored. The checks are evaluated in the
 * order: same domain, same host, domain list, host list; the first failing one rejects the link.
 *
 * @throws Exception
 */
protected function matchesAdditionalCriteria(Url $link): bool
{
    if ($this->onSameDomain !== null && !$this->isOnSameDomain($link)) {
        return false;
    }

    if ($this->onSameHost !== null && !$this->isOnSameHost($link)) {
        return false;
    }

    if ($this->onDomain !== null && !$this->isOnDomain($link)) {
        return false;
    }

    return $this->onHost === null || $this->isOnHost($link);
}
/**
 * Collect the document title and the content of all meta elements into one array and yield it.
 *
 * The key of a meta element is its name attribute, or its property attribute when there is
 * no (non-empty) name. When only certain keys are requested, all other keys are left out.
 *
 * @param HtmlDocument $input
 */
protected function invoke(mixed $input): Generator
{
    $data = $this->addToData([], 'title', $this->getTitle($input));

    foreach ($input->querySelectorAll('meta') as $metaElement) {
        $metaName = $metaElement->getAttribute('name');

        if (empty($metaName)) {
            $metaName = $metaElement->getAttribute('property');
        }

        if (empty($metaName)) {
            continue;
        }

        $isWantedKey = empty($this->onlyKeys) || in_array($metaName, $this->onlyKeys, true);

        if (!$isWantedKey) {
            continue;
        }

        $content = $metaElement->getAttribute('content') ?? '';

        $data = $this->addToData($data, $metaName, $content);
    }

    yield $data;
}
/**
 * Validate an XPath query by evaluating it on an empty document.
 *
 * The temporary error handler is restored in a finally block, so it can never stay
 * registered, even if evaluating the query throws.
 *
 * @throws InvalidDomQueryException when the query is not a valid XPath expression.
 */
private function validateQuery(string $query): void
{
    // Temporarily set a new error handler, so checking an invalid XPath query does not generate a PHP warning.
    // All other errors are passed on to the standard PHP error handling by returning false.
    set_error_handler(static function ($errno, $errstr) {
        return $errno === E_WARNING && $errstr === 'DOMXPath::evaluate(): Invalid expression';
    });

    try {
        $evaluation = (new DOMXPath(new DOMDocument()))->evaluate($query);
    } finally {
        restore_error_handler();
    }

    if ($evaluation === false) {
        throw InvalidDomQueryException::make('Invalid XPath query', $query);
    }
}
/**
 * Build the output array from the property mapping.
 *
 * Mapping entries with an integer key use the dot notation path itself as output key.
 * An empty path, or the path "*" when the data has no element with the key "*", maps to the
 * whole data array.
 *
 * @param Dot $dot
 * @return mixed[]
 */
protected function mapProperties(Dot $dot): array
{
    $mapped = [];

    if ($this->propertyMapping === null || $this->propertyMapping === []) {
        return $mapped;
    }

    foreach ($this->propertyMapping as $propertyKey => $dotNotation) {
        $outputKey = is_int($propertyKey) ? $dotNotation : $propertyKey;

        $useAllData = $dotNotation === '' || ($dotNotation === '*' && $dot->get('*') === null);

        $mapped[$outputKey] = $useAllData ? $dot->all() : $dot->get($dotNotation);
    }

    return $mapped;
}
/**
 * Resets the per-input pagination state.
 *
 * A paginate step can be called with multiple inputs, e.g.:
 *
 * ['https://www.example.com/listing1', 'https://www.example.com/listing2', ...]
 *
 * For each of those base URLs, paginating has to start from scratch, so the finished flag,
 * the loaded pages counter and the latest request are cleared here.
 *
 * The registry of already loaded requests ($this->loaded) is not touched: if the exact same
 * pages are discovered again whilst paginating another input, they should not be loaded
 * again and again.
 */
public function resetFinished(): void
{
    $this->latestRequest = null;

    $this->loadedCount = 0;

    $this->hasFinished = false;
}
/**
 * Decides whether paginating should stop after the given request was loaded.
 *
 * Stops when the max pages limit is reached, or as soon as one of the registered stop rules
 * (StopRule instances or Closures, which are called bound to this paginator) returns a truthy value.
 */
protected function shouldStop(RequestInterface $request, ?RespondedRequest $respondedRequest): bool
{
    if ($this->maxPagesReached()) {
        return true;
    }

    foreach ($this->stopRules as $rule) {
        $stop = match (true) {
            $rule instanceof StopRule => $rule->shouldStop($request, $respondedRequest),
            $rule instanceof Closure => $rule->call($this, $request, $respondedRequest),
            default => false,
        };

        if ($stop) {
            return true;
        }
    }

    return false;
}
/**
 * Creates a browser action that moves the mouse to the given coordinates.
 *
 * When $steps is provided, it is passed to the mouse move call as the 'steps' option.
 */
public static function moveMouseToPosition(int $x, int $y, ?int $steps = null): Closure
{
    return function (Page $page) use ($x, $y, $steps) {
        $mouse = $page->mouse();

        if ($steps === null) {
            $mouse->move($x, $y);

            return;
        }

        $mouse->move($x, $y, ['steps' => $steps]);
    };
}
/**
 * Creates a browser action that takes a screenshot of the page and saves it to the
 * path provided by the config.
 *
 * The action returns a Screenshot instance on success. When anything fails, an error
 * (plus the exception message on debug level) is logged and null is returned.
 */
public static function screenshot(ScreenshotConfig $config): Closure
{
    return function (Page $page, ?LoggerInterface $logger) use ($config) {
        $targetPath = $config->getFullPath($page);

        try {
            $chromeScreenshotConfig = $config->toChromePhpScreenshotConfig($page);

            $page->screenshot($chromeScreenshotConfig)->saveToFile($targetPath);

            return new Screenshot($targetPath);
        } catch (Throwable $error) {
            $logger?->error('Failed to take screenshot.');

            $logger?->debug($error->getMessage());
        }

        return null;
    };
}
/**
 * Returns the base URL of the document.
 *
 * It is initialized to the effective URI of the response and, when the document defines
 * a base href that can be resolved, replaced by that href resolved against the document URL.
 */
public function baseUrl(): Url
{
    return $this->baseUrl;
}
/**
 * Wraps up paginating one base input.
 *
 * Logs why paginating ended (only when a logger is available) and then resets the
 * paginator's per-input state, so the next input starts paginating from scratch.
 */
private function finish(): void
{
    // Log first: logWhenFinished() relies on state (e.g. the loaded pages count)
    // that resetFinished() clears.
    if ($this->logger) {
        $this->paginator->logWhenFinished($this->logger);
    }

    // The reset must not depend on a logger being present. Otherwise, without a logger, the
    // paginator would still count as finished when the step is invoked with the next input.
    $this->paginator->resetFinished();
}
/**
 * Reads the current value of the configured query param, interpreting its name as a
 * dot notation path into the query array. Returns the fallback value when nothing is
 * found at that path.
 *
 * @throws Exception
 */
protected function getCurrentValueUsingDotNotation(Query $query, mixed $fallbackValue = null): mixed
{
    $queryData = $query->toArray();

    return (new Dot($queryData))->get($this->queryParamName, $fallbackValue);
}
/**
 * Builds the request for the next page, based on the latest loaded request.
 *
 * The query (taken either from the URL or from the request body) is passed through all
 * registered manipulators and the result is written back to the URL or body respectively.
 *
 * @return RequestInterface|null null when no request has been loaded yet
 * @throws Exception
 */
public function getNextRequest(): ?RequestInterface
{
    if (!$this->latestRequest) {
        return null;
    }

    $url = null;

    if ($this->paramsInUrl) {
        $url = Url::parse($this->latestRequest->getUri());

        $query = $url->queryString();
    } else {
        $query = Query::fromString(Http::getBodyString($this->latestRequest));
    }

    foreach ($this->manipulators as $manipulator) {
        $query = $manipulator->execute($query);
    }

    if ($url !== null) {
        // Manipulators may return a new Query instance (the dot notation variants do), which is
        // not connected to $url anymore. So always write the resulting query back explicitly,
        // otherwise the next request would be sent to the unchanged URL.
        $url->query($query->toString());

        return $this->latestRequest->withUri($url->toPsr7());
    }

    return $this->latestRequest->withBody(Utils::streamFor($query->toString()));
}
/**
 * Registers a loaded request (and its response, if any) and collects further pagination
 * links from the response body.
 *
 * The request URL and all redirect URLs of the response are remembered as loaded, so they
 * are not added to the found URLs again.
 *
 * @throws Exception
 */
public function processLoaded(
    RequestInterface $request,
    ?RespondedRequest $respondedRequest,
): void {
    $this->registerLoadedRequest($respondedRequest ?? $request);

    if ($this->latestRequest) {
        $this->latestRequestKey = RequestKey::from($this->latestRequest);
    }

    $requestUrl = $request->getUri()->__toString();

    $this->loadedUrls[$requestUrl] = true;

    if (!$respondedRequest) {
        return;
    }

    foreach ($respondedRequest->redirects() as $redirectedTo) {
        $this->loadedUrls[$redirectedTo] = true;
    }

    $this->getPaginationLinksFromResponse($respondedRequest);
}
/**
 * Adds the absolute URL of a link element to the found URLs, if the element is a
 * relevant link. Other elements are ignored.
 *
 * @throws Exception
 */
protected function addFoundUrlFromLinkElement(
    Dom\HtmlElement $linkElement,
    Dom\HtmlDocument $document,
    string $documentUrl,
): void {
    if (!$this->isRelevantLinkElement($linkElement)) {
        return;
    }

    $this->addFoundUrl(
        $this->getAbsoluteUrlFromLinkElement($linkElement, $document, $documentUrl),
    );
}
/**
 * Checks if an element is a link that is worth following for pagination.
 *
 * Relevant are only `a` elements that have a non-empty href attribute, which isn't
 * just a fragment reference (starting with '#').
 */
protected function isRelevantLinkElement(Dom\HtmlElement $element): bool
{
    if ($element->nodeName() !== 'a') {
        return false;
    }

    $href = $element->getAttribute('href');

    // Deliberately no empty() check here: empty('0') is true, but href="0" is a valid
    // relative reference and must not be discarded.
    return $href !== null && $href !== '' && !str_starts_with($href, '#');
}
/**
 * Tells if a node has no content besides whitespace, based on its inner HTML
 * (for HTML elements) or inner XML (for XML elements).
 */
private function nodeIsEmpty(HtmlElement|XmlElement $node): bool
{
    if ($node instanceof HtmlElement) {
        $innerContent = $node->innerHtml();
    } else {
        $innerContent = $node->innerXml();
    }

    return trim($innerContent) === '';
}
/**
 * Loads the input URI(s) and yields every response that the step is configured to yield.
 * After all URIs are processed, the request params that came from the input are reset.
 *
 * @param UriInterface|UriInterface[] $input
 * @return Generator
 * @throws Exception
 */
protected function invoke(mixed $input): Generator
{
    $uris = is_array($input) ? $input : [$input];

    foreach ($uris as $uri) {
        $response = $this->getResponseFromInputUri($uri);

        if (!$response) {
            continue;
        }

        yield $response;
    }

    $this->resetInputRequestParams();
}
/**
 * Choose a key from array input to use its value as a request header
 *
 * If input is an array with string keys, you can choose a key from that array and map it to an HTTP request
 * header. When no header name is provided, the input key itself is used as header name.
 */
public function useInputKeyAsHeader(string $key, ?string $asHeader = null): static
{
    $this->useAsHeader ??= [];

    $this->useAsHeader[$key] = $asHeader ?? $key;

    return $this;
}
/**
 * Registers a callback that shall be executed after the (headless) browser navigated to a page.
 *
 * Hooks are only accepted for GET steps. For any other request method a warning is logged
 * and the callback is ignored.
 */
public function postBrowserNavigateHook(Closure $callback): static
{
    if ($this->method === 'GET') {
        $this->postBrowserNavigateHooks[] = $callback;

        return $this;
    }

    $this->logger?->warning(
        'A ' . $this->method . ' request cannot be executed using the (headless) browser, so post browser ' .
        'navigate hooks can\'t be defined for this step either.',
    );

    return $this;
}
/**
 * Creates the request for an input URI.
 *
 * Uses the body from the input (if any) in favor of the statically configured one, merges the
 * configured headers with headers from the input and resolves variables in body and headers.
 */
protected function getRequestFromInputUri(UriInterface $uri): RequestInterface
{
    [$body, $headers] = $this->resolveVarsInRequestProperties(
        $this->inputBody ?? $this->body,
        $this->mergeHeaders(),
    );

    return new Request($this->method, $uri, $headers, $body, $this->httpVersion);
}
/**
 * Reverts a temporary switch between HTTP client and (headless) browser on the loader.
 *
 * Switching back to the HTTP client is best effort, errors are ignored there.
 *
 * @param array<string, bool> $resetConfig
 */
private function resetTempLoaderCustomizations(array $resetConfig): void
{
    $loader = $this->getLoader();

    if ($resetConfig['resetToHttpClient'] === true) {
        try {
            $loader->useHttpClient();
        } catch (Throwable) {
        }

        return;
    }

    if ($resetConfig['resetToBrowser']) {
        $loader->useHeadlessBrowser();
    }
}
/**
 * Takes the request body from the input array, when a key to use as body is configured.
 *
 * Logs a warning when the input is no array or doesn't contain the configured key.
 */
protected function getBodyFromArrayInput(mixed $input): void
{
    if (!$this->useAsBody) {
        return;
    }

    if (!is_array($input)) {
        $this->logger?->warning('Input is not array, therefore can\'t get body from input by key.');

        return;
    }

    if (array_key_exists($this->useAsBody, $input)) {
        $this->inputBody = $input[$this->useAsBody];

        return;
    }

    $this->logger?->warning(
        'Input key ' . $this->useAsBody . ' that should be used as request body isn\'t present in input.',
    );
}
/**
 * Returns the normalized, statically configured headers, merged with the (also normalized)
 * headers that came from the input, if there are any.
 *
 * @return array
 */
protected function mergeHeaders(): array
{
    $configuredHeaders = HttpHeaders::normalize($this->headers);

    if (!is_array($this->inputHeaders)) {
        return $configuredHeaders;
    }

    return HttpHeaders::merge($configuredHeaders, HttpHeaders::normalize($this->inputHeaders));
}
TemplateString::resolve($body, $fullInputData) : $body,
            $this->resolveVarsInHeaders($headers, $fullInputData),
        ];
    }

    /**
     * Runs every single header value through TemplateString::resolve() with the full input data.
     *
     * @param array $headers
     * @param mixed[] $fullInputData
     * @return array
     */
    private function resolveVarsInHeaders(array $headers, array $fullInputData): array
    {
        foreach ($headers as $headerName => $headerValues) {
            foreach ($headerValues as $key => $headerValue) {
                $headers[$headerName][$key] = TemplateString::resolve($headerValue, $fullInputData);
            }
        }

        return $headers;
    }
}

================================================
FILE: src/Steps/Loading/HttpCrawl.php
================================================
> */
    protected array $urls = [];

    /**
     * @var array
     */
    protected array $loadedUrls = [];

    protected int $yieldedResponseCount = 0;

    public function __construct(array $headers = [], string $httpVersion = '1.1')
    {
        parent::__construct(headers: $headers, httpVersion: $httpVersion);
    }

    /**
     * Limit how many link levels are followed; see depthIsExceeded().
     */
    public function depth(int $depth): static
    {
        $this->depth = $depth;

        return $this;
    }

    /**
     * Only follow links on the same host as the input URL. Switches off the same-domain mode.
     */
    public function sameHost(): static
    {
        $this->sameHost = true;

        $this->sameDomain = false;

        return $this;
    }

    /**
     * Only follow links on the same domain as the input URL. Switches off the same-host mode.
     */
    public function sameDomain(): static
    {
        $this->sameDomain = true;

        $this->sameHost = false;

        return $this;
    }

    /**
     * Only URLs whose path starts with the given string match; see matchesPathCriteria().
     */
    public function pathStartsWith(string $startsWith = ''): static
    {
        $this->pathStartsWith = $startsWith;

        return $this;
    }

    /**
     * Only URLs whose path matches the given regex (used with preg_match()) match;
     * see matchesPathCriteria().
     */
    public function pathMatches(string $regexPattern = ''): static
    {
        $this->pathRegex = $regexPattern;

        return $this;
    }

    /**
     * Register a custom filter closure. It is called bound to this step with the Url and,
     * when available, the link element; see matchesCustomCriteria().
     */
    public function customFilter(Closure $closure): static
    {
        $this->customClosure = $closure;

        return $this;
    }

    /**
     * Treat the input URL as a sitemap: its URLs are read via getUrlsFromSitemap()
     * and the initial response itself is not yielded.
     */
    public function inputIsSitemap(): static
    {
        $this->inputIsSitemap = true;

        return $this;
    }

    /**
     * Also load URLs (on the same host/domain) that don't match the path/custom criteria,
     * but only yield responses of URLs that do match.
     */
    public function loadAllButYieldOnlyMatching(): static
    {
        $this->loadAll = true;

        return $this;
    }

    /**
     * Don't strip the fragment from found URLs; see handleUrlFragment().
     */
    public function keepUrlFragment(): static
    {
        $this->keepUrlFragment = true;

        return $this;
    }

    /**
     * Take canonical link URLs of documents into account to avoid yielding the same page twice;
     * see yieldResponse() and setResponseCanonicalUrl().
     */
    public function useCanonicalLinks(): static
    {
        $this->useCanonicalLinks = true;

        return $this;
    }

    protected function validateAndSanitizeInput(mixed $input): mixed
    {
        return
$this->validateAndSanitizeToUriInterface($input);
    }

    /**
     * Loads the input URL, yields its response when it matches all criteria (and the input is not
     * a sitemap), then keeps loading the found URLs level by level until the depth limit is
     * exceeded, no URLs are left, or the max outputs limit is reached.
     *
     * @param UriInterface $input
     * @throws Exception
     */
    protected function invoke(mixed $input): Generator
    {
        $this->setHostOrDomain($input);

        $response = $this->getResponseFromInputUri($input);

        if (!$response) {
            return;
        }

        // Hand over the logger, the same way loadUrls() does for all further documents.
        $initialResponseDocument = new Document($response, $this->logger);

        $this->setResponseCanonicalUrl($response, $initialResponseDocument);

        $this->addLoadedUrlsFromResponse($response);

        if (!$this->inputIsSitemap && $this->matchesAllCriteria(Url::parse($input))) {
            $this->yieldedResponseCount++;

            yield $response;
        }

        $this->urls = $this->getUrlsFromInitialResponse($response, $initialResponseDocument);

        $depth = 1;

        while (
            !$this->depthIsExceeded($depth) &&
            !empty($this->urls) &&
            (!$this->maxOutputs || $this->yieldedResponseCount < $this->maxOutputs)
        ) {
            yield from $this->loadUrls();

            $depth++;
        }
    }

    /**
     * Remembers the host (same-host mode) or the domain of the input URL, which
     * isOnSameHostOrDomain() compares found URLs against.
     *
     * @throws Exception when the input URL has no domain (only checked in domain mode)
     */
    protected function setHostOrDomain(UriInterface $uri): void
    {
        if ($this->sameHost) {
            $this->host = $uri->getHost();
        } else {
            $domain = Url::parse($uri)->domain();

            if (!is_string($domain) || empty($domain)) {
                throw new Exception('No domain in input url');
            }

            $this->domain = $domain;
        }
    }

    /**
     * Loads all URLs of the current level, yields the responses that should be yielded and
     * collects the URLs for the next level in $this->urls.
     *
     * @throws Exception
     */
    protected function loadUrls(): Generator
    {
        $newUrls = [];

        foreach ($this->urls as $url => $urlData) {
            $uri = Url::parsePsr7($url);

            $response = $this->getResponseFromInputUri($uri);

            if ($response !== null && !$this->wasAlreadyLoaded($response)) {
                $document = new Document($response, $this->logger);

                $this->setResponseCanonicalUrl($response, $document);

                // Must be decided before the URLs of this response are added to the loaded URLs.
                $shouldYield = $this->yieldResponse($document, $urlData['yield']);

                $this->addLoadedUrlsFromResponse($response);

                $newUrls = array_merge($newUrls, $this->getUrlsFromHtmlDocument($document));

                if ($shouldYield) {
                    yield $response;

                    $this->yieldedResponseCount++;

                    if ($this->maxOutputs && $this->yieldedResponseCount >= $this->maxOutputs) {
                        break;
                    }
                }
            }
        }

        $this->urls = $newUrls;
    }

    /**
     * @return array>
     * @throws Exception
     */
    protected function
getUrlsFromInitialResponse(RespondedRequest $respondedRequest, ?Document $document = null): array
    {
        if ($this->inputIsSitemap) {
            return $this->getUrlsFromSitemap($respondedRequest);
        }

        return $this->getUrlsFromHtmlDocument($document ?? new Document($respondedRequest));
    }

    /**
     * Collects the not yet known URLs from the loc elements of a sitemap response.
     *
     * Each returned key is a URL string, the 'yield' flag tells if the URL matches the criteria
     * besides host/domain (non-matching URLs are only included when loadAll is set).
     *
     * @return array>
     * @throws Exception
     */
    protected function getUrlsFromSitemap(RespondedRequest $respondedRequest): array
    {
        $sitemap = new XmlDocument(Http::getBodyString($respondedRequest));

        if (PhpVersion::isBelow(8, 4)) {
            $sitemap = GetUrlsFromSitemap::fixUrlSetTag($sitemap);
        }

        $foundUrls = [];

        foreach ($sitemap->querySelectorAll('urlset url loc') as $locNode) {
            $parsedUrl = $this->handleUrlFragment(Url::parse($locNode->text()));

            if (!$this->isOnSameHostOrDomain($parsedUrl)) {
                continue;
            }

            $matchesCriteria = $this->matchesCriteriaBesidesHostOrDomain($parsedUrl);

            if (!$matchesCriteria && !$this->loadAll) {
                continue;
            }

            $urlString = $parsedUrl->toString();

            $alreadyKnown = isset($foundUrls[$urlString]) ||
                isset($this->urls[$urlString]) ||
                isset($this->loadedUrls[$urlString]);

            if (!$alreadyKnown) {
                $foundUrls[$urlString] = ['yield' => $matchesCriteria];
            }
        }

        return $foundUrls;
    }

    /**
     * @return array>
     * @throws Exception
     */
    protected function getUrlsFromHtmlDocument(Document $document): array
    {
        $this->addCanonicalUrlToLoadedUrls($document);

        $urls = [];

        foreach ($document->dom()->querySelectorAll('a') as $link) {
            if (GetLink::isSpecialNonHttpLink($link)) {
                continue;
            }

            try {
                $url = $this->handleUrlFragment($document->baseUrl()->resolve($link->getAttribute('href') ?? ''));
            } catch (Throwable) {
                $this->logger?->warning('Failed to resolve a link with href: ' .
$link->getAttribute('href'));

                continue;
            }

            if (!$this->isOnSameHostOrDomain($url)) {
                continue;
            }

            $matchesCriteria = $this->matchesCriteriaBesidesHostOrDomain($url, $link);

            if (!$matchesCriteria && !$this->loadAll) {
                continue;
            }

            $url = $url->toString();

            if (!isset($urls[$url]) && !isset($this->urls[$url]) && !isset($this->loadedUrls[$url])) {
                $urls[$url] = ['yield' => $matchesCriteria];
            }
        }

        return $urls;
    }

    /**
     * Remembers the requested URI and all redirect URLs of the response as loaded.
     */
    protected function addLoadedUrlsFromResponse(RespondedRequest $respondedRequest): void
    {
        $requestedUri = $respondedRequest->requestedUri();

        if (!isset($this->loadedUrls[$requestedUri])) {
            $this->loadedUrls[$requestedUri] = true;
        }

        foreach ($respondedRequest->redirects() as $redirectUrl) {
            if (!isset($this->loadedUrls[$redirectUrl])) {
                $this->loadedUrls[$redirectUrl] = true;
            }
        }
    }

    /**
     * If the loaded response had a redirect, it can be that it was a redirect to a page that was already loaded before.
     * In that case, don't yield that response again.
     *
     * @param RespondedRequest $respondedRequest
     * @return bool
     */
    protected function wasAlreadyLoaded(RespondedRequest $respondedRequest): bool
    {
        if (
            array_key_exists($respondedRequest->requestedUri(), $this->loadedUrls) ||
            array_key_exists($respondedRequest->effectiveUri(), $this->loadedUrls)
        ) {
            $this->logger?->info('Was already loaded before. Do not process this page again.');

            return true;
        }

        foreach ($respondedRequest->redirects() as $url) {
            if (array_key_exists($url, $this->loadedUrls)) {
                $this->logger?->info('Was already loaded before. 
Do not process this page again.');

                return true;
            }
        }

        return false;
    }

    /**
     * When canonical links are used, remembers the document's canonical URL as loaded.
     */
    protected function addCanonicalUrlToLoadedUrls(Document $document): void
    {
        if (!$this->useCanonicalLinks) {
            return;
        }

        $canonicalUrl = $document->canonicalUrl();

        if (!isset($this->loadedUrls[$canonicalUrl])) {
            $this->loadedUrls[$canonicalUrl] = true;
        }
    }

    /**
     * A response is yielded when its URL matches the defined criteria and, in case the user opted
     * in to using canonical links, the document's canonical URL is not among the loaded URLs yet.
     */
    protected function yieldResponse(Document $document, bool $urlMatchesCriteria): bool
    {
        return $urlMatchesCriteria && (
            !$this->useCanonicalLinks ||
            !array_key_exists($document->canonicalUrl(), $this->loadedUrls)
        );
    }

    /**
     * @throws Exception
     */
    protected function setResponseCanonicalUrl(RespondedRequest $respondedRequest, Document $document): void
    {
        if ($this->useCanonicalLinks && $respondedRequest->effectiveUri() !== $document->canonicalUrl()) {
            $this->logger?->info('Canonical link URL of this document is: ' .
$document->canonicalUrl());

            $respondedRequest->addRedirectUri($document->canonicalUrl());
        }
    }

    /**
     * True when a depth limit is set and the given level is beyond it.
     */
    protected function depthIsExceeded(int $depth): bool
    {
        if ($this->depth === null) {
            return false;
        }

        return $depth > $this->depth;
    }

    /**
     * @throws Exception
     */
    protected function matchesAllCriteria(Url $url, ?HtmlElement $linkElement = null): bool
    {
        if (!$this->isOnSameHostOrDomain($url)) {
            return false;
        }

        return $this->matchesCriteriaBesidesHostOrDomain($url, $linkElement);
    }

    /**
     * @throws Exception
     */
    protected function matchesCriteriaBesidesHostOrDomain(Url $url, ?HtmlElement $linkElement = null): bool
    {
        if (!$this->matchesPathCriteria($url)) {
            return false;
        }

        return $this->matchesCustomCriteria($url, $linkElement);
    }

    /**
     * Compares the host (same-host mode) or otherwise the domain of the URL with the one
     * remembered from the input URL.
     *
     * @throws Exception
     */
    protected function isOnSameHostOrDomain(Url $url): bool
    {
        return $this->sameHost
            ? $this->host === $url->host()
            : $this->domain === $url->domain();
    }

    /**
     * Checks the URL path against the optional path prefix and the optional path regex.
     *
     * @throws Exception
     */
    protected function matchesPathCriteria(Url $url): bool
    {
        if ($this->pathStartsWith === null && $this->pathRegex === null) {
            return true;
        }

        $path = $url->path() ?? '';

        if ($this->pathStartsWith !== null && !str_starts_with($path, $this->pathStartsWith)) {
            return false;
        }

        return $this->pathRegex === null || preg_match($this->pathRegex, $path) === 1;
    }

    /**
     * Without a custom closure everything matches, otherwise the (truthiness of the) closure's
     * return value decides. The closure is called bound to this step.
     */
    protected function matchesCustomCriteria(Url $url, ?HtmlElement $linkElement): bool
    {
        if ($this->customClosure === null) {
            return true;
        }

        return (bool) $this->customClosure->call($this, $url, $linkElement);
    }

    /**
     * Empties the URL's fragment, unless fragments shall be kept.
     *
     * @throws Exception
     */
    protected function handleUrlFragment(Url $url): Url
    {
        if ($this->keepUrlFragment) {
            return $url;
        }

        $url->fragment('');

        return $url;
    }
}

================================================
FILE: src/Steps/Loading/LoadingStep.php
================================================
loader = $loader;

        return $this;
    }

    /**
     * @param T $loader
     */
    public function withLoader(LoaderInterface $loader): static
    {
        $this->customLoader = $loader;

        return $this;
    }

    /**
     * @return T
     */
    protected function getLoader(): LoaderInterface
    {
        return $this->customLoader ??
$this->loader;
    }
}

================================================
FILE: src/Steps/Refiners/AbstractRefiner.php
================================================
logger = $logger;

        return $this;
    }

    /**
     * Logs a warning that the given refiner method can't handle a value of this type.
     */
    protected function logTypeWarning(string $staticRefinerMethod, mixed $value): void
    {
        $this->logger?->warning(
            'Refiner ' . $staticRefinerMethod . ' can\'t be applied to value of type ' . gettype($value),
        );
    }
}

================================================
FILE: src/Steps/Refiners/DateTime/DateTimeFormat.php
================================================
apply($value, function ($value) {
            if ($this->originFormat) {
                $parsed = DateTime::createFromFormat($this->originFormat, $value);
            } else {
                $parsed = $this->parseFromUnknownFormat($value);
            }

            // null comes from parseFromUnknownFormat(), which already logged a warning;
            // false is the failure value of DateTime::createFromFormat().
            if ($parsed === null) {
                return $value;
            } elseif ($parsed === false) {
                $this->logger?->warning(
                    'Failed parsing date/time "' . $value . '", so can\'t reformat it to requested format.',
                );

                return $value;
            }

            return $parsed->format($this->targetFormat);
        }, 'DateTimeRefiner::reformat()');
    }

    /**
     * Tries to parse a date/time string of unknown format via strtotime().
     *
     * Returns null (after logging a warning) when strtotime() fails or yields timestamp 0.
     * NOTE(review): a value that legitimately is the unix epoch is therefore also treated as
     * a failure - confirm that this is intended.
     */
    private function parseFromUnknownFormat(string $value): ?DateTime
    {
        $timestamp = strtotime($value);

        if ($timestamp === false || $timestamp === 0) {
            $this->logger?->warning(
                'Failed to automatically (without known format) parse date/time "' . $value . '", so can\'t reformat ' .
                'it to requested format.',
            );

            return null;
        }

        return (new DateTime())->setTimestamp($timestamp);
    }
}

================================================
FILE: src/Steps/Refiners/DateTimeRefiner.php
================================================
query;

        if (trim($selectorString) === '') {
            $this->logger?->warning(
                'Empty selector in remove HTML refiner. If you want HTML nodes to be removed, please define a ' .
'selector for those nodes.',
            );
        }

        // A selector given as string is treated as a CSS selector.
        if (is_string($selector)) {
            $selector = Dom::cssSelector($selector);
        }

        $this->selector = $selector;
    }

    /**
     * Removes all nodes matching the selector from the HTML value(s).
     *
     * When the value can't be parsed as HTML, a warning is logged and the value is returned as is.
     */
    public function refine(mixed $value): mixed
    {
        return $this->apply($value, function ($value) {
            try {
                $document = new HtmlDocument($value);
            } catch (Throwable $exception) {
                $this->logger?->warning(
                    'Failed parsing output as HTML in refiner to remove nodes from HTML: ' . $exception->getMessage(),
                );

                return $value;
            }

            if ($this->selector instanceof CssSelector) {
                $document->removeNodesMatchingSelector($this->selector->query);
            } else {
                $document->removeNodesMatchingXPath($this->selector->query);
            }

            // NOTE(review): the following condition appears to be truncated in this dump.
            if (str_contains($value, 'outerHtml();
            }

            return $document->querySelector('body')?->innerHtml() ?? $document->outerHtml();
        }, 'HtmlRefiner::remove()');
    }
}

================================================
FILE: src/Steps/Refiners/HtmlRefiner.php
================================================
logTypeWarning($staticRefinerMethod, $value);

            return $value;
        }

        // In arrays only the string elements are refined, all other elements stay untouched.
        if (is_array($value)) {
            foreach ($value as $key => $element) {
                if (is_string($element)) {
                    $value[$key] = $refiner($element);
                }
            }
        } else {
            $value = $refiner($value);
        }

        return $value;
    }
}

================================================
FILE: src/Steps/Refiners/String/StrAfterFirst.php
================================================
apply($value, function ($value) {
            if ($this->first === '') {
                return $value;
            }

            // When the search string isn't contained, the whole (trimmed) value is returned.
            $split = explode($this->first, $value, 2);

            $lastPart = end($split);

            return trim($lastPart);
        }, 'StringRefiner::afterFirst()');
    }
}

================================================
FILE: src/Steps/Refiners/String/StrAfterLast.php
================================================
apply($value, function ($value) {
            if ($this->last === '') {
                return '';
            }

            $split = explode($this->last, $value);

            $lastPart = end($split);

            return trim($lastPart);
        }, 'StringRefiner::afterLast()');
    }
}

================================================
FILE: src/Steps/Refiners/String/StrBeforeFirst.php
================================================
apply($value, function ($value) {
            if ($this->first === '') {
                return '';
            }

            return trim(explode($this->first, $value)[0]);
        }, 'StringRefiner::beforeFirst()');
    }
}

================================================
FILE: src/Steps/Refiners/String/StrBeforeLast.php
================================================
apply($value, function ($value) {
            if ($this->last === '') {
                return $value;
            }

            $split = explode($this->last, $value);

            // Search string not contained => return the value unchanged.
            if (count($split) === 1) {
                return $value;
            }

            array_pop($split);

            return trim(implode($this->last, $split));
        }, 'StringRefiner::beforeLast()');
    }
}

================================================
FILE: src/Steps/Refiners/String/StrBetweenFirst.php
================================================
apply($value, function ($value) {
            // An empty start string means: start at the beginning of the value.
            if ($this->start === '') {
                $splitAtStart = ['', $value];
            } else {
                $splitAtStart = explode($this->start, $value, 2);
            }

            if (count($splitAtStart) === 2) {
                if ($this->end === '') {
                    return trim($splitAtStart[1]);
                }

                return trim(explode($this->end, $splitAtStart[1])[0]);
            }

            return '';
        }, 'StringRefiner::betweenFirst()');
    }
}

================================================
FILE: src/Steps/Refiners/String/StrBetweenLast.php
================================================
apply($value, function ($value) {
            // An empty start string means: start at the beginning of the value.
            if ($this->start === '') {
                $splitAtStart = ['', $value];
            } else {
                $splitAtStart = explode($this->start, $value);
            }

            $lastPart = end($splitAtStart);

            if ($this->end === '') {
                return trim($lastPart);
            }

            return trim(explode($this->end, $lastPart)[0]);
        }, 'StringRefiner::betweenLast()');
    }
}

================================================
FILE: src/Steps/Refiners/String/StrReplace.php
================================================
apply($value, function ($value) {
            $replaced = str_replace($this->search, $this->replace, $value);

            return trim($replaced);
        }, 'StringRefiner::replace()');
    }
}

================================================
FILE: src/Steps/Refiners/StringRefiner.php
================================================
$url) {
                $value[$key] = $this->refine($url);
            }

            return $value;
        }

        if (!is_string($value) && !$value instanceof Url && !$value instanceof UriInterface) {
            $this->logTypeWarning($this->staticRefinerMethod(), $value);

            return $value;
        }

        if (!$value instanceof Url) {
            $value = Url::parse($value);
        }

        return $this->refineUrl($value);
    }

    /**
     * Name of the refiner method, used in the type warning log message.
     */
    abstract protected function staticRefinerMethod(): string;

    /**
     * Applies the actual change to the Url and returns the resulting URL string.
     */
    abstract protected function refineUrl(Url $url): string;
}

================================================
FILE: src/Steps/Refiners/Url/WithFragment.php
================================================
fragment($this->fragment);

        return (string) $url;
    }
}

================================================
FILE: src/Steps/Refiners/Url/WithHost.php
================================================
host($this->host);

        return (string) $url;
    }
}

================================================
FILE: src/Steps/Refiners/Url/WithPath.php
================================================
path($this->path);

        return (string) $url;
    }
}

================================================
FILE: src/Steps/Refiners/Url/WithPort.php
================================================
port($this->port);

        return (string) $url;
    }
}

================================================
FILE: src/Steps/Refiners/Url/WithQuery.php
================================================
query($this->query);

        return (string) $url;
    }
}

================================================
FILE: src/Steps/Refiners/Url/WithScheme.php
================================================
scheme($this->scheme);

        return (string) $url;
    }
}

================================================
FILE: src/Steps/Refiners/Url/WithoutPort.php
================================================
resetPort();

        return
(string) $url;
    }
}

================================================
FILE: src/Steps/Refiners/UrlRefiner.php
================================================
    /**
     * Remove attributes from a sitemap's <urlset> tag
     *
     * Symfony's DomCrawler component has problems when a sitemap's <urlset> tag contains certain attributes.
     * So, if the count of urls in the sitemap is zero, try to remove all attributes from the <urlset> tag.
     */
    public static function fixUrlSetTag(XmlDocument $dom): XmlDocument
    {
        if ($dom->querySelectorAll('urlset url')->count() === 0) {
            $xml = $dom->outerXml();

            // Replace the opening tag including all of its attributes with a bare one.
            // preg_replace() returns null on error, in that case the original XML is used.
            return new XmlDocument(preg_replace('/<urlset[^>]*>/', '<urlset>', $xml) ?? $xml);
        }

        return $dom;
    }

    /**
     * Yield arrays with the URL and its additional sitemap data instead of only the URL string.
     */
    public function withData(): static
    {
        $this->withData = true;

        return $this;
    }

    public function outputType(): StepOutputType
    {
        return $this->withData ? StepOutputType::AssociativeArrayOrObject : StepOutputType::Scalar;
    }

    /**
     * Yields one output per url element that has a loc child.
     *
     * @param XmlDocument $input
     */
    protected function invoke(mixed $input): Generator
    {
        if (PhpVersion::isBelow(8, 4)) {
            $input = self::fixUrlSetTag($input);
        }

        foreach ($input->querySelectorAll('urlset url') as $urlNode) {
            $locNode = $urlNode->querySelector('loc');

            if (!$locNode) {
                continue;
            }

            if ($this->withData) {
                yield $this->getWithAdditionalData($urlNode);
            } else {
                yield $locNode->text();
            }
        }
    }

    /**
     * @throws MissingZlibExtensionException
     */
    protected function validateAndSanitizeInput(mixed $input): mixed
    {
        return $this->validateAndSanitizeToXmlDocumentInstance($input);
    }

    /**
     * @return string[]
     */
    protected function getWithAdditionalData(XmlElement $urlNode): array
    {
        $data = ['url' => $urlNode->querySelector('loc')?->text() ??
''];

        $properties = ['lastmod', 'changefreq', 'priority'];

        foreach ($properties as $property) {
            $node = $urlNode->querySelector($property);

            if ($node) {
                $data[$property] = $node->text();
            }
        }

        return $data;
    }
}

================================================
FILE: src/Steps/Sitemap.php
================================================
*/
    abstract protected function invoke(mixed $input): Generator;

    /**
     * Validates and sanitizes the input value first, so invoke() only ever gets valid, sanitized input.
     *
     * Yields nothing when the max outputs limit is already exceeded, when there is no input to use,
     * when the input is invalid (that's only logged) or when unique inputs are required and this
     * input was already seen.
     *
     * @return Generator
     * @throws Exception
     */
    final public function invokeStep(Input $input): Generator
    {
        if ($this->maxOutputsExceeded()) {
            return;
        }

        $this->storeOriginalInput($input);

        $inputForStepInvocation = $this->getInputKeyToUse($input);

        if (!$inputForStepInvocation) {
            return;
        }

        try {
            $validInputValue = $this->validateAndSanitizeInput($inputForStepInvocation->get());
        } catch (InvalidArgumentException $exception) {
            $this->logInvalidInputException($exception, $inputForStepInvocation->get());

            return;
        }

        if ($this->uniqueInput !== false && !$this->inputOrOutputIsUnique(new Input($validInputValue))) {
            return;
        }

        if ($this->groupOutputsPerInput) {
            yield from $this->invokeAndYieldOneOutputPerInput($validInputValue, $input);
        } else {
            yield from $this->invokeAndYield($validInputValue, $input);
        }
    }

    /**
     * Callback that is called in a step group to adapt the input for further steps
     *
     * In groups all the steps are called with the same Input, but with this callback it's possible to adjust the input
     * for the following steps.
*/
    public function updateInputUsingOutput(Closure $closure): static
    {
        $this->updateInputUsingOutput = $closure;

        return $this;
    }

    /**
     * Sets the flag that is returned by shouldOutputBeExcludedFromGroupOutput().
     */
    public function excludeFromGroupOutput(): static
    {
        $this->excludeFromGroupOutput = true;

        return $this;
    }

    /**
     * Collect all outputs that the step yields for one input and yield them as one single output
     * (an array); see invokeAndYieldOneOutputPerInput().
     */
    public function oneOutputPerInput(): static
    {
        $this->groupOutputsPerInput = true;

        return $this;
    }

    public function shouldOutputBeExcludedFromGroupOutput(): bool
    {
        return $this->excludeFromGroupOutput;
    }

    /**
     * If the user set a callback to update the input (see above) => call it.
     *
     * The callback is called bound to this step, with the input value and the output value.
     * Its return value becomes the value of the returned Input. Without a callback the input is
     * returned unchanged.
     */
    public function callUpdateInputUsingOutput(Input $input, Output $output): Input
    {
        if ($this->updateInputUsingOutput instanceof Closure) {
            return $input->withValue(
                $this->updateInputUsingOutput->call($this, $input->get(), $output->get()),
            );
        }

        return $input;
    }

    /**
     * Validate and sanitize the incoming input value
     *
     * In child classes you can add this method to validate and sanitize the incoming input. The method is called
     * automatically when the step is invoked within the Crawler. It receives the value of the Input object and
     * whatever it returns is passed on to the invoke method as is.
     *
     * @throws InvalidArgumentException Throw this if the input value is invalid for this step.
*/
    protected function validateAndSanitizeInput(mixed $input): mixed
    {
        return $input;
    }

    /**
     * Returns the input as string (without a UTF-8 BOM), when it is a string or an object with a
     * __toString() method. An array with exactly one element is unwrapped first.
     *
     * @throws InvalidArgumentException
     */
    protected function validateAndSanitizeStringOrStringable(
        mixed $inputValue,
        string $exceptionMessage = 'Input must be string or stringable',
    ): string {
        $inputValue = $this->getSingleElementFromArray($inputValue);

        if (is_object($inputValue) && method_exists($inputValue, '__toString')) {
            return $this->removeUtf8BomFromString($inputValue->__toString());
        }

        if (is_string($inputValue)) {
            return $this->removeUtf8BomFromString($inputValue);
        }

        throw new InvalidArgumentException($exceptionMessage);
    }

    /**
     * Like validateAndSanitizeStringOrStringable(), but also accepts HTTP responses, of which the
     * body string is returned. From an array with multiple elements, the element with key
     * 'response' is used, if it exists.
     *
     * @throws InvalidArgumentException|MissingZlibExtensionException
     */
    protected function validateAndSanitizeStringOrHttpResponse(
        mixed $inputValue,
        string $exceptionMessage = 'Input must be string, stringable or HTTP response (RespondedRequest)',
        bool $allowOnlyRespondedRequest = false,
    ): string {
        if (is_array($inputValue) && count($inputValue) > 1 && array_key_exists('response', $inputValue)) {
            $inputValue = $inputValue['response'];
        }

        $inputValue = $this->getSingleElementFromArray($inputValue);

        if (
            $inputValue instanceof RespondedRequest ||
            ($inputValue instanceof ResponseInterface && !$allowOnlyRespondedRequest)
        ) {
            return $this->removeUtf8BomFromString(Http::getBodyString($inputValue));
        }

        return $this->validateAndSanitizeStringOrStringable($inputValue, $exceptionMessage);
    }

    /**
     * Returns the input as UriInterface. UriInterface instances are returned as they are, strings,
     * Url instances and other stringable objects are parsed.
     *
     * @throws InvalidArgumentException when the value is of another type or is not a valid URL
     */
    protected function validateAndSanitizeToUriInterface(
        mixed $inputValue,
        string $exceptionMessage = 'Input must be string, stringable or an instance of UriInterface or Crwlr\\Url',
    ): UriInterface {
        $inputValue = $this->getSingleElementFromArray($inputValue);

        if ($inputValue instanceof UriInterface) {
            return $inputValue;
        }

        if (
            is_string($inputValue) ||
            $inputValue instanceof Url ||
            (is_object($inputValue) && method_exists($inputValue, '__toString'))
        ) {
            try {
                return Url::parsePsr7((string) $inputValue);
            } catch (InvalidUrlException $exception) {
                // Keep the original exception as previous one, so the cause isn't lost.
                throw new InvalidArgumentException($exception->getMessage(), 0, $exception);
            }
        }

        throw new InvalidArgumentException($exceptionMessage);
    }

    /**
     * @throws MissingZlibExtensionException
     */
    protected function validateAndSanitizeToHtmlDocumentInstance(
        mixed $inputValue,
        string $exceptionMessage = 'Input must be string, stringable or HTTP response (RespondedRequest)',
    ): HtmlDocument {
        return new HtmlDocument($this->validateAndSanitizeStringOrHttpResponse($inputValue, $exceptionMessage));
    }

    /**
     * @throws MissingZlibExtensionException
     */
    protected function validateAndSanitizeToXmlDocumentInstance(
        mixed $inputValue,
        string $exceptionMessage = 'Input must be string, stringable or HTTP response (RespondedRequest)',
    ): XmlDocument {
        return new XmlDocument($this->validateAndSanitizeStringOrHttpResponse($inputValue, $exceptionMessage));
    }

    /**
     * Unwraps arrays that contain exactly one element, everything else is returned as is.
     */
    protected function getSingleElementFromArray(mixed $inputValue): mixed
    {
        if (is_array($inputValue) && count($inputValue) === 1) {
            return reset($inputValue);
        }

        return $inputValue;
    }

    /**
     * Invokes the step and yields an Output for every output value that passes the filters (and the
     * uniqueness check, if enabled). Stops as soon as the max outputs limit is exceeded.
     *
     * @throws Exception
     */
    private function invokeAndYield(mixed $validInputValue, Input $input): Generator
    {
        foreach ($this->invoke($validInputValue) as $outputData) {
            $outputData = $this->applyRefiners($outputData, $input->get());

            if ($this->maxOutputsExceeded()) {
                break;
            } elseif (!$this->passesAllFilters($outputData)) {
                continue;
            }

            // Non-array output is wrapped in an array with the output key, when one is defined.
            if (!is_array($outputData) && $this->outputKey) {
                $outputData = [$this->outputKey => $outputData];
            }

            $output = $this->makeOutput($outputData, $input);

            if ($this->uniqueOutput && !$this->inputOrOutputIsUnique($output)) {
                continue;
            }

            yield $output;

            $this->trackYieldedOutput();
        }
    }

    /**
     * Version of invokeAndYield() when oneOutputPerInput() was called.
*/
    private function invokeAndYieldOneOutputPerInput(mixed $validInputValue, Input $input): Generator
    {
        $collectedOutputs = [];

        foreach ($this->invoke($validInputValue) as $outputData) {
            $refinedOutputData = $this->applyRefiners($outputData, $input->get());

            if ($this->passesAllFilters($refinedOutputData)) {
                $collectedOutputs[] = $refinedOutputData;
            }
        }

        if ($this->outputKey) {
            $collectedOutputs = [$this->outputKey => $collectedOutputs];
        }

        $output = $this->makeOutput($collectedOutputs, $input);

        if (!$this->uniqueOutput || $this->inputOrOutputIsUnique($output)) {
            yield $output;

            $this->trackYieldedOutput();
        }
    }

    /**
     * Text files sometimes start with a so-called byte order mark (BOM). See:
     * https://stackoverflow.com/questions/53303571/why-does-the-filereader-stream-read-239-187-191-from-a-textfile
     * For UTF-8 these are the bytes 239, 187, 191 (0xEF, 0xBB, 0xBF). They are removed here, as they are
     * unnecessary and can cause issues when a string needs to start with a certain character.
     *
     * @param string $string
     * @return string
     */
    private function removeUtf8BomFromString(string $string): string
    {
        return str_starts_with($string, "\xEF\xBB\xBF") ? substr($string, 3) : $string;
    }

    private function logInvalidInputException(InvalidArgumentException $exception, mixed $input): void
    {
        $exceptionMessage = $exception->getMessage();

        $stepClassName = $this->getStepClassName();

        $logMessage = ($stepClassName ? 'The ' . $stepClassName . ' step' : 'A step') . ' was called with input ' .
            'that it can not work with: ' . $exceptionMessage;

        if (str_starts_with($exceptionMessage, 'Input must be string')) {
            $logMessage .= '. The invalid input is of type ' . gettype($input) .
'.';
        }

        $this->logger?->error($logMessage);
    }
}

================================================
FILE: src/Steps/StepInterface.php
================================================
*/
    public function invokeStep(Input $input): Generator;

    /**
     * @param string|string[]|null $keys
     */
    public function keep(string|array|null $keys = null): static;

    public function keepAs(string $key): static;

    /**
     * @param string|string[]|null $keys
     */
    public function keepFromInput(string|array|null $keys = null): static;

    public function keepInputAs(string $key): static;

    public function keepsAnything(): bool;

    public function keepsAnythingFromInputData(): bool;

    public function keepsAnythingFromOutputData(): bool;

    public function useInputKey(string $key): static;

    public function uniqueInputs(?string $key = null): static;

    public function uniqueOutputs(?string $key = null): static;

    public function where(string|FilterInterface $keyOrFilter, ?FilterInterface $filter = null): static;

    public function orWhere(string|FilterInterface $keyOrFilter, ?FilterInterface $filter = null): static;

    public function outputKey(string $key): static;

    public function maxOutputs(int $maxOutputs): static;

    public function resetAfterRun(): void;
}

================================================
FILE: src/Steps/StepOutputType.php
================================================
baseUrl = $input->effectiveUri();
        }

        return $this->validateAndSanitizeToXmlDocumentInstance($input);
    }
}

================================================
FILE: src/Stores/JsonFileStore.php
================================================
createTimestamp = time();

        touch($this->filePath());

        file_put_contents($this->filePath(), '[]');
    }

    /**
     * Appends the result to the JSON array in the store file.
     *
     * The whole file is read, decoded, extended and written back. Unreadable or invalid file
     * content is treated as an empty list. When the results can't be encoded as JSON, a
     * JsonException is thrown and the file is left untouched, instead of overwriting it with
     * empty content.
     *
     * @throws Exception
     */
    public function store(Result $result): void
    {
        $filePath = $this->filePath();

        $currentResultsFileContent = file_get_contents($filePath);

        $results = $currentResultsFileContent ? json_decode($currentResultsFileContent, true) : [];

        // Appending with [] only works on arrays (and null), so guard against decoded scalars.
        if (!is_array($results)) {
            $results = [];
        }

        $results[] = $result->toArray();

        file_put_contents($filePath, json_encode($results, JSON_THROW_ON_ERROR));
    }

    public function filePath(): string
    {
        return $this->storePath . '/' .
            ($this->filePrefix ? $this->filePrefix . '-' : '') . $this->createTimestamp . '.json';
    }
}

================================================
FILE: src/Stores/SimpleCsvFileStore.php
================================================
createTimestamp = time();

        touch($this->filePath());
    }

    /**
     * Appends the result as one line to the CSV file. Before the first result, a header line with
     * the result's keys is written. Array values are flattened to strings.
     *
     * @throws Exception when the file can't be opened
     */
    public function store(Result $result): void
    {
        $fileHandle = fopen($this->filePath(), 'a');

        if (!is_resource($fileHandle)) {
            throw new Exception('Failed to open file to store data');
        }

        if ($this->isFirstResult) {
            fputcsv($fileHandle, array_keys($result->toArray()), escape: '');

            $this->isFirstResult = false;
        }

        $resultArray = $result->toArray();

        if ($this->anyPropertyIsArray($result)) {
            $resultArray = $this->flattenResultArray($resultArray);
        }

        fputcsv($fileHandle, array_values($resultArray), escape: '');

        fclose($fileHandle);
    }

    public function filePath(): string
    {
        return $this->storePath . '/' .
            ($this->filePrefix ? $this->filePrefix . '-' : '') . $this->createTimestamp . '.csv';
    }

    protected function anyPropertyIsArray(Result $result): bool
    {
        foreach ($result->toArray() as $value) {
            if (is_array($value)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Joins the elements of array values with ' | ', so every property is a single CSV cell.
     *
     * @param mixed[] $result
     * @return array
     */
    protected function flattenResultArray(array $result): array
    {
        foreach ($result as $key => $value) {
            if (is_array($value)) {
                $result[$key] = implode(' | ', $value);
            }
        }

        return $result;
    }
}

================================================
FILE: src/Stores/Store.php
================================================
logger = $logger;

        return $this;
    }
}

================================================
FILE: src/Stores/StoreInterface.php
================================================
productToken;

        if ($this->version) {
            $botUserAgent .= '/' . $this->version;
        }

        if ($this->infoUri) {
            $botUserAgent .= '; +' . $this->infoUri;
        }

        return $botUserAgent .
')';
    }

    public function productToken(): string
    {
        return $this->productToken;
    }
}

================================================
FILE: src/UserAgents/BotUserAgentInterface.php
================================================
userAgent;
    }

    public static function mozilla5CompatibleBrowser(): self
    {
        return new self('Mozilla/5.0 (compatible)');
    }
}

================================================
FILE: src/UserAgents/UserAgentInterface.php
================================================
    /**
     * Makes sure that every header value is an array: non-array values are wrapped in one.
     *
     * @param array $headers
     * @return array
     */
    public static function normalize(array $headers): array
    {
        return array_map(
            static fn(mixed $value): array => is_array($value) ? $value : [$value],
            $headers,
        );
    }

    /**
     * Merges $mergeHeaders into $headers. Headers that don't exist yet are taken over as they are,
     * values of already existing headers are added via addTo().
     *
     * @param array> $headers
     * @param array> $mergeHeaders
     * @return array>
     */
    public static function merge(array $headers, array $mergeHeaders): array
    {
        foreach ($mergeHeaders as $headerName => $value) {
            if (array_key_exists($headerName, $headers)) {
                $headers = self::addTo($headers, $headerName, $value);

                continue;
            }

            $headers[$headerName] = $value;
        }

        return $headers;
    }

    /**
     * @param array> $headers
     * @param string $headerName
     * @param string|string[] $value
     * @return array>
     */
    public static function addTo(array $headers, string $headerName, string|array $value): array
    {
        if (!array_key_exists($headerName, $headers)) {
            $headers[$headerName] = is_array($value) ?
$value : [$value];
        } elseif (is_array($value)) {
            foreach ($value as $valueItem) {
                if (!in_array($valueItem, $headers[$headerName], true)) {
                    $headers[$headerName][] = $valueItem;
                }
            }
        } elseif (!in_array($value, $headers[$headerName], true)) {
            $headers[$headerName][] = $value;
        }

        return $headers;
    }
}

================================================
FILE: src/Utils/OutputTypeHelper.php
================================================
toArrayForResult();
        } elseif (method_exists($output, 'toArray')) {
            return $output->toArray();
        } elseif (method_exists($output, '__serialize')) {
            return $output->__serialize();
        }

        return (array) $output;
    }

    /**
     * Everything that is neither an associative array nor an object counts as scalar output.
     */
    public static function isScalar(mixed $output): bool
    {
        return !self::isAssociativeArrayOrObject($output);
    }

    public static function isAssociativeArrayOrObject(mixed $output): bool
    {
        if (self::isAssociativeArray($output)) {
            return true;
        }

        return is_object($output);
    }

    /**
     * An array counts as associative when its first key is a string. Empty arrays and
     * non-array values are not associative.
     */
    public static function isAssociativeArray(mixed $output): bool
    {
        return is_array($output) && is_string(array_key_first($output));
    }

    /**
     * Recursively converts all objects nested anywhere in the array to arrays.
     *
     * @param mixed[] $data
     * @return mixed[]
     */
    public static function recursiveChildObjectsToArray(array $data): array
    {
        foreach ($data as $key => $value) {
            $data[$key] = match (true) {
                is_object($value) => self::recursiveChildObjectsToArray(self::objectToArray($value)),
                is_array($value) => self::recursiveChildObjectsToArray($value),
                default => $value,
            };
        }

        return $data;
    }
}

================================================
FILE: src/Utils/RequestKey.php
================================================
request : $request;

        $data = [
            'requestMethod' => $request->getMethod(),
            'requestUri' => $request->getUri()->__toString(),
            'requestHeaders' => $request->getHeaders(),
            'requestBody' => Http::getBodyString($request),
        ];

        $data = self::removeIgnoreHeaders($data, $ignoreHeaders);

        $serialized = serialize($data);

        return md5($serialized);
    }

    /**
     * @param array $data
     * @param string[] $ignoreHeaders
     * @return array
     */
    private static
function removeIgnoreHeaders(array $data, array $ignoreHeaders): array
    {
        // Nothing to do when there is nothing to ignore or when no header map is present.
        if ($ignoreHeaders === [] || !isset($data['requestHeaders']) || !is_array($data['requestHeaders'])) {
            return $data;
        }

        // HTTP field names are case-insensitive, and PSR-7's getHeaders() keeps whatever casing the
        // request was built with. So compare lower-cased names instead of probing a fixed set of
        // spellings (as given / lower-case / Ucwords-With-Dashes). Otherwise, a header written like
        // "ACCEPT-ENCODING" would slip through and change the resulting key.
        $ignoredLookup = [];

        foreach ($ignoreHeaders as $ignoreHeader) {
            $ignoredLookup[strtolower($ignoreHeader)] = true;
        }

        foreach (array_keys($data['requestHeaders']) as $headerName) {
            // Purely numeric header names become int keys in PHP arrays, hence the string cast.
            if (isset($ignoredLookup[strtolower((string) $headerName)])) {
                unset($data['requestHeaders'][$headerName]);
            }
        }

        return $data;
    }
}
================================================ FILE: src/Utils/TemplateString.php ================================================
get($varName); } return ''; }, $string) ?? $string; } return $string; }

    /**
     * Removes one pair of matching surrounding quotes (single or double), if present, and afterwards
     * turns backslash-escaped quotes (\' and \") anywhere in the string into plain quotes.
     */
    private static function trimAndUnescapeQuotes(string $string): string
    {
        if (
            str_starts_with($string, '\'') && str_ends_with($string, '\'') ||
            str_starts_with($string, '"') && str_ends_with($string, '"')
        ) {
            $string = substr($string, 1, -1);
        }

        $string = str_replace(["\'", '\"'], ["'", '"'], $string);

        return $string;
    }
}
================================================ FILE: tests/Cache/CacheItemTest.php ================================================
value())->toBe('value'); expect($unserialized->key())->toBe('key123'); expect($unserialized->ttl)->toBe(123); expect($unserialized->createdAt->format('Y-m-d H:i:s'))->toBe('2023-01-10 12:10:00'); });

it('creates a key based on the value if you don\'t provide a key manually', function () {
    $item = new CacheItem('foo');

    expect($item->key())->toBeString();

    expect(strlen($item->key()))->toBeGreaterThan(0);
});

it('tells if it is expired already', function () {
    $item = new CacheItem('v', 'k', 10);

    expect($item->isExpired())->toBeFalse();

    $item = new CacheItem('v', 'k', 10, (new DateTimeImmutable())->sub(new DateInterval('PT9S')));

    expect($item->isExpired())->toBeFalse();

    $item = new CacheItem('v', 'k', 10, (new DateTimeImmutable())->sub(new DateInterval('PT11S')));

    expect($item->isExpired())->toBeTrue();
});
================================================ FILE: tests/Cache/FileCacheTest.php ================================================ set($item->cacheKey(), $item); } } function helper_respondedRequestWithRequestUrl(string $requestUrl): RespondedRequest { return new RespondedRequest(new Request('GET', $requestUrl), new Response()); } /** * Helper function to get the CacheItem instance, because FileCache::get() returns only * the value wrapped in the CacheItem object. */ function helper_getCacheItemByKey(string $key): ?CacheItem { $cacheFileContent = file_get_contents(helper_cachedir() . '/' . $key); $cacheItem = unserialize($cacheFileContent !== false ? $cacheFileContent : 'a:0:{}'); return $cacheItem instanceof CacheItem ? $cacheItem : null; } afterEach(function () { helper_resetCacheDir(); }); /** @var TestCase $this */ it('caches a simple value', function () { $cache = new FileCache(helper_cachedir()); $cache->set('user', 'otsch'); expect($cache->get('user'))->toBe('otsch'); }); it('caches RespondedRequest objects', function () { $respondedRequest = new RespondedRequest(new Request('GET', '/'), new Response()); $cache = new FileCache(helper_cachedir()); expect($cache->set($respondedRequest->cacheKey(), $respondedRequest))->toBeTrue() ->and(file_exists(helper_cachedir() . '/' . 
$respondedRequest->cacheKey()))->toBeTrue() ->and($cache->get($respondedRequest->cacheKey()))->toBeInstanceOf(RespondedRequest::class); }); it('checks if it has an item for a certain key', function () { $respondedRequest = new RespondedRequest(new Request('GET', '/'), new Response()); $cache = new FileCache(helper_cachedir()); $cache->set($respondedRequest->cacheKey(), $respondedRequest); expect($cache->has($respondedRequest->cacheKey()))->toBeTrue() ->and($cache->has('otherKey'))->toBeFalse(); }); it('does not return expired items', function () { $respondedRequest = new RespondedRequest(new Request('GET', '/'), new Response()); $cacheItem = new CacheItem( $respondedRequest, $respondedRequest->cacheKey(), 10, (new DateTimeImmutable())->sub(new DateInterval('PT11S')), ); $cache = new FileCache(helper_cachedir()); $cache->set($cacheItem->key(), $cacheItem); expect($cache->has($cacheItem->key()))->toBeFalse() ->and($cache->get($cacheItem->key()))->toBeNull(); }); it('deletes a cache item', function () { $respondedRequest = new RespondedRequest(new Request('GET', '/'), new Response()); $cache = new FileCache(helper_cachedir()); $cache->set($respondedRequest->cacheKey(), $respondedRequest); expect($cache->has($respondedRequest->cacheKey()))->toBeTrue(); $cache->delete($respondedRequest->cacheKey()); expect($cache->has($respondedRequest->cacheKey()))->toBeFalse(); }); it('deletes an expired cache item when has() is called with its key', function () { $cacheItem = new CacheItem('bar', 'foo', 10, (new DateTimeImmutable())->sub(new DateInterval('PT11S'))); $cache = new FileCache(helper_cachedir()); $cache->set('foo', $cacheItem); expect(file_exists(helper_cachedir() . '/foo'))->toBeTrue() ->and($cache->has('foo'))->toBeFalse() ->and(file_exists(helper_cachedir() . 
'/foo'))->toBeFalse(); }); it('deletes an expired cache item when get() is called with its key', function () { $cacheItem = new CacheItem('bar', 'foo', 10, (new DateTimeImmutable())->sub(new DateInterval('PT11S'))); $cache = new FileCache(helper_cachedir()); $cache->set('foo', $cacheItem); expect(file_exists(helper_cachedir() . '/foo'))->toBeTrue() ->and($cache->get('foo', 'defaultValue'))->toBe('defaultValue') ->and(file_exists(helper_cachedir() . '/foo'))->toBeFalse(); }); it('clears the whole cache', function () { $cacheItem1 = helper_respondedRequestWithRequestUrl('/foo'); $cacheItem2 = helper_respondedRequestWithRequestUrl('/bar'); $cacheItem3 = helper_respondedRequestWithRequestUrl('/baz'); $cache = new FileCache(helper_cachedir()); helper_addMultipleItemsToCache([$cacheItem1, $cacheItem2, $cacheItem3], $cache); expect($cache->has($cacheItem1->cacheKey()))->toBeTrue() ->and($cache->has($cacheItem2->cacheKey()))->toBeTrue() ->and($cache->has($cacheItem3->cacheKey()))->toBeTrue(); $cache->clear(); expect($cache->has($cacheItem1->cacheKey()))->toBeFalse() ->and($cache->has($cacheItem2->cacheKey()))->toBeFalse() ->and($cache->has($cacheItem3->cacheKey()))->toBeFalse(); }); it('gets multiple items', function () { $cacheItem1 = helper_respondedRequestWithRequestUrl('/foo'); $cacheItem2 = helper_respondedRequestWithRequestUrl('/bar'); $cacheItem3 = helper_respondedRequestWithRequestUrl('/baz'); $cache = new FileCache(helper_cachedir()); helper_addMultipleItemsToCache([$cacheItem1, $cacheItem2, $cacheItem3], $cache); $items = $cache->getMultiple([$cacheItem1->cacheKey(), $cacheItem2->cacheKey(), $cacheItem3->cacheKey()]); expect(reset($items)->request->getUri()->__toString())->toBe('/foo') ->and(next($items)->request->getUri()->__toString())->toBe('/bar') ->and(next($items)->request->getUri()->__toString())->toBe('/baz'); }); it('sets multiple items', function () { $cacheItem1 = helper_respondedRequestWithRequestUrl('/foo'); $cacheItem2 = 
helper_respondedRequestWithRequestUrl('/bar'); $cacheItem3 = helper_respondedRequestWithRequestUrl('/baz'); $cache = new FileCache(helper_cachedir()); $cache->setMultiple([ $cacheItem1->cacheKey() => $cacheItem1, $cacheItem2->cacheKey() => $cacheItem2, $cacheItem3->cacheKey() => $cacheItem3, ]); expect($cache->has($cacheItem1->cacheKey()))->toBeTrue() ->and($cache->has($cacheItem2->cacheKey()))->toBeTrue() ->and($cache->has($cacheItem3->cacheKey()))->toBeTrue(); }); it('deletes multiple items', function () { $cacheItem1 = helper_respondedRequestWithRequestUrl('/blog'); $cacheItem2 = helper_respondedRequestWithRequestUrl('/contact'); $cacheItem3 = helper_respondedRequestWithRequestUrl('/privacy'); $cache = new FileCache(helper_cachedir()); helper_addMultipleItemsToCache([$cacheItem1, $cacheItem2, $cacheItem3], $cache); $cache->deleteMultiple([$cacheItem1->cacheKey(), $cacheItem2->cacheKey(), $cacheItem3->cacheKey()]); expect($cache->has($cacheItem1->cacheKey()))->toBeFalse() ->and($cache->has($cacheItem2->cacheKey()))->toBeFalse() ->and($cache->has($cacheItem3->cacheKey()))->toBeFalse(); }); it('can still use legacy (pre CacheItem object) cache files', function () { $content = file_get_contents(__DIR__ . '/_cachefilecontent'); file_put_contents(helper_cachedir() . '/foo', $content); $cache = new FileCache(helper_cachedir()); expect($cache->has('foo'))->toBeTrue(); $cacheItem = $cache->get('foo'); expect($cacheItem)->toBeArray(); $respondedRequest = RespondedRequest::fromArray($cacheItem); expect($respondedRequest)->toBeInstanceOf(RespondedRequest::class) ->and($respondedRequest->requestedUri())->toBe( 'https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php', ); }); it('compresses cache data when useCompression() is used', function () { $data = <<set($respondedRequest->cacheKey(), $respondedRequest); $uncompressedFileSize = filesize(helper_cachedir() . '/' . 
$respondedRequest->cacheKey()); expect($uncompressedFileSize)->not()->toBeFalse(); if ($uncompressedFileSize === false) { throw new RuntimeException('Unable to determine cache file size.'); } clearstatcache(); // Results of filesize() are cached. Clear that to get correct result for compressed file size. $cache->useCompression(); $cache->set($respondedRequest->cacheKey(), $respondedRequest); $compressedFileSize = filesize(helper_cachedir() . '/' . $respondedRequest->cacheKey()); expect($compressedFileSize)->not()->toBeFalse(); if ($compressedFileSize === false) { throw new RuntimeException('Unable to determine cache file size.'); } expect($compressedFileSize)->toBeLessThan($uncompressedFileSize) // Didn't want to check for exact numbers, because I guess they could be a bit different on different systems. // But thought the diff should at least be more than 30% for the test to succeed. ->and($uncompressedFileSize - $compressedFileSize)->toBeGreaterThan($uncompressedFileSize * 0.3); }); it('gets compressed cache items', function () { $cache = new FileCache(helper_cachedir()); $cache->useCompression(); $respondedRequest = new RespondedRequest( new Request('GET', '/compression'), new Response(body: Utils::streamFor('Hello World')), ); $cache->set($respondedRequest->cacheKey(), $respondedRequest); $retrievedCacheItem = $cache->get($respondedRequest->cacheKey()); expect($retrievedCacheItem)->toBeInstanceOf(RespondedRequest::class) ->and(Http::getBodyString($retrievedCacheItem))->toBe('Hello World'); }); it('is also able to decode uncompressed cache files when useCompression() is used', function () { $cache = new FileCache(helper_cachedir()); $respondedRequest = new RespondedRequest(new Request('GET', '/yo'), new Response(body: Utils::streamFor('Yo'))); $cache->set($respondedRequest->cacheKey(), $respondedRequest); $retrievedCacheItem = $cache->get($respondedRequest->cacheKey()); expect($retrievedCacheItem) ->toBeInstanceOf(RespondedRequest::class) 
->and(Http::getBodyString($retrievedCacheItem)) ->toBe('Yo'); $cache->useCompression(); $retrievedCacheItem = $cache->get($respondedRequest->cacheKey()); expect($retrievedCacheItem) ->toBeInstanceOf(RespondedRequest::class) ->and(Http::getBodyString($retrievedCacheItem)) ->toBe('Yo'); }); it('can also read compressed cache files, when useCompression() is not used', function () { $cache = new FileCache(helper_cachedir()); $cache->useCompression(); $respondedRequest = new RespondedRequest(new Request('GET', '/no'), new Response(body: Utils::streamFor('No'))); $cache->set($respondedRequest->cacheKey(), $respondedRequest); $cache = new FileCache(helper_cachedir()); $retrievedCacheItem = $cache->get($respondedRequest->cacheKey()); expect($retrievedCacheItem) ->toBeInstanceOf(RespondedRequest::class) ->and(Http::getBodyString($retrievedCacheItem)) ->toBe('No'); }); test('you can change the default ttl', function () { $cache = new FileCache(helper_cachedir()); $cache->ttl(900); $respondedRequest = new RespondedRequest( new Request('GET', '/foo'), new Response(body: Utils::streamFor('bar')), ); $cache->set($respondedRequest->cacheKey(), $respondedRequest); $cacheItem = helper_getCacheItemByKey($respondedRequest->cacheKey()); expect($cacheItem)->toBeInstanceOf(CacheItem::class) ->and($cacheItem?->ttl)->toBe(900); }); it('prolongs the time to live for a single item', function () { $cache = new FileCache(helper_cachedir()); $cache->ttl(100); $respondedRequest = new RespondedRequest(new Request('GET', '/a'), new Response(body: Utils::streamFor('b'))); $cache->set($respondedRequest->cacheKey(), $respondedRequest); $cacheItem = helper_getCacheItemByKey($respondedRequest->cacheKey()); expect($cacheItem)->toBeInstanceOf(CacheItem::class) ->and($cacheItem?->ttl)->toBe(100); /** @var CacheItem $cacheItem */ $cache->prolong($cacheItem->key(), 200); $cacheItem = helper_getCacheItemByKey($cacheItem->key()); expect($cacheItem)->toBeInstanceOf(CacheItem::class) 
->and($cacheItem?->ttl)->toBe(200); }); it('prolongs the time to live for all items in the cache directory', function () { $cache = new FileCache(helper_cachedir()); $respondedRequest = new RespondedRequest(new Request('GET', '/a'), new Response(body: Utils::streamFor('b'))); $cache->set($key1 = $respondedRequest->cacheKey(), $respondedRequest, 100); $respondedRequest = new RespondedRequest(new Request('GET', '/c'), new Response(body: Utils::streamFor('d'))); $cache->set($key2 = $respondedRequest->cacheKey(), $respondedRequest, 200); $respondedRequest = new RespondedRequest(new Request('GET', '/e'), new Response(body: Utils::streamFor('f'))); $cache->set($key3 = $respondedRequest->cacheKey(), $respondedRequest, 300); $cacheItem = helper_getCacheItemByKey($key1); expect($cacheItem)->toBeInstanceOf(CacheItem::class) ->and($cacheItem?->ttl)->toBe(100); $cacheItem = helper_getCacheItemByKey($key2); expect($cacheItem)->toBeInstanceOf(CacheItem::class) ->and($cacheItem?->ttl)->toBe(200); $cacheItem = helper_getCacheItemByKey($key3); expect($cacheItem)->toBeInstanceOf(CacheItem::class) ->and($cacheItem?->ttl)->toBe(300); $cache->prolongAll(250); $cacheItem = helper_getCacheItemByKey($key1); expect($cacheItem)->toBeInstanceOf(CacheItem::class) ->and($cacheItem?->ttl)->toBe(250); $cacheItem = helper_getCacheItemByKey($key2); expect($cacheItem)->toBeInstanceOf(CacheItem::class) ->and($cacheItem?->ttl)->toBe(250); $cacheItem = helper_getCacheItemByKey($key3); // Prolonging sets the provided value, no matter if an item's previous ttl value was // higher than the new one. 
expect($cacheItem)->toBeInstanceOf(CacheItem::class) ->and($cacheItem?->ttl)->toBe(250); }); test('the get() and has() methods delete an expired item, but prolong does not', function () { $cache = new FileCache(helper_cachedir()); $resp = new RespondedRequest(new Request('GET', '/'), new Response()); // with get() $cacheItem = new CacheItem($resp, $resp->cacheKey(), 10, (new DateTimeImmutable())->sub(new DateInterval('PT11S'))); $cache->set($cacheItem->key(), $cacheItem); $cacheItem = $cache->get($cacheItem->key()); expect($cacheItem)->toBeNull() ->and(file_exists(helper_cachedir($resp->cacheKey())))->toBeFalse(); // with has() $cacheItem = new CacheItem($resp, $resp->cacheKey(), 10, (new DateTimeImmutable())->sub(new DateInterval('PT11S'))); $cache->set($cacheItem->key(), $cacheItem); $cache->has($cacheItem->key()); expect($cache->has($cacheItem->key()))->toBeFalse() ->and(file_exists(helper_cachedir($cacheItem->key())))->toBeFalse(); // with prolong() $cache->set($cacheItem->key(), $cacheItem); $cache->prolong($cacheItem->key(), 20); expect($cache->has($cacheItem->key()))->toBeTrue() ->and(file_exists(helper_cachedir($cacheItem->key())))->toBeTrue(); }); ================================================ FILE: tests/Cache/_cachefilecontent ================================================ a:8:{s:13:"requestMethod";s:3:"GET";s:10:"requestUri";s:74:"https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php";s:14:"requestHeaders";a:3:{s:4:"Host";a:1:{i:0;s:18:"www.crwlr.software";}s:10:"User-Agent";a:1:{i:0;s:117:"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 
Safari/537.36";}s:6:"Cookie";a:2:{i:0;s:20:"XSRF-TOKEN=xsrftoken";i:1;s:29:"crwlrsoftware_session=session";}}s:11:"requestBody";s:0:"";s:12:"effectiveUri";s:74:"https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php";s:18:"responseStatusCode";i:200;s:15:"responseHeaders";a:12:{s:6:"Server";a:1:{i:0;s:12:"nginx/1.21.4";}s:12:"Content-Type";a:1:{i:0;s:24:"text/html; charset=UTF-8";}s:17:"Transfer-Encoding";a:1:{i:0;s:7:"chunked";}s:10:"Connection";a:1:{i:0;s:10:"keep-alive";}s:4:"Vary";a:1:{i:0;s:15:"Accept-Encoding";}s:12:"X-Powered-By";a:1:{i:0;s:9:"PHP/8.1.1";}s:13:"Cache-Control";a:1:{i:0;s:17:"no-cache, private";}s:4:"Date";a:1:{i:0;s:29:"Tue, 03 Jan 2023 12:38:20 GMT";}s:10:"Set-Cookie";a:2:{i:0;s:81:"XSRF-TOKEN=xsrftoken; expires=Tue, 03-Jan-2023 14:38:20 GMT; Max-Age=7200; path=/";i:1;s:100:"crwlrsoftware_session=session; expires=Tue, 03-Jan-2023 14:38:20 GMT; Max-Age=7200; path=/; httponly";}s:15:"X-Frame-Options";a:2:{i:0;s:10:"SAMEORIGIN";i:1;s:4:"DENY";}s:16:"X-XSS-Protection";a:1:{i:0;s:13:"1; mode=block";}s:22:"X-Content-Type-Options";a:2:{i:0;s:7:"nosniff";i:1;s:7:"nosniff";}}s:12:"responseBody";s:39078:" Dealing with HTTP (Url) Query Strings in PHP - crwlr.software Blog

Dealing with HTTP (Url) Query Strings in PHP

2022-06-02

There is a new package in town called query-string. It allows to create, access and manipulate query strings for HTTP requests in a very convenient way. Here's a quick overview of what you can do with it and also how it can be used via the url package.

The last months I started thinking about improving how you can change a URL's query string. It all started with this tweet as an answer to @heychazza's tweet about a nice way to build URLs in javascript.

Screenshot of a tweet by @chrolear saying: In PHP you can use my url package to get and set query params as array. I Could maybe also add a method to set/add a single param 🤔

Then last week someone added this github issue for the url package, and it got me thinking more about this. I liked the suggested API to get and set query params, but I found that it's not enough for more complex query strings. As query strings are also used in POST requests and sent in the request body, I now finally added a separate query-string package and also implemented it in the url package.

Implementation in the Url Package

First off: I set the required PHP version for the new package to 8.0 as the last 7.x version (7.4) is already in the final "security fixes only" phase. The url package currently still requires only 7.2. As I probably plan another BC break for v2 of the url package, for now I just added the query-string package as suggestion to the composer.json. You can manually install it, when you're already on PHP 8.x and want to use the advanced query string functionality.

When you've installed it via

composer require crwlr/query-string

the new queryString() method of the Url class returns an instance of the Query class shipped with the new package. Here's a quick usage example:

$url = Url::parse('https://www.example.com/listing?page[number]=3&page[size]=25');

$url->queryString()
    ->get('page')
    ->set('number', '4');

var_dump($url->__toString());

// string(68) "https://www.example.com/listing?page%5Bnumber%5D=4&page%5Bsize%5D=25"

Standalone Usage

If you want to parse query strings standalone, not in the URL context, you can create an instance of the Query class from string or from array:

$query = Query::fromString('foo=bar&baz=quz');

$query = Query::fromArray(['foo' => 'bar', 'baz' => 'quz']);

Access

Here a quick example of different ways how to access query string params:

$fooValue = Query::fromString('foo=bar&baz=quz')->get('foo'); // string(3) "bar"

When the requested key is an array, the get() method returns another (child) Query instance that you can query further:

$fooBazValue = Query::fromString('foo[bar]=1&foo[baz]=2&foo[quz]=3')
    ->get('foo')
    ->get('baz'); // string(1) "2"

You can check if a certain key exists in the query:

$query = Query::fromString('foo=1&bar=2');

$query->has('bar'); // bool(true)

$query->has('baz'); // bool(false)

You can get the first or last element of an indexed array:

$query = Query::fromString('foo[]=1&foo[]=2&foo[]=3');

$query->first('foo'); // string(1) "1"

$query->last('foo');  // string(1) "3"

You can check if the value for a certain key is an array of a scalar value:

$query = Query::fromString('foo[]=1&foo[]=2&bar=3');

$query->isArray('foo'); // bool(true)

$query->isScalar('foo'); // bool(false)

$query->isArray('bar'); // bool(false)

$query->isScalar('bar'); // bool(true)

And of course you can then convert the query to a string or to an array again:

$query = Query::fromString('foo=bar&baz=quz');

$queryArray = $query->toArray();

// array(2) {
//   ["foo"]=>
//   string(3) "bar"
//   ["baz"]=>
//   string(3) "quz"
// }

$query = Query::fromArray(['foo' => 'bar', 'baz' => 'quz']);

$queryString = $query->toString(); // string(15) "foo=bar&baz=quz"

Manipulation

You can set a certain key:

$query = Query::fromString('foo=bar')->set('baz', 'quz');

// string(15) "foo=bar&baz=quz"

Also to an array:

$query = Query::fromString('foo=1&bar=2')
    ->set('baz', ['3', '4']);

// string(29) "foo=1&bar=2&baz[0]=3&baz[1]=4"

You can also append values to an existing array:

$query = Query::fromString('foo[]=1&foo[]=2')
    ->appendTo('foo', '3');

// string(26) "foo[0]=1&foo[1]=2&foo[2]=3"

$query = Query::fromString('foo[bar]=1&foo[baz]=2')
    ->appendTo('foo', ['quz' => '3']);

// string(32) "foo[bar]=1&foo[baz]=2&foo[quz]=3"

Remove keys or values from keys:

$query = Query::fromString('foo[]=1&foo[]=2&bar=3&baz=4')
    ->remove('foo');

// string(11) "bar=3&baz=4"

$query = Query::fromString('foo[]=1&foo[]=2&foo[]=3&foo[]=2')
    ->removeValueFrom('foo', '2');

// string(17) "foo[0]=1&foo[1]=3"

And you can filter or map queries with callback functions:

$query = Query::fromString('no1=12&no2=7&no3=23&no4=9&no5=10')
    ->filter(function ($value, $key) {
        return (int) $value >= 10;
    });

// string(20) "no1=12&no3=23&no5=10"

$query = Query::fromString('foo=1&bar=2&baz=3&quz=4')
    ->map(function ($value) {
        return (int) $value + 3;
    });

// string(23) "foo=4&bar=5&baz=6&quz=7"

For more details have a look at the documentation. If you're having any question or issues, don't be shy and reach out on twitter or github.

";} ================================================ FILE: tests/CrawlerTest.php ================================================ addStep($step); return $crawler; } /** @var TestCase $this */ test( 'The methods to define UserAgent, Logger and Loader instances are called in construct and the getter methods ' . 'always return the same instance.', function () { $crawler = new DummyTwo(); expect($crawler->getUserAgent()->testProperty)->toBe('foo') ->and($crawler->getLogger()->testProperty)->toBe('foo') ->and($crawler->getLoader()->testProperty)->toBe('foo') ->and($crawler->userAgentCalled)->toBe(1) ->and($crawler->loggerCalled)->toBe(1) ->and($crawler->loaderCalled)->toBe(1); $crawler->getUserAgent()->testProperty = 'bar'; $crawler->getLogger()->testProperty = 'bar'; $crawler->getLoader()->testProperty = 'bar'; $crawler->addStep(Http::get()); // adding steps passes on logger and loader, should use the same instances expect($crawler->getUserAgent()->testProperty)->toBe('bar') ->and($crawler->getLogger()->testProperty)->toBe('bar') ->and($crawler->getLoader()->testProperty)->toBe('bar') ->and($crawler->userAgentCalled)->toBe(1) ->and($crawler->loggerCalled)->toBe(1) ->and($crawler->loaderCalled)->toBe(1); }, ); it('gives you the current memory limit', function () { expect(Crawler::getMemoryLimit())->toBeString(); }); it('changes the current memory limit when allowed', function () { $currentLimit = Crawler::getMemoryLimit(); if ($currentLimit === '512M') { $newValue = '1G'; } else { $newValue = '512M'; } $setLimitReturnValue = Crawler::setMemoryLimit($newValue); if ($setLimitReturnValue === false) { expect(Crawler::getMemoryLimit())->toBe($currentLimit); } else { expect(Crawler::getMemoryLimit())->toBe($newValue); } }); test('You can set a single input for the first step using the input method', function () { $crawler = helper_getDummyCrawlerWithInputReturningStep(); $crawler->input('https://www.example.com'); $results = helper_generatorToArray($crawler->run()); 
expect($results[0]->toArray()['unnamed'])->toBe('https://www.example.com'); }); test('You can set multiple inputs by multiply calling the input method', function () { $crawler = helper_getDummyCrawlerWithInputReturningStep(); $crawler->input('https://www.crwl.io'); $crawler->input('https://www.otsch.codes'); $results = helper_generatorToArray($crawler->run()); expect($results[0]->toArray()['unnamed'])->toBe('https://www.crwl.io'); expect($results[1]->toArray()['unnamed'])->toBe('https://www.otsch.codes'); }); test('You can set multiple inputs using the inputs (plural) method', function () { $crawler = helper_getDummyCrawlerWithInputReturningStep(); $crawler->inputs(['https://www.crwl.io', 'https://www.otsch.codes']); $results = helper_generatorToArray($crawler->run()); expect($results[0]->toArray()['unnamed'])->toBe('https://www.crwl.io'); expect($results[1]->toArray()['unnamed'])->toBe('https://www.otsch.codes'); }); test('Initial inputs are reset after the crawler was run', function () { $crawler = helper_getDummyCrawlerWithInputReturningStep(); $crawler->inputs(['https://www.crwl.io', 'https://www.otsch.codes']); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(2); $crawler->input('https://fetzi.dev/'); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1); }); test('You can add steps and the Crawler class passes on its Logger and also its Loader if needed', function () { $step = Mockery::mock(StepInterface::class); $step->shouldReceive('addLogger')->once(); $crawler = helper_getDummyCrawler(); $crawler->addStep($step); $step = helper_getLoadingStep(); $step = Mockery::mock($step)->makePartial(); $step->shouldReceive('addLogger')->once(); $step->shouldReceive('setLoader')->once(); $step->shouldReceive('setParentCrawler')->once()->andReturnSelf(); /** @var Step $step */ $crawler->addStep($step); }); test('You can add steps and they are invoked when the Crawler is run', function () { $step1 = 
helper_getValueReturningStep('step1 output')->keepAs('step1'); $step2 = helper_getValueReturningStep('step2 output')->keepAs('step2'); $crawler = helper_getDummyCrawler() ->addStep($step1) ->addStep($step2); $crawler->input('randomInput'); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1) ->and($results[0]->toArray())->toBe(['step1' => 'step1 output', 'step2' => 'step2 output']); }); it('resets the initial inputs and calls the resetAfterRun method of all its steps', function () { $step = helper_getInputReturningStep()->uniqueOutputs(); $crawler = helper_getDummyCrawler() ->inputs(['input1', 'input1', 'input2']) ->addStep($step->keepAs('foo')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(2) ->and($results[0]->toArray())->toBe(['foo' => 'input1']) ->and($results[1]->toArray())->toBe(['foo' => 'input2']); $crawler->inputs(['input1', 'input3']); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(2) ->and($results[0]->toArray())->toBe(['foo' => 'input1']) ->and($results[1]->toArray())->toBe(['foo' => 'input3']); }); test('You can add a step group as a step and all it\'s steps are invoked when the Crawler is run', function () { $crawler = helper_getDummyCrawler(); $step1 = Mockery::mock(StepInterface::class); $step1->shouldReceive('invokeStep')->andReturn(helper_arrayToGenerator(['foo'])); $step1->shouldReceive('addLogger'); $step2 = Mockery::mock(StepInterface::class); $step2->shouldReceive('invokeStep')->andReturn(helper_arrayToGenerator(['bar'])); $step2->shouldReceive('addLogger'); $step3 = Mockery::mock(StepInterface::class); $step3->shouldReceive('invokeStep')->andReturn(helper_arrayToGenerator(['baz'])); $step3->shouldReceive('addLogger'); $crawler->addStep( Crawler::group() ->addStep($step1) ->addStep($step2) ->addStep($step3), ); expect(true)->toBeTrue(); // So pest doesn't complain that there is no assertion. 
}); /* ----------------------------- keep() and keepAs() ----------------------------- */ test('when you call keep() or keepAs() on a step, it keeps its output data until the end', function () { $crawler = helper_getDummyCrawler(); $crawler ->input('test') ->addStep( helper_getValueReturningStep(['father' => 'Karl', 'mother' => 'Ludmilla'])->keep(), ) ->addStep( helper_getValueReturningStep([ 'daughter1' => 'Elisabeth', 'son1' => 'Leon', 'son2' => 'Franz', 'daughter2' => 'Julia', 'daughter3' => 'Franziska', ])->keep(['daughter' => 'daughter2', 'son' => 'son2']), ) ->addStep(helper_getValueReturningStep('Lea')->keepAs('cousin')) ->addStep( helper_getValueReturningStep([ 'grandson1' => 'Jonah', 'granddaughter1' => 'Paula', 'granddaughter2' => 'Sophie', ]), ); $results = iterator_to_array($crawler->run()); expect($results[0]->toArray())->toBe([ 'father' => 'Karl', 'mother' => 'Ludmilla', 'daughter' => 'Julia', 'son' => 'Franz', 'cousin' => 'Lea', 'grandson1' => 'Jonah', 'granddaughter1' => 'Paula', 'granddaughter2' => 'Sophie', ]); }); it('immediately stops when keepAs() is not used with a scalar value output step', function () { $crawler = helper_getDummyCrawler(); $step1 = new class extends Step { public bool $wasCalled = false; protected function invoke(mixed $input): Generator { $this->wasCalled = true; yield ['father' => 'Karl', 'mother' => 'Ludmilla']; } public function outputType(): StepOutputType { return StepOutputType::AssociativeArrayOrObject; } }; $step2 = new class extends Step { protected function invoke(mixed $input): Generator { yield 'foo'; } public function outputType(): StepOutputType { return StepOutputType::Scalar; } }; $crawler ->input('test') ->addStep($step1->keep()) ->addStep($step2->keep()); try { $results = iterator_to_array($crawler->run()); } catch (PreRunValidationException $exception) { } expect($results ?? 
null)->toBeEmpty() ->and($step1->wasCalled)->toBeFalse() ->and($this->getActualOutputForAssertion())->toContain('Pre-Run validation error in step number 2') ->and($exception ?? null)->toBeInstanceOf(PreRunValidationException::class); }); it('sends all results to the Store when there is one and still yields the results', function () { $store = Mockery::mock(StoreInterface::class); $store->shouldReceive('addLogger'); $store->shouldReceive('store')->times(3); $crawler = helper_getDummyCrawler(); $crawler->input('gogogo'); $crawler->setStore($store); $step = new class extends Step { protected function invoke(mixed $input): Generator { yield 'one'; yield 'two'; yield 'three'; } }; $crawler->addStep($step->keepAs('number')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(3) ->and($results[0]->toArray())->toBe(['number' => 'one']) ->and($results[1]->toArray())->toBe(['number' => 'two']) ->and($results[2]->toArray())->toBe(['number' => 'three']); }); it( 'actually runs the crawler without the need to traverse results manually, when runAndTraverse is called', function () { $step = helper_getInputReturningStep(); $store = Mockery::mock(StoreInterface::class); $store->shouldReceive('addLogger'); $store->shouldNotReceive('store'); $crawler = helper_getDummyCrawler() ->addStep($step) ->setStore($store) ->input('test'); $crawler->run(); $store = Mockery::mock(StoreInterface::class); $store->shouldReceive('store', 'addLogger')->once(); $crawler = helper_getDummyCrawler() ->addStep($step) ->setStore($store) ->input('test'); $crawler->runAndTraverse(); }, ); it('yields only unique outputs from a step when uniqueOutput was called', function () { $crawler = helper_getDummyCrawler(); $crawler->addStep(helper_getInputReturningStep()->uniqueOutputs()); $crawler->inputs(['one', 'two', 'three', 'one', 'three', 'four', 'one', 'five', 'two']); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(5); }); it( 'cascades step 
outputs immediately and doesn\'t wait for the current step being called with all the inputs', function () { $step1 = new class extends Step { protected function invoke(mixed $input): Generator { $this->logger?->info('step1 called'); yield $input . ' step1-1'; yield $input . ' step1-2'; } }; $step2 = new class extends Step { protected function invoke(mixed $input): Generator { $this->logger?->info('step2 called'); yield $input . ' step2'; } }; $store = new class extends Store { public function store(Result $result): void { $this->logger?->info('Stored a result'); } }; $crawler = helper_getDummyCrawler() ->inputs(['input1', 'input2']) ->addStep($step1->keepAs('foo')) ->addStep($step2->keepAs('bar')) ->setStore($store); $crawler->runAndTraverse(); $output = $this->getActualOutputForAssertion(); $outputLines = explode("\n", $output); expect($outputLines[0])->toContain('step1 called') ->and($outputLines[1])->toContain('step2 called') ->and($outputLines[2])->toContain('Stored a result') ->and($outputLines[3])->toContain('step2 called') ->and($outputLines[4])->toContain('Stored a result') ->and($outputLines[5])->toContain('step1 called') ->and($outputLines[6])->toContain('step2 called') ->and($outputLines[7])->toContain('Stored a result') ->and($outputLines[8])->toContain('step2 called') ->and($outputLines[9])->toContain('Stored a result'); }, ); it( 'immediately calls the store for each final output', function () { $step1 = new class extends Step { protected function invoke(mixed $input): Generator { $this->logger?->info('step1 called'); yield '1-1'; yield '1-2'; } }; $step2 = new class extends Step { protected function invoke(mixed $input): Generator { $this->logger?->info('step2 called: ' . $input); yield $input . ' 2-1'; yield $input . ' 2-2'; } }; $step3 = new class extends Step { protected function invoke(mixed $input): Generator { $this->logger?->info('step3 called: ' . $input); yield $input . ' 3-1'; yield $input . 
' 3-2'; } }; $step4 = new class extends Step { protected function invoke(mixed $input): Generator { $this->logger?->info('step4 called: ' . $input); yield $input . ' 4-1'; yield $input . ' 4-2'; } }; $store = new class extends Store { public function store(Result $result): void { $this->logger?->info('Stored a result: ' . $result->get('unnamed')); } }; $crawler = helper_getDummyCrawler() ->input('input') ->addStep($step1) ->addStep($step2) ->addStep($step3) ->addStep($step4) ->setStore($store); $crawler->runAndTraverse(); $output = $this->getActualOutputForAssertion(); $outputLines = explode("\n", $output); expect($outputLines[0]) ->toContain('step1 called') ->and($outputLines[1])->toContain('step2 called: 1-1') ->and($outputLines[2])->toContain('step3 called: 1-1 2-1') ->and($outputLines[3])->toContain('step4 called: 1-1 2-1 3-1') ->and($outputLines[4])->toContain('Stored a result: 1-1 2-1 3-1 4-1') ->and($outputLines[5])->toContain('Stored a result: 1-1 2-1 3-1 4-2') ->and($outputLines[6])->toContain('step4 called: 1-1 2-1 3-2') ->and($outputLines[7])->toContain('Stored a result: 1-1 2-1 3-2 4-1') ->and($outputLines[8])->toContain('Stored a result: 1-1 2-1 3-2 4-2') ->and($outputLines[9])->toContain('step3 called: 1-1 2-2') ->and($outputLines[10])->toContain('step4 called: 1-1 2-2 3-1') ->and($outputLines[11])->toContain('Stored a result: 1-1 2-2 3-1 4-1') ->and($outputLines[12])->toContain('Stored a result: 1-1 2-2 3-1 4-2') ->and($outputLines[13])->toContain('step4 called: 1-1 2-2 3-2') ->and($outputLines[14])->toContain('Stored a result: 1-1 2-2 3-2 4-1') ->and($outputLines[15])->toContain('Stored a result: 1-1 2-2 3-2 4-2') ->and($outputLines[16])->toContain('step2 called: 1-2') ->and($outputLines[17])->toContain('step3 called: 1-2 2-1') ->and($outputLines[18])->toContain('step4 called: 1-2 2-1 3-1') ->and($outputLines[19])->toContain('Stored a result: 1-2 2-1 3-1 4-1') ->and($outputLines[20])->toContain('Stored a result: 1-2 2-1 3-1 4-2') 
->and($outputLines[21])->toContain('step4 called: 1-2 2-1 3-2') ->and($outputLines[22])->toContain('Stored a result: 1-2 2-1 3-2 4-1') ->and($outputLines[23])->toContain('Stored a result: 1-2 2-1 3-2 4-2') ->and($outputLines[24])->toContain('step3 called: 1-2 2-2') ->and($outputLines[25])->toContain('step4 called: 1-2 2-2 3-1') ->and($outputLines[26])->toContain('Stored a result: 1-2 2-2 3-1 4-1') ->and($outputLines[27])->toContain('Stored a result: 1-2 2-2 3-1 4-2') ->and($outputLines[28])->toContain('step4 called: 1-2 2-2 3-2') ->and($outputLines[29])->toContain('Stored a result: 1-2 2-2 3-2 4-1') ->and($outputLines[30])->toContain('Stored a result: 1-2 2-2 3-2 4-2'); }, ); it( 'does not wait for all child outputs originating from an output of a step where keepAs() was called before ' . 'calling the store', function () { $step1 = new class extends Step { protected function invoke(mixed $input): Generator { $this->logger?->info('step1 called'); yield '1-1'; yield '1-2'; } }; $step2 = new class extends Step { protected function invoke(mixed $input): Generator { $this->logger?->info('step2 called: ' . $input); yield $input . ' 2-1'; yield $input . ' 2-2'; } }; $step2->keepAs('foo'); $step3 = new class extends Step { protected function invoke(mixed $input): Generator { $this->logger?->info('step3 called: ' . $input); yield $input . ' 3-1'; yield $input . ' 3-2'; } }; $step4 = new class extends Step { protected function invoke(mixed $input): Generator { $this->logger?->info('step4 called: ' . $input); yield $input . ' 4-1'; yield $input . ' 4-2'; } }; $step4->keepAs('bar'); $store = new class extends Store { public function store(Result $result): void { $this->logger?->info('Stored a result: ' . 
$result->get('bar')); } }; $crawler = helper_getDummyCrawler() ->input('input') ->addStep($step1) ->addStep($step2) ->addStep($step3) ->addStep($step4) ->setStore($store); $crawler->runAndTraverse(); $output = $this->getActualOutputForAssertion(); $outputLines = explode("\n", $output); expect($outputLines[0])->toContain('step1 called') ->and($outputLines[1])->toContain('step2 called: 1-1') ->and($outputLines[2])->toContain('step3 called: 1-1 2-1') ->and($outputLines[3])->toContain('step4 called: 1-1 2-1 3-1') ->and($outputLines[4])->toContain('Stored a result: 1-1 2-1 3-1 4-1') ->and($outputLines[5])->toContain('Stored a result: 1-1 2-1 3-1 4-2') ->and($outputLines[6])->toContain('step4 called: 1-1 2-1 3-2') ->and($outputLines[7])->toContain('Stored a result: 1-1 2-1 3-2 4-1') ->and($outputLines[8])->toContain('Stored a result: 1-1 2-1 3-2 4-2') ->and($outputLines[9])->toContain('step3 called: 1-1 2-2') ->and($outputLines[10])->toContain('step4 called: 1-1 2-2 3-1') ->and($outputLines[11])->toContain('Stored a result: 1-1 2-2 3-1 4-1') ->and($outputLines[12])->toContain('Stored a result: 1-1 2-2 3-1 4-2') ->and($outputLines[13])->toContain('step4 called: 1-1 2-2 3-2') ->and($outputLines[14])->toContain('Stored a result: 1-1 2-2 3-2 4-1') ->and($outputLines[15])->toContain('Stored a result: 1-1 2-2 3-2 4-2') ->and($outputLines[16])->toContain('step2 called: 1-2') ->and($outputLines[17])->toContain('step3 called: 1-2 2-1') ->and($outputLines[18])->toContain('step4 called: 1-2 2-1 3-1') ->and($outputLines[19])->toContain('Stored a result: 1-2 2-1 3-1 4-1') ->and($outputLines[20])->toContain('Stored a result: 1-2 2-1 3-1 4-2') ->and($outputLines[21])->toContain('step4 called: 1-2 2-1 3-2') ->and($outputLines[22])->toContain('Stored a result: 1-2 2-1 3-2 4-1') ->and($outputLines[23])->toContain('Stored a result: 1-2 2-1 3-2 4-2') ->and($outputLines[24])->toContain('step3 called: 1-2 2-2') ->and($outputLines[25])->toContain('step4 called: 1-2 2-2 3-1') 
->and($outputLines[26])->toContain('Stored a result: 1-2 2-2 3-1 4-1') ->and($outputLines[27])->toContain('Stored a result: 1-2 2-2 3-1 4-2') ->and($outputLines[28])->toContain('step4 called: 1-2 2-2 3-2') ->and($outputLines[29])->toContain('Stored a result: 1-2 2-2 3-2 4-1') ->and($outputLines[30])->toContain('Stored a result: 1-2 2-2 3-2 4-2'); }, ); it('logs memory usage if you want it to', function () { $step1 = helper_getValueReturningStep('foo'); $step2 = helper_getValueReturningStep('bar'); $crawler = helper_getDummyCrawler() ->input('go') ->addStep($step1) ->addStep($step2) ->monitorMemoryUsage(); $crawler->runAndTraverse(); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('memory usage: '); }); it('sends all outputs to the outputHook when defined', function () { $outputs = []; $crawler = helper_getDummyCrawler() ->input(1) ->addStep(helper_getNumberIncrementingStep()) ->addStep(helper_getNumberIncrementingStep()) ->outputHook(function (Output $output, int $stepIndex, StepInterface $step) use (&$outputs) { $outputs[$stepIndex][] = $output->get(); }); $crawler->runAndTraverse(); expect($outputs)->toHaveCount(2) ->and($outputs[0])->toHaveCount(1) ->and($outputs[0][0])->toBe(2) ->and($outputs[1])->toHaveCount(1) ->and($outputs[1][0])->toBe(3); }); test( 'When result is not explicitly composed and last step produces array output with string keys, it uses those keys ' . 
'for the result.', function () { $crawler = helper_getDummyCrawler() ->input('hello') ->addStep(helper_getValueReturningStep(['foo' => 'bar', 'baz' => 'quz'])); $results = helper_generatorToArray($crawler->run()); expect($results[0]->toArray())->toBe(['foo' => 'bar', 'baz' => 'quz']); }, ); it('just runs the crawler and dumps all results as array when runAndDump() is called', function () { helper_getDummyCrawlerWithInputReturningStep() ->inputs([ ['foo' => 'one', 'bar' => 'two'], ['baz' => 'three', 'quz' => 'four'], ]) ->runAndDump(); $actualOutput = $this->getActualOutputForAssertion(); expect(explode('array(2)', $actualOutput))->toHaveCount(3) ->and($actualOutput)->toContain('["foo"]=>') ->and($actualOutput)->toContain('string(3) "one"') ->and($actualOutput)->toContain('["bar"]=>') ->and($actualOutput)->toContain('string(3) "two"') ->and($actualOutput)->toContain('["baz"]=>') ->and($actualOutput)->toContain('string(5) "three"') ->and($actualOutput)->toContain('["quz"]=>') ->and($actualOutput)->toContain('string(4) "four"'); }); ================================================ FILE: tests/HttpCrawler/AnonymousHttpCrawlerBuilderTest.php ================================================ withBotUserAgent('YoloCrawler'); expect($crawler)->toBeInstanceOf(HttpCrawler::class) ->and($crawler->getLoader())->toBeInstanceOf(HttpLoader::class); $loader = $crawler->getLoader(); expect($loader->userAgent())->toBeInstanceOf(BotUserAgent::class); $userAgent = $loader->userAgent(); /** @var BotUserAgent $userAgent */ expect($userAgent->productToken())->toBe('YoloCrawler'); }); it('creates an HttpCrawler instance with a non bot user agent', function () { $crawler = HttpCrawler::make() ->withUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ...'); expect($crawler)->toBeInstanceOf(HttpCrawler::class) ->and($crawler->getLoader())->toBeInstanceOf(HttpLoader::class); $loader = $crawler->getLoader(); 
expect($loader->userAgent())->toBeInstanceOf(UserAgent::class); $userAgent = $loader->userAgent(); /** @var UserAgent $userAgent */ expect($userAgent->__toString())->toBe('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ...'); }); it('creates an HttpCrawler instance with a mozilla 5.0 compatible user agent', function () { $crawler = HttpCrawler::make()->withMozilla5CompatibleUserAgent(); $userAgent = $crawler->getLoader()->userAgent(); expect($userAgent->__toString())->toBe('Mozilla/5.0 (compatible)'); }); ================================================ FILE: tests/IoTest.php ================================================ toBeInstanceOf(Io::class); }); test('you can add an array with data that should be kept (see Step::keep() functionality)', function () { $keep = ['foo' => 'bar', 'baz' => 'quz']; $io = helper_getIoInstance('test', keep: $keep); expect($io->keep)->toBe($keep); }); test('you can create it from another Io instance and it keeps the value of the original instance.', function () { $io1 = helper_getIoInstance('test'); $io2 = helper_getIoInstance($io1); expect($io2->get())->toBe('test'); }); test('when created from another Io instance it passes on the data to keep', function () { $io1 = helper_getIoInstance('test', keep: ['co' => 'derotsch']); $io2 = helper_getIoInstance($io1); expect($io2->keep)->toBe(['co' => 'derotsch']); }); test('the withValue() method creates a new instance with that value but keeps the keep data', function () { $io1 = helper_getIoInstance('hey', ['baz' => 'three']); $io2 = $io1->withValue('ho'); expect($io2->get())->toBe('ho') ->and($io2->keep)->toBe(['baz' => 'three']); }); test( 'the withPropertyValue() method creates a new instance and replaces a certain property in its array value', function () { $io1 = helper_getIoInstance(['a' => '1', 'b' => '2', 'c' => '3'], ['baz' => 'three']); $io2 = $io1->withPropertyValue('c', '4'); expect($io2->get())->toBe(['a' => '1', 'b' => '2', 'c' => '4']) 
->and($io2->keep)->toBe(['baz' => 'three']); }, ); test('if the property does not exist, it is added, when withPropertyValue() is used', function () { $io1 = helper_getIoInstance(['a' => '1', 'b' => '2']); $io2 = $io1->withPropertyValue('c', '3'); expect($io2->get())->toBe(['a' => '1', 'b' => '2', 'c' => '3']); }); it('gets a particular property by key from array output', function () { $io = helper_getIoInstance(['foo' => 'so', 'bar' => 'lala', 'baz' => 'bla']); expect($io->getProperty('bar'))->toBe('lala'); }); it('when the property does not exist, getProperty() returns the defined fallback value (default null)', function () { $io = helper_getIoInstance(['foo' => 'so', 'bar' => 'lala', 'baz' => 'bla']); expect($io->getProperty('quz'))->toBeNull() ->and($io->getProperty('quz', 123))->toBe(123); }); it('sets a simple value key', function ($value, $key) { $io = helper_getIoInstance($value); expect($io->setKey())->toBe($key) ->and($io->getKey())->toBe($key); })->with([ ['foo', 'foo'], [123, '123'], [123.1234, '123.1234'], [true, 'true'], [false, 'false'], [null, 'null'], ]); it('sets a key from array output', function () { $io = helper_getIoInstance(['foo' => 'bar', 'yo' => 123.45]); expect($io->setKey('yo'))->toBe('123.45') ->and($io->getKey())->toBe('123.45'); }); it('sets a key from object output', function () { $value = helper_getStdClassWithData(['foo' => 'bar', 'yo' => 123.45]); $io = helper_getIoInstance($value); expect($io->setKey('yo'))->toBe('123.45') ->and($io->getKey())->toBe('123.45'); }); it('creates a string key for array output when not providing a key name', function () { $io = helper_getIoInstance(['one', 'two', 'three']); expect($io->setKey())->toBe('6975f1fd65cae4b21e32f4f47bf153a8') ->and($io->getKey())->toBe('6975f1fd65cae4b21e32f4f47bf153a8'); }); it('creates a string key for object output when not providing a key name', function () { $object = helper_getStdClassWithData(['one', 'two', 'three']); $io = helper_getIoInstance($object); 
expect($io->setKey())->toBe('bb8dd69ea029ca1379df3994721f5fa9')
        ->and($io->getKey())->toBe('bb8dd69ea029ca1379df3994721f5fa9');
});

// When the requested key name is missing from the value, setKey() falls back to the same generated string key
// that is produced when no key name is given at all (compare the expected keys with the tests above).
it('creates a string key for array output when provided key name doesn\'t exist in output array', function () {
    $io = helper_getIoInstance(['one', 'two', 'three']);

    expect($io->setKey('four'))->toBe('6975f1fd65cae4b21e32f4f47bf153a8')
        ->and($io->getKey())->toBe('6975f1fd65cae4b21e32f4f47bf153a8');
});

// Same fallback as above, but for an object value. The description previously said "array output", which was a
// copy-paste leftover from the array variant.
it('creates a string key for object output when provided key name doesn\'t exist in output object', function () {
    $object = helper_getStdClassWithData(['one', 'two', 'three']);

    $io = helper_getIoInstance($object);

    expect($io->setKey('four'))->toBe('bb8dd69ea029ca1379df3994721f5fa9')
        ->and($io->getKey())->toBe('bb8dd69ea029ca1379df3994721f5fa9');
});

// getKey() must be usable without an explicit setKey() call beforehand.
test('getKey returns a key when setKey was not called yet', function () {
    $io = helper_getIoInstance('test');

    expect($io->getKey())->toBe('test');
});

test('isArrayWithStringKeys returns true when the value is an array with string keys', function () {
    $io = helper_getIoInstance(['foo' => 'one', 'bar' => 'two', 'baz' => 'three']);

    expect($io->isArrayWithStringKeys())->toBeTrue();
});

// Data set: scalars, a list-style array and an object – none of them is an array with string keys.
test('isArrayWithStringKeys returns false when the value is not an array with string keys', function ($value) {
    $io = helper_getIoInstance($value);

    expect($io->isArrayWithStringKeys())->toBeFalse();
})->with([
    123,
    true,
    ['foo', 'bar'],
    helper_getStdClassWithData(['foo' => 'bar']),
]);

// A key that is already present in the kept data ('bar') is turned into an array holding the old and new value.
it('adds data to keep when calling keep() and makes already existing keys an array', function () {
    $io = helper_getIoInstance('value', keep: ['foo' => 'one', 'bar' => 'two']);

    $io->keep(['bar' => 'three', 'baz' => 'four']);

    expect($io->keep)->toBe(['foo' => 'one', 'bar' => ['two', 'three'], 'baz' => 'four']);
});

================================================
FILE: tests/Loader/Http/Browser/ScreenshotConfigTest.php
================================================
storePath)->toBe('/some/path')
        ->and($instance->fileType)->toBe('png')
->and($instance->quality)->toBeNull() ->and($instance->fullPage)->toBeFalse(); }); it('can be constructed via the static make() method', function () { $instance = ScreenshotConfig::make('/some/different/path'); expect($instance->storePath)->toBe('/some/different/path') ->and($instance->fileType)->toBe('png') ->and($instance->quality)->toBeNull() ->and($instance->fullPage)->toBeFalse(); }); test('the image file type can be changed to jpeg via the setImageFileType() method', function () { $instance = ScreenshotConfig::make('/some/path')->setImageFileType('jpeg'); expect($instance->fileType)->toBe('jpeg') ->and($instance->quality)->toBe(80); }); test('the image file type can be changed to webp via the setImageFileType() method', function () { $instance = ScreenshotConfig::make('/some/path')->setImageFileType('webp'); expect($instance->fileType)->toBe('webp') ->and($instance->quality)->toBe(80); }); test('the image file type can be changed to png via the setImageFileType() method', function () { $instance = ScreenshotConfig::make('/some/path')->setImageFileType('jpeg'); $instance->setImageFileType('png'); expect($instance->fileType)->toBe('png') ->and($instance->quality)->toBeNull(); }); test('setting the image file type to something different than png, jpeg or webp does not work', function () { $instance = ScreenshotConfig::make('/some/path')->setImageFileType('gif'); expect($instance->fileType)->toBe('png'); }); test('the image quality can be changed via setQuality()', function () { $instance = ScreenshotConfig::make('/some/path')->setImageFileType('jpeg')->setQuality(65); expect($instance->quality)->toBe(65); }); test('the image quality can not be changed via setQuality() when the file type is png', function () { $instance = ScreenshotConfig::make('/some/path')->setQuality(65); expect($instance->quality)->toBeNull(); }); test('the full page param can be set to true via setFullPage()', function () { $instance = ScreenshotConfig::make('/some/path')->setFullPage(); 
expect($instance->fullPage)->toBeTrue(); }); it('creates a config array for the chrome-php library', function () { $pageMock = Mockery::mock(Page::class); $instance = ScreenshotConfig::make('/some/path'); expect($instance->toChromePhpScreenshotConfig($pageMock))->toBe(['format' => 'png']); }); test('the config array for the chrome-php library contains the image quality', function () { $pageMock = Mockery::mock(Page::class); $instance = ScreenshotConfig::make('/some/path')->setImageFileType('webp')->setQuality(75); expect($instance->toChromePhpScreenshotConfig($pageMock))->toBe(['format' => 'webp', 'quality' => 75]); }); test('the config array has the necessary properties when fullPage is set to true', function () { $pageMock = Mockery::mock(Page::class); $pageMock->shouldReceive('getFullPageClip')->andReturn(Mockery::mock(Clip::class)); $instance = ScreenshotConfig::make('/some/path')->setFullPage(); $configArray = $instance->toChromePhpScreenshotConfig($pageMock); expect($configArray['format'])->toBe('png') ->and($configArray['captureBeyondViewport'])->toBeTrue() ->and($configArray['clip'])->toBeInstanceOf(Clip::class); }); ================================================ FILE: tests/Loader/Http/Cache/RetryManagerTest.php ================================================ = 400 when nothing else was defined', function (int $statusCode) { expect((new RetryManager())->shallBeRetried($statusCode))->toBeTrue(); })->with([[403], [404], [500], [503]]); it('returns false for status codes below 400 when nothing else was defined', function (int $statusCode) { expect((new RetryManager())->shallBeRetried($statusCode))->toBeFalse(); })->with([[100], [200], [302], [308]]); it( 'returns true for only one error status code when only() was used with an int', function (int $statusCode, bool $expected) { $retryManager = new RetryManager(); $retryManager->only(404); expect($retryManager->shallBeRetried($statusCode))->toBe($expected); }, )->with([ [401, false], [403, false], [404, 
true], [405, false], [500, false], [503, false], ]); it( 'returns true for only a set of error status codes when only() was used with an array', function (int $statusCode, bool $expected) { $retryManager = new RetryManager(); $retryManager->only([404, 503]); expect($retryManager->shallBeRetried($statusCode))->toBe($expected); }, )->with([ [401, false], [403, false], [404, true], [405, false], [500, false], [503, true], ]); it( 'returns true for all error status codes except one, when except() was used with an int', function (int $statusCode, bool $expected) { $retryManager = new RetryManager(); $retryManager->except(404); expect($retryManager->shallBeRetried($statusCode))->toBe($expected); }, )->with([ [401, true], [403, true], [404, false], [405, true], [500, true], [503, true], ]); it( 'returns true except for a set of error status codes, when except() was used with an array', function (int $statusCode, bool $expected) { $retryManager = new RetryManager(); $retryManager->except([403, 410, 500]); expect($retryManager->shallBeRetried($statusCode))->toBe($expected); }, )->with([ [401, true], [403, false], [404, true], [405, true], [410, false], [500, false], [503, true], ]); ================================================ FILE: tests/Loader/Http/Cookies/CookieJarTest.php ================================================ addFrom('https://www.crwl.io', new Response(200, [ 'Set-Cookie' => ['cook13=v4lu3; Secure'], ])); $allCookiesForDomain = $jar->allByDomain('crwl.io'); expect($allCookiesForDomain)->toHaveCount(1); }); test('addFrom works with an instance of UriInterface', function () { $jar = new CookieJar(); $jar->addFrom(Url::parsePsr7('https://www.crwl.io'), new Response(200, [ 'Set-Cookie' => ['cook13=v4lu3; Secure'], ])); $allCookiesForDomain = $jar->allByDomain('crwl.io'); expect($allCookiesForDomain)->toHaveCount(1); }); test('addFrom works with an instance of Url', function () { $jar = new CookieJar(); $jar->addFrom(Url::parse('https://www.crwl.io'), new 
Response(200, [ 'Set-Cookie' => ['cook13=v4lu3; Secure'], ])); $allCookiesForDomain = $jar->allByDomain('crwl.io'); expect($allCookiesForDomain)->toHaveCount(1); }); test('addFrom() works with a CookieCollection from the chrome-php lib', function () { $jar = new CookieJar(); $jar->addFrom(Url::parse('https://www.crwl.io'), new CookiesCollection([ new Cookie([ 'name' => 'foo', 'value' => 'one', 'domain' => '.www.crwl.io', 'expires' => '1745068860', 'max-age' => '86400', 'secure' => true, 'httpOnly' => true, 'sameSite' => 'Strict', ]), new Cookie([ 'name' => 'bar', 'value' => 'two', 'domain' => '.www.crwl.io', 'expires' => '1729603260.5272', 'path' => '/bar', ]), new Cookie([ 'name' => 'baz', 'value' => 'three', 'domain' => '.www.crwl.io', 'expires' => '1764076860.878', ]), ])); $allCookiesForDomain = $jar->allByDomain('crwl.io'); expect($allCookiesForDomain)->toHaveCount(3) ->and($allCookiesForDomain['foo']->expires()?->dateTime()->format('Y-m-d H:i'))->toBe('2025-04-19 13:21') ->and($allCookiesForDomain['foo']->name())->toBe('foo') ->and($allCookiesForDomain['foo']->value())->toBe('one') ->and($allCookiesForDomain['foo']->domain())->toBe('www.crwl.io') ->and($allCookiesForDomain['foo']->maxAge())->toBe(86400) ->and($allCookiesForDomain['foo']->path())->toBeNull() ->and($allCookiesForDomain['foo']->secure())->toBeTrue() ->and($allCookiesForDomain['foo']->httpOnly())->toBeTrue() ->and($allCookiesForDomain['foo']->sameSite())->toBe('Strict') ->and($allCookiesForDomain['bar']->expires()?->dateTime()->format('Y-m-d H:i'))->toBe('2024-10-22 13:21') ->and($allCookiesForDomain['bar']->name())->toBe('bar') ->and($allCookiesForDomain['bar']->value())->toBe('two') ->and($allCookiesForDomain['bar']->domain())->toBe('www.crwl.io') ->and($allCookiesForDomain['bar']->maxAge())->toBeNull() ->and($allCookiesForDomain['bar']->path())->toBe('/bar') ->and($allCookiesForDomain['bar']->secure())->toBeFalse() ->and($allCookiesForDomain['bar']->httpOnly())->toBeFalse() 
->and($allCookiesForDomain['bar']->sameSite())->toBe('Lax') ->and($allCookiesForDomain['baz']->expires()?->dateTime()->format('Y-m-d H:i'))->toBe('2025-11-25 13:21'); }); it('adds all cookies from a response', function () { $jar = new CookieJar(); $jar->addFrom(Url::parse('https://www.otsch.codes'), new Response(200, [ 'Set-Cookie' => ['cook13=v4lu3; Secure', 'anotherCookie=andItsValue', 'oneMoreCookie=dough'], ])); $allCookiesForDomain = $jar->allByDomain('otsch.codes'); expect($allCookiesForDomain)->toHaveCount(3); }); it('returns all cookies that should be sent to a url', function () { $jar = new CookieJar(); $jar->addFrom(Url::parse('https://www.otsch.codes/blog'), new Response(200, [ 'Set-Cookie' => [ 'cook13=v4lu3; Secure', '__Host-anotherCookie=andItsValue; Secure; Path=/', 'oneMoreCookie=dough', ], ])); expect($jar->getFor('https://www.otsch.codes/contact'))->toHaveCount(3) ->and($jar->getFor('https://jobs.otsch.codes/index'))->toHaveCount(2) ->and($jar->getFor('http://games.otsch.codes'))->toHaveCount(1); }); ================================================ FILE: tests/Loader/Http/Cookies/CookieTest.php ================================================ toBeInstanceOf(Cookie::class); }); test('It can be created with received from url as Url object', function () { $cookie = new Cookie(Url::parse('https://www.crwlr.software/packages'), 'cookieName=cookieValue'); expect($cookie)->toBeInstanceOf(Cookie::class); }); test('It provides the received from url as PSR-7 Uri object', function () { $cookie = new Cookie('https://www.crwlr.software/contact', 'cookieName=cookieValue'); expect($cookie->receivedFromUrl())->toBeInstanceOf(UriInterface::class); }); test('It must at least have a name and value', function () { new Cookie(Url::parse('https://www.crwlr.software/packages'), 'cookieNameWithoutValueIsInvalid'); })->throws(InvalidCookieException::class); test('It parses the name and value of the cookie', function () { $cookie = new 
Cookie('https://www.crwlr.software/blog', 'crwlrsoftware_session=foobar');

    expect($cookie->name())->toBe('crwlrsoftware_session');

    expect($cookie->value())->toBe('foobar');
});

// Attributes (Secure, Path, ...) must not be part of the string representation of the cookie.
test('The __toString() method returns name=value (only)', function () {
    $cookie = new Cookie('https://www.crwl.io', '__Secure-cook13N4m3=c00k1eV4lu3; Secure; Path=/');

    expect($cookie->__toString())->toBe('__Secure-cook13N4m3=c00k1eV4lu3');
});

test('It automatically sets the domain based on the received from url when no attribute is included', function () {
    // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Set-Cookie
    // If omitted, this attribute defaults to the host of the current document URL, not including subdomains.
    $cookie = new Cookie('https://www.otsch.codes/blog', 'otschcodes_session=cook13');

    expect($cookie->domain())->toBe('otsch.codes');
});

test('It parses an expires attribute when included', function () {
    $cookie = new Cookie(
        'https://www.otsch.codes/blog',
        'otschcodes_session=cook13; Expires=Wed, 23-Feb-2022 10:13:41 GMT',
    );

    expect($cookie->expires())->toBeInstanceOf(Date::class);

    // Nullsafe call, as in the other expires() assertions of the test suite: if expires() ever returns null,
    // the expectation fails with a readable message instead of a fatal "call on null" error, and no static
    // analysis suppression is needed.
    expect($cookie->expires()?->dateTime()->format('Y-m-d H:i'))->toBe('2022-02-23 10:13');
});

test('It parses a maxAge attribute when included', function () {
    $cookie = new Cookie('https://www.otsch.codes/blog', 'otschcodes_session=cook13; Max-Age=600');

    expect($cookie->maxAge())->toBeInt();

    expect($cookie->maxAge())->toBe(600);
});

test('It parses a domain attribute when included', function () {
    $cookie = new Cookie('https://sub.domain.example.com/foobar', 'fookie=cook13; domain=domain.example.com');

    expect($cookie->domain())->toBe('domain.example.com');
});

// Constructing the cookie is expected to throw, so there is nothing to assert inside the closure.
test('It\'s not allowed to set a different domain than the one of the document url it was received from', function () {
    new Cookie('https://sub.domain.example.com/foobar', 'fookie=cook13; domain=crwl.io');
})->throws(InvalidCookieException::class);

test('It\'s not allowed to set a subdomain that is not included
in the document url it was received from', function () { new Cookie('https://sub.domain.example.com/foobar', 'fookie=cook13; domain=foo.example.com'); })->throws(InvalidCookieException::class); test('When domain attribute is defined with leading dot, it\'s ignored', function () { $cookie = new Cookie('https://sub.domain.example.com/', 'fookie=cook13; domain=.domain.example.com'); expect($cookie->domain())->toBe('domain.example.com'); }); test('It parses a path attribute when included', function () { $cookie = new Cookie('https://sub.domain.example.com/foobar', 'co=asdf2345; path=/foobar'); expect($cookie->path())->toBe('/foobar'); }); test('It parses a secure attribute when included', function () { $cookie = new Cookie('https://sub.domain.example.com/foobar', 'co=asdf2345; Secure'); expect($cookie->secure())->toBeTrue(); }); test( 'It throws an exception when secure attribute is sent but url where it was received from is not on https', function () { new Cookie('http://www.example.io/foobar', 'eggs=ample; Secure'); }, )->throws(InvalidCookieException::class); test('It parses a SameSite attribute when included', function ($value) { $cookie = new Cookie('https://www.example.io/foobar', 'eggs=ample; SameSite=' . 
$value); expect($cookie->sameSite())->toBe($value); })->with(['Strict', 'Lax', 'None']);

// Constructing a cookie with a SameSite value other than the known ones is expected to throw.
test('It throws an error when an unknown value is sent for the SameSite attribute', function () {
    $setCookieHeader = 'eggs=ample; SameSite=Foo';

    new Cookie('https://www.example.io/foobar', $setCookieHeader);
})->throws(InvalidCookieException::class);

test('It parses an HttpOnly attribute when included', function () {
    $httpOnlyFlag = (new Cookie('https://jobs.foo.bar/', 'csrf=asdfjkloe123; HttpOnly'))->httpOnly();

    expect($httpOnlyFlag)->toBeTrue();
});

test('It\'s possible to set multiple attributes', function () {
    $parsed = new Cookie(
        'https://www.crwl.io',
        '__Secure-cook13N4m3=c00k1eV4lu3; Expires=Wed, 23-Feb-2022 10:13:41 GMT; Secure; Path=/foo',
    );

    expect($parsed->secure())->toBeTrue()
        ->and($parsed->expires()?->dateTime()->format('d.m.Y H:i'))->toBe('23.02.2022 10:13')
        ->and($parsed->path())->toBe('/foo');
});

// Prefixed cookie names received via plain http must be rejected.
test(
    'It throws an Exception when cookie name is prefixed with __Secure- or __Host- and not sent via https',
    function ($prefix) {
        $setCookieHeader = "{$prefix}Abc=defg123; Secure";

        new Cookie('http://example.com', $setCookieHeader);
    },
)->with(['__Secure-', '__Host-'])->throws(InvalidCookieException::class);

// Prefixed cookie names without the Secure flag must be rejected, even via https.
test(
    'It throws an Exception when cookie name is prefixed with __Secure- or __Host- and Secure flag is not included',
    function ($prefix) {
        $setCookieHeader = "{$prefix}Abc=defg123;";

        new Cookie('https://example.com', $setCookieHeader);
    },
)->with(['__Secure-', '__Host-'])->throws(InvalidCookieException::class);

test('Using __Secure- prefix works when received via https and Secure flag is included', function () {
    $hasPrefix = (new Cookie('https://www.crwl.io', '__Secure-Foo=bar123; Secure'))->hasSecurePrefix();

    expect($hasPrefix)->toBeTrue();
});

test('It throws an Exception when __Host- prefix used and Domain attribute included', function () {
    $setCookieHeader = '__Host-Foo=bar123; Secure; Domain=www.crwlr.software; Path=/';

    new Cookie('https://www.crwlr.software/', $setCookieHeader);
})->throws(InvalidCookieException::class);

test('It throws an Exception when __Host- prefix used and Path attribute is not included', function () {
    $setCookieHeader = '__Host-Foo=bar123; Secure;';

    new Cookie('https://www.crwlr.software/', $setCookieHeader);
})->throws(InvalidCookieException::class);

test('It throws an Exception when __Host- prefix used and Path attribute is not "/"', function () {
    $setCookieHeader = '__Host-Foo=bar123; Secure; Path=/foo';

    new Cookie('https://www.crwlr.software/', $setCookieHeader);
})->throws(InvalidCookieException::class);

test('Using __Host- works when everything is valid', function () {
    $hasPrefix = (new Cookie('https://www.crwlr.software/', '__Host-Foo=bar123; Secure; Path=/'))->hasHostPrefix();

    expect($hasPrefix)->toBeTrue();
});

// Dataset rows: [URL the cookie was received from, Domain attribute (or null), URL to check].
test(
    'It should not be sent to a url when the domain doesn\'t match',
    function ($receivedFrom, $domainAttribute, $shouldBeSentTo) {
        $setCookieHeader = 'cookie=value';

        if ($domainAttribute) {
            $setCookieHeader .= '; Domain=' . $domainAttribute;
        }

        $isSent = (new Cookie($receivedFrom, $setCookieHeader))->shouldBeSentTo($shouldBeSentTo);

        expect($isSent)->toBeFalse();
    },
)->with([
    ['https://www.crwlr.software', null, 'https://www.otsch.codes'],
    ['https://www.crwlr.software', 'www.crwlr.software', 'https://jobs.crwlr.software'],
    ['https://www.crwlr.software', 'www.crwlr.software', 'https://crwlr.software'],
    ['https://sub.domain.crwlr.software', 'sub.domain.crwlr.software', 'https://sab.domain.crwlr.software'],
    ['https://sub.domain.crwlr.software', 'sub.domain.crwlr.software', 'https://domain.crwlr.software'],
]);

// Dataset rows: [URL the cookie was received from, Domain attribute (or null), URL to check].
test('It should be sent to a url when the domain matches', function ($receivedFrom, $domainAttribute, $shouldBeSentTo) {
    $setCookieHeader = 'cookie=value';

    if ($domainAttribute) {
        $setCookieHeader .= '; Domain=' . $domainAttribute;
    }

    $isSent = (new Cookie($receivedFrom, $setCookieHeader))->shouldBeSentTo($shouldBeSentTo);

    expect($isSent)->toBeTrue();
})->with([
    ['https://www.crwlr.software', null, 'https://www.crwlr.software'],
    ['https://www.crwlr.software', null, 'https://crwlr.software'],
    ['https://www.crwlr.software', null, 'https://anything.crwlr.software'],
    ['https://sub.domain.crwlr.software', 'domain.crwlr.software', 'https://domain.crwlr.software'],
    ['https://sub.domain.crwlr.software', 'domain.crwlr.software', 'https://sab.domain.crwlr.software'],
]);

// Dataset rows: [URL the cookie was received from, URL to check].
test(
    'It should not be sent to a url when it has a __Host- prefix and hosts don\'t match exactly',
    function ($receivedFrom, $shouldBeSentTo) {
        $hostPrefixed = new Cookie($receivedFrom, '__Host-cookie=value; Secure; Path=/');

        $isSent = $hostPrefixed->shouldBeSentTo($shouldBeSentTo);

        expect($isSent)->toBeFalse();
    },
)->with([
    ['https://www.crwlr.software', 'https://jobs.crwlr.software'],
    ['https://sub.domain.crwlr.software', 'https://domain.crwlr.software'],
    ['https://subdomain.crwlr.software', 'https://sabdomain.crwlr.software'],
]);

test('It should not be sent to non https url when secure flag is included', function () {
    $secureCookie = new Cookie('https://www.crwl.io', 'cookie=value; Secure');

    $isSent = $secureCookie->shouldBeSentTo('http://www.crwl.io');

    expect($isSent)->toBeFalse();
});
test('It should be sent to https url when secure flag is included', function () {
    $secureCookie = new Cookie('https://www.crwl.io', 'cookie=value; Secure');

    $isSent = $secureCookie->shouldBeSentTo('https://www.crwl.io');

    expect($isSent)->toBeTrue();
});

// Dataset values are the hosts for which a Secure cookie is still expected to be sent via http.
test('It should be sent to non https url when secure flag is included but host is localhost', function ($host) {
    $secureCookie = new Cookie("https://{$host}", 'cookie=value; Secure');

    $isSent = $secureCookie->shouldBeSentTo("http://{$host}");

    expect($isSent)->toBeTrue();
})->with(['localhost', '127.0.0.1']);

// Dataset rows: [Path attribute of the cookie, path of the URL to check].
test(
    'It should not be sent to urls where the path doesn\'t match the sent path attribute',
    function ($path, $shouldBeSentTo) {
        $origin = 'https://www.crwlr.software';

        $pathCookie = new Cookie($origin, "cookie=value; Path={$path}");

        expect($pathCookie->shouldBeSentTo($origin . $shouldBeSentTo))->toBeFalse();
    },
)->with([
    ['/foo', '/bar'],
    ['/foo', '/foobar'],
    ['/foo', '/'],
    ['/foo', '/bar/foo'],
]);

// Dataset rows: [Path attribute of the cookie, path of the URL to check].
test(
    'It should be sent to urls where the path does match the sent path attribute',
    function ($path, $shouldBeSentTo) {
        $origin = 'https://www.crwlr.software';

        $pathCookie = new Cookie($origin, "cookie=value; Path={$path}");

        expect($pathCookie->shouldBeSentTo($origin . $shouldBeSentTo))->toBeTrue();
    },
)->with([
    ['/', '/anything'],
    ['/foo', '/foo'],
    ['/foo', '/foo/something'],
    ['/foo', '/foo/some/thing'],
]);

test('It should not be sent when already expired', function () {
    // Expires one second in the past (GMT).
    $oneSecondAgo = (new DateTime('now', new DateTimeZone('GMT')))->sub(new DateInterval('PT1S'));

    $expiredCookie = new Cookie(
        'https://www.crwlr.software',
        'cookie=value; Expires=' . $oneSecondAgo->format(DateTimeInterface::COOKIE),
    );

    expect($expiredCookie->shouldBeSentTo('https://www.crwlr.software'))->toBeFalse();
});

test('It should be sent when date of expires attribute is not reached', function () {
    // Expires five seconds in the future (GMT).
    $inFiveSeconds = (new DateTime('now', new DateTimeZone('GMT')))->add(new DateInterval('PT5S'));

    $validCookie = new Cookie(
        'https://www.crwlr.software',
        'cookie=value; Expires=' . $inFiveSeconds->format(DateTimeInterface::COOKIE),
    );

    expect($validCookie->shouldBeSentTo('https://www.crwlr.software'))->toBeTrue();
});

test('It should not be sent when maxAge attribute is already reached', function () {
    $target = 'https://www.crwlr.software';

    $shortLived = new Cookie($target, 'cookie=value; Max-Age=1');

    expect($shortLived->shouldBeSentTo($target))->toBeTrue();

    // instead of sleep, manipulate the timestamp when it was received.
    invade($shortLived)->receivedAtTimestamp -= 2;

    expect($shortLived->shouldBeSentTo($target))->toBeFalse();
});

test('It is immediately expired when the max-age attribute is zero or negative', function ($maxAgeValue) {
    $target = 'https://www.crwlr.software';

    $expiredCookie = new Cookie($target, "cookie=value; Max-Age={$maxAgeValue}");

    expect($expiredCookie->shouldBeSentTo($target))->toBeFalse();
})->with([0, -1, -5, -1000]);

test('It should be sent when maxAge attribute is not yet reached', function () {
    $target = 'https://www.crwlr.software';

    $freshCookie = new Cookie($target, 'cookie=value; Max-Age=1');

    expect($freshCookie->shouldBeSentTo($target))->toBeTrue();
});

================================================
FILE: tests/Loader/Http/Cookies/DateTest.php
================================================
toBeInstanceOf(Date::class); expect($date->dateTime()->format('Y-m-d H:i:s'))->toBe('2022-02-22 16:04:55'); });

test('It gets the timezone right', function () {
    $parsedDate = new Date('Tue, 22-Feb-2022 20:04:29 GMT');

    $inVienna = $parsedDate->dateTime()->setTimezone(new DateTimeZone('Europe/Vienna'));

    expect($inVienna->format('d.m.Y H:i:s'))->toBe('22.02.2022 21:04:29');
});

test('It also works without the dashes between d-M-Y in the format', function () {
    $parsedDate = new Date('Wed, 05 Jul 2023 15:19:55 GMT');

    $inVienna = $parsedDate->dateTime()->setTimezone(new DateTimeZone('Europe/Vienna'));

    expect($inVienna->format('d.m.Y H:i:s'))->toBe('05.07.2023 17:19:55');
});

================================================
FILE: tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php
================================================ shouldReceive('createBrowser'); if ($createBrowserArgsExpectationCallback) { $createBrowserExpectation->withArgs($createBrowserArgsExpectationCallback); } $createBrowserExpectation->andReturn($browserMock); $pageMock = Mockery::mock(Page::class); $browserMock->shouldReceive('createPage')->andReturn($pageMock); if ($browserMockCallback) { $browserMockCallback($browserMock); } $sessionMock = Mockery::mock(Session::class); $pageMock->shouldReceive('getSession')->andReturn($sessionMock); if ($pageSessionMockCallback) { $pageSessionMockCallback($sessionMock); } $pageMock->shouldReceive('getCookies')->andReturn(new CookiesCollection([])); $sessionMock->shouldReceive('once'); $pageNavigationMock = Mockery::mock(PageNavigation::class); $pageMock->shouldReceive('navigate')->andReturn($pageNavigationMock); $pageMock->shouldReceive('getHtml')->andReturn('Hello World!'); if ($pageMockCallback) { $pageMockCallback($pageMock); } $waitForNavigationCall = $pageNavigationMock->shouldReceive('waitForNavigation'); if ($pageNavigationArgsClosure) { $waitForNavigationCall->withArgs($pageNavigationArgsClosure); } return $browserFactoryMock; } it('uses the configured timeout', function () { $browserFactoryMock = helper_setUpHeadlessChromeMocks(function (string $event, int $timeout) { return $event === Page::LOAD && $timeout === 45_000; }); $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock); $helper->setTimeout(45_000); $response = $helper->navigateToPageAndGetRespondedRequest( new Request('GET', 'https://www.example.com/foo'), helper_getMinThrottler(), cookieJar: new CookieJar(), ); expect(Http::getBodyString($response))->toBe('Hello World!'); }); it('returns the configured timeout', function () { $helper = new HeadlessBrowserLoaderHelper(); expect($helper->getTimeout())->toBe(30_000); $helper->setTimeout(75_000); expect($helper->getTimeout())->toBe(75_000); }); it('waits for the configured browser navigation event', function 
() { $browserFactoryMock = helper_setUpHeadlessChromeMocks(function (string $event, int $timeout) { return $event === Page::FIRST_MEANINGFUL_PAINT && $timeout === 57_000; }); $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock); $helper ->waitForNavigationEvent(Page::FIRST_MEANINGFUL_PAINT) ->setTimeout(57_000); $response = $helper->navigateToPageAndGetRespondedRequest( new Request('GET', 'https://www.example.com/foo'), helper_getMinThrottler(), cookieJar: new CookieJar(), ); expect(Http::getBodyString($response))->toBe('Hello World!'); }); it('uses the correct executable', function () { $helper = new HeadlessBrowserLoaderHelper(); $helper->setExecutable('somethingthatdefinitelyisntachromeexecutable'); $invadedHelper = invade($helper); $exception = null; try { $invadedHelper->getBrowser(new Request('GET', 'https://www.example.com/foo')); } catch (Exception $exception) { } expect($exception)->not->toBeNull(); $chromeExecutable = (new AutoDiscover())->guessChromeBinaryPath(); $helper = new HeadlessBrowserLoaderHelper(); $helper->setExecutable($chromeExecutable); $invadedHelper = invade($helper); $invadedHelper->getBrowser(new Request('GET', 'https://www.example.com/foo')); $browserFactory = $invadedHelper->browserFactory; expect($browserFactory)->toBeInstanceOf(BrowserFactory::class); /** @var BrowserFactory $browserFactory */ $invadedBrowserFactory = invade($browserFactory); expect($invadedBrowserFactory->chromeBinary)->toBe($chromeExecutable); }); it('calls the temporary post navigate hooks once', function () { $browserFactoryMock = helper_setUpHeadlessChromeMocks( pageMockCallback: function (Mockery\MockInterface $pageMock) { $pageMock->shouldReceive('assertNotClosed')->once(); }, ); $logger = new DummyLogger(); $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock, $logger); $helper->setTempPostNavigateHooks([ function (Page $page, LoggerInterface $logger) { $logger->info('hook 1 called'); }, function (Page $page, LoggerInterface $logger) { 
$logger->info('hook 2 called'); }, function (Page $page, LoggerInterface $logger) { $logger->info('hook 3 called'); }, ]); $helper->navigateToPageAndGetRespondedRequest( new Request('GET', 'https://www.example.com/foo'), helper_getMinThrottler(), cookieJar: new CookieJar(), ); expect($logger->messages)->toHaveCount(3) ->and($logger->messages[0]['message'])->toBe('hook 1 called') ->and($logger->messages[1]['message'])->toBe('hook 2 called') ->and($logger->messages[2]['message'])->toBe('hook 3 called'); $helper->navigateToPageAndGetRespondedRequest( new Request('GET', 'https://www.example.com/foo'), helper_getMinThrottler(), cookieJar: new CookieJar(), ); expect($logger->messages)->toHaveCount(3); }); it( 'passes the script source provided via the setPageInitScript() method, to the ' . 'ProcessAwareBrowser::setPagePreScript() method', function () { $script = 'console.log(\'hey\');'; $browserFactoryMock = helper_setUpHeadlessChromeMocks( browserMockCallback: function (Mockery\MockInterface $browser) use ($script) { $browser ->shouldReceive('setPagePreScript') ->once() ->with($script); }, ); $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock); $helper->setPageInitScript($script); $helper->navigateToPageAndGetRespondedRequest( new Request('GET', 'https://www.example.com/bar'), helper_getMinThrottler(), cookieJar: new CookieJar(), ); }, ); it('does not call the ProcessAwareBrowser::setPagePreScript() when no page init script was defined', function () { $browserFactoryMock = helper_setUpHeadlessChromeMocks( browserMockCallback: function (Mockery\MockInterface $browser) { $browser->shouldNotReceive('setPagePreScript'); }, ); $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock); $helper->navigateToPageAndGetRespondedRequest( new Request('GET', 'https://www.example.com/bar'), helper_getMinThrottler(), cookieJar: new CookieJar(), ); }); it( 'passes the userAgent option when Request contains a user-agent header and useNativeUserAgent() was not called', 
function () { $browserFactoryMock = helper_setUpHeadlessChromeMocks( createBrowserArgsExpectationCallback: function ($options) { return array_key_exists('userAgent', $options) && $options['userAgent'] === 'MyBot'; }, ); $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock); $response = $helper->navigateToPageAndGetRespondedRequest( new Request('GET', 'https://www.example.com/bar', ['user-agent' => ['MyBot']]), helper_getMinThrottler(), cookieJar: new CookieJar(), ); expect(Http::getBodyString($response))->toBe('Hello World!'); }, ); it( 'does not pass the userAgent option when Request contains a user-agent header and useNativeUserAgent() was called', function () { $browserFactoryMock = helper_setUpHeadlessChromeMocks( createBrowserArgsExpectationCallback: function ($options) { return !array_key_exists('userAgent', $options); }, ); $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock); $helper->useNativeUserAgent(); $response = $helper->navigateToPageAndGetRespondedRequest( new Request('GET', 'https://www.example.com/bar', ['user-agent' => ['MyBot']]), helper_getMinThrottler(), cookieJar: new CookieJar(), ); expect(Http::getBodyString($response))->toBe('Hello World!'); }, ); it('clears the browsers cookies when no cookie jar is provided', function () { $browserFactoryMock = helper_setUpHeadlessChromeMocks( pageSessionMockCallback: function (Mockery\MockInterface $mock) { $mock ->shouldReceive('sendMessageSync') ->once() ->withArgs(function (Message $message) { return $message->getMethod() === 'Network.clearBrowserCookies'; }); }, ); $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock); $response = $helper->navigateToPageAndGetRespondedRequest( new Request('GET', 'https://www.example.com/yolo', ['user-agent' => ['MyBot']]), helper_getMinThrottler(), ); expect(Http::getBodyString($response))->toBe('Hello World!'); }); it('reuses a previously opened page', function () { $browserFactoryMock = helper_setUpHeadlessChromeMocks( pageMockCallback: 
function (Mockery\MockInterface $pageMock) { $pageMock->shouldReceive('assertNotClosed')->twice(); }, ); $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock); $t = helper_getMinThrottler(); $c = new CookieJar(); $helper->navigateToPageAndGetRespondedRequest(new Request('GET', 'https://www.example.com/foo'), $t, null, $c); $helper->navigateToPageAndGetRespondedRequest(new Request('GET', 'https://www.example.com/bar'), $t, null, $c); $helper->navigateToPageAndGetRespondedRequest(new Request('GET', 'https://www.example.com/baz'), $t, null, $c); }); ================================================ FILE: tests/Loader/Http/HttpLoaderPolitenessTest.php ================================================ shouldReceive('sendRequest')->once()->andReturnUsing(function (RequestInterface $request) { $response = new Response(200, [], $request->getUri()->__toString() . ' response'); helper_wait300ms(); return $response; }); $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response(200)); $loader = new HttpLoader(new UserAgent('SomeUserAgent'), $httpClient); $loader->{$loadingMethod}('https://www.example.com/foo'); $firstResponse = microtime(true); $loader->{$loadingMethod}('https://www.example.com/bar'); $secondResponse = microtime(true); $diff = $secondResponse - $firstResponse; expect($diff)->toBeGreaterThan(0.3) ->and($diff)->toBeLessThan(0.62); })->with(['load', 'loadOrFail']); it('also throttles requests using the headless browser', function ($loadingMethod) { $browserMock = Mockery::mock(Browser::class); $pageMock = Mockery::mock(Page::class); $sessionMock = Mockery::mock(Session::class); $sessionMock->shouldReceive('once'); $pageMock->shouldReceive('assertNotClosed')->once(); $pageMock->shouldReceive('getSession')->andReturn($sessionMock); $pageNavigationMock = Mockery::mock(PageNavigation::class); $pageNavigationMock->shouldReceive('waitForNavigation'); $pageMock ->shouldReceive('navigate') ->once() ->andReturnUsing(function (string $url) use 
($pageNavigationMock) { helper_wait300ms(); return $pageNavigationMock; }); $pageMock->shouldReceive('getCookies')->andReturn(new CookiesCollection()); $pageMock->shouldReceive('getHtml')->andReturn('foo'); $browserMock->shouldReceive('createPage')->andReturn($pageMock); $browserHelperMock = Mockery::mock(HeadlessBrowserLoaderHelper::class)->makePartial(); $browserHelperMock ->shouldAllowMockingProtectedMethods() ->shouldReceive('getBrowser') ->andReturn($browserMock); $loader = new HttpLoader(new UserAgent('SomeUserAgent')); invade($loader)->browserHelper = $browserHelperMock; $loader->useHeadlessBrowser(); $loader->{$loadingMethod}('https://www.example.com/foo'); $pageMock->shouldReceive('navigate')->andReturn($pageNavigationMock); $pageMock->shouldReceive('getCookies')->andReturn(new CookiesCollection()); $firstResponse = microtime(true); $loader->{$loadingMethod}('https://www.example.com/bar'); $secondResponse = microtime(true); $diff = $secondResponse - $firstResponse; expect($diff)->toBeGreaterThan(0.3) ->and($diff)->toBeLessThan(0.62); })->with(['load', 'loadOrFail']); it('does not throttle requests to different domains', function ($loadingMethod) { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest')->once()->andReturnUsing(function (RequestInterface $request) { $response = new Response(200, [], $request->getUri()->__toString() . 
' response'); helper_wait300ms(); return $response; }); $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response(200)); $loader = new HttpLoader(new UserAgent('SomeUserAgent'), $httpClient); $loader->{$loadingMethod}('https://www.example.com/foo'); $firstResponse = microtime(true); $loader->{$loadingMethod}('https://www.example.org/bar'); $secondResponse = microtime(true); $diff = $secondResponse - $firstResponse; expect($diff)->toBeLessThan(0.001); })->with(['load', 'loadOrFail']); it('respects rules from robots.txt from load method', function () { $client = Mockery::mock(ClientInterface::class); $client->shouldReceive('sendRequest')->once()->andReturn(helper_getDummyRobotsTxtResponse()); $loader = new HttpLoader(new BotUserAgent('FooBot'), $client); $response = $loader->load('https://www.crwlr.software/secret'); expect($response)->toBeNull(); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('Loaded https://www.crwlr.software/robots.txt'); expect($output)->toContain('Crawler is not allowed to load https://www.crwlr.software/secret'); }); it('respects rules from robots.txt from loadOrFail method', function () { $client = Mockery::mock(ClientInterface::class); $client->shouldReceive('sendRequest')->once()->andReturn(helper_getDummyRobotsTxtResponse()); $loader = new HttpLoader(new BotUserAgent('FooBot'), $client); $loader->loadOrFail('https://www.crwlr.software/secret'); })->throws(LoadingException::class); it('does not respect rules from robots.txt when user agent isn\'t instance of BotUserAgent', function () { $client = Mockery::mock(ClientInterface::class); $client->shouldReceive('sendRequest')->once()->andReturn(helper_getDummyRobotsTxtResponse()); $loader = new HttpLoader(new UserAgent('FooBot'), $client); $response = $loader->load('https://www.crwlr.software/secret'); expect($response)->toBeInstanceOf(RespondedRequest::class); $output = $this->getActualOutputForAssertion(); expect($output)->not()->toContain('Loaded 
https://www.crwlr.software/robots.txt'); expect($output)->not()->toContain('Crawler is not allowed to load https://www.crwlr.software/secret'); }); ================================================ FILE: tests/Loader/Http/HttpLoaderTest.php ================================================ shouldReceive('sendRequest')->twice()->andReturn(new Response()); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $httpLoader->load('https://www.crwlr.software'); $httpLoader->loadOrFail('https://www.crwlr.software'); }); it('fails and logs an error when invoked with a relative reference URI', function () { $logger = new DummyLogger(); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger); $httpLoader->load('/foo'); expect($logger->messages)->not->toBeEmpty() ->and($logger->messages[0]['message'])->toBe( 'Invalid input URL: /foo - The URI is a relative reference and therefore can\'t be loaded.', ); }); it('fails and throws an exception when loadOrFail() is called with a relative reference URI', function () { $logger = new DummyLogger(); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger); $httpLoader->loadOrFail('/foo'); })->throws(InvalidArgumentException::class); it('accepts RequestInterface as argument to load', function () { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest')->twice()->andReturn(new Response()); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $httpLoader->load(new Request('GET', 'https://www.crwlr.software')); $httpLoader->loadOrFail(new Request('GET', 'https://www.crwlr.software')); }); it('fails and logs an error when invoked with a RequestInterface object having a relative reference URI', function () { $logger = new DummyLogger(); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger); $httpLoader->load(new Request('GET', '/foo')); expect($logger->messages)->not->toBeEmpty() ->and($logger->messages[0]['message'])->toBe( 
'Invalid input URL: /foo - The URI is a relative reference and therefore can\'t be loaded.', ); }); it( 'fails and throws an exception when loadOrFail() is called with a RequestInterface object having a relative ' . 'reference URI', function () { $logger = new DummyLogger(); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger); $httpLoader->loadOrFail(new Request('GET', '/foo')); }, )->throws(InvalidArgumentException::class); it( 'calls the before and after load hooks regardless whether the response was successful or not', function ($responseStatusCode) { $httpClient = Mockery::mock(ClientInterface::class); if ($responseStatusCode === 300) { $httpClient->shouldReceive('sendRequest') ->twice() ->andReturn(new Response($responseStatusCode), new Response(200)); } else { $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response($responseStatusCode)); } $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $beforeLoadWasCalled = false; $httpLoader->beforeLoad(function () use (&$beforeLoadWasCalled) { $beforeLoadWasCalled = true; }); $afterLoadWasCalled = false; $httpLoader->afterLoad(function () use (&$afterLoadWasCalled) { $afterLoadWasCalled = true; }); $httpLoader->load('https://www.otsch.codes'); expect($beforeLoadWasCalled)->toBeTrue() ->and($afterLoadWasCalled)->toBeTrue(); }, )->with([ [100], [200], [300], [400], [500], ]); it('calls the onSuccess hook on a successful response', function ($responseStatusCode) { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest')->twice()->andReturn(new Response($responseStatusCode)); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $onSuccessWasCalled = false; $httpLoader->onSuccess(function () use (&$onSuccessWasCalled) { $onSuccessWasCalled = true; }); $httpLoader->load('https://www.otsch.codes'); expect($onSuccessWasCalled)->toBeTrue(); $onSuccessWasCalled = false; $httpLoader->loadOrFail('https://www.otsch.codes'); 
expect($onSuccessWasCalled)->toBeTrue(); })->with([ [200], [201], [202], ]); it('calls the onError hook on a failed request', function ($responseStatusCode) { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response($responseStatusCode)); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $onErrorWasCalled = false; $httpLoader->onError(function () use (&$onErrorWasCalled) { $onErrorWasCalled = true; }); $httpLoader->load('https://www.otsch.codes'); expect($onErrorWasCalled)->toBeTrue(); })->with([ [400], [404], [422], [500], ]); it('calls the onCacheHit hook when a response for the request was found in the cache', function (string $loadMethod) { $cache = new FileCache(helper_cachedir()); $userAgent = helper_nonBotUserAgent(); $respondedRequest = new RespondedRequest( new Request( 'GET', 'https://www.example.com/foo', ['Host' => ['www.example.com'], 'User-Agent' => [(string) $userAgent]], ), new Response(body: 'Hello World!'), ); $cache->set($respondedRequest->cacheKey(), $respondedRequest); $httpLoader = new HttpLoader($userAgent); $httpLoader->setCache($cache); $onCacheHitWasCalled = false; $httpLoader->onCacheHit(function () use (&$onCacheHitWasCalled) { $onCacheHitWasCalled = true; }); $response = $httpLoader->{$loadMethod}('https://www.example.com/foo'); /** @var RespondedRequest $response */ expect($onCacheHitWasCalled)->toBeTrue() ->and($response->isServedFromCache())->toBeTrue(); })->with(['load', 'loadOrFail']); it('throws an Exception when request fails in loadOrFail method', function () { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response(400)); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $onErrorWasCalled = false; $httpLoader->onError(function () use (&$onErrorWasCalled) { $onErrorWasCalled = true; }); try { $httpLoader->loadOrFail('https://www.otsch.codes'); } catch 
(LoadingException $exception) { expect($exception)->toBeInstanceOf(LoadingException::class); } expect($onErrorWasCalled)->toBeFalse(); }); test('You can implement logic to disallow certain request', function () { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response()); $httpLoader = new class (new BotUserAgent('Foo'), $httpClient) extends HttpLoader { public function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException = false): bool { return $uri->__toString() === 'https://www.example.com/foo'; } }; $response = $httpLoader->load('https://www.example.com/foo'); expect($response)->toBeInstanceOf(RespondedRequest::class); $response = $httpLoader->load('https://www.example.com/bar'); expect($response)->toBeNull(); }); test( 'The isAllowedToBeLoaded method is called with argument throwsException true when called from loadOrFail', function () { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response()); $httpLoader = new class (new BotUserAgent('Foo'), $httpClient) extends HttpLoader { public function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException = false): bool { if ($throwsException) { throw new LoadingException('Fail to load ' . 
$uri->__toString()); } return $uri->__toString() === 'https://www.example.com'; } }; $httpLoader->load('https://www.example.com'); try { $httpLoader->loadOrFail('https://www.example.com'); } catch (LoadingException $exception) { expect($exception)->toBeInstanceOf(LoadingException::class); } }, ); it('automatically handles redirects', function (string $loadingMethod) { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest') ->twice() ->andReturn( new Response(301, ['Location' => 'https://www.redirect.com']), new Response(200, [], 'YES'), ); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $respondedRequest = $httpLoader->{$loadingMethod}('https://www.crwlr.software/packages'); /** @var RespondedRequest $respondedRequest */ expect($respondedRequest->requestedUri())->toBe('https://www.crwlr.software/packages') ->and($respondedRequest->effectiveUri())->toBe('https://www.redirect.com') ->and($respondedRequest->response->getBody()->getContents())->toBe('YES'); })->with(['load', 'loadOrFail']); it('calls request start and end tracking methods', function (string $loadingMethod) { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response(200)); $throttler = new class extends Throttler { public function trackRequestStartFor(UriInterface $url): void { echo 'Track request start ' . $url . PHP_EOL; parent::trackRequestStartFor($url); } public function trackRequestEndFor(UriInterface $url): void { echo 'Track request end ' . $url . 
PHP_EOL; parent::trackRequestEndFor($url); } }; $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient, throttler: $throttler); $httpLoader->{$loadingMethod}('https://www.twitter.com'); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('Track request start https://www.twitter.com') ->and($output)->toContain('Track request end https://www.twitter.com'); })->with(['load', 'loadOrFail']); it( 'calls trackRequestEndFor only once and with the original request URL when there is a redirect', function (string $loadingMethod) { $httpClient = Mockery::mock(ClientInterface::class); $httpClient ->shouldReceive('sendRequest') ->once() ->withArgs(function (Request $request) { return (string) $request->getUri() === 'https://www.example.com/foo'; }) ->andReturn(new Response(301, ['Location' => 'https://www.example.com/bar'])); $httpClient ->shouldReceive('sendRequest') ->once() ->withArgs(function (Request $request) { return (string) $request->getUri() === 'https://www.example.com/bar'; }) ->andReturn(new Response(200)); $throttler = new class extends Throttler { public function trackRequestEndFor(UriInterface $url): void { echo 'Track request end ' . $url . 
PHP_EOL; parent::trackRequestEndFor($url); } }; $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient, throttler: $throttler); $httpLoader->{$loadingMethod}('https://www.example.com/foo'); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('Track request end https://www.example.com/foo') ->and(count(explode('Track request end', $output)))->toBe(2); }, )->with(['load', 'loadOrFail']); it('automatically logs loading success message', function ($loadingMethod) { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response()); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $httpLoader->{$loadingMethod}(new Request('GET', 'https://phpstan.org/')); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('Loaded https://phpstan.org/'); })->with(['load', 'loadOrFail']); it('automatically logs loading error message in normal load method', function () { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response(500)); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $httpLoader->load(new Request('GET', 'https://phpstan.org/')); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('Failed to load https://phpstan.org/'); }); it('automatically adds the User-Agent header before sending', function () { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldReceive('sendRequest') ->once() ->withArgs(function ($request) { return str_contains($request->getHeaderLine('User-Agent'), 'FooBot'); }) ->andReturn(new Response()); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $httpLoader->load('https://www.facebook.com'); }); it('tries to get responses from cache', function () { $httpClient = Mockery::mock(ClientInterface::class); $httpClient->shouldNotReceive('sendRequest'); $cache = 
Mockery::mock(CacheInterface::class); $cache->shouldReceive('has')->once()->andReturn(true); $cache->shouldReceive('get') ->once() ->andReturn(new RespondedRequest(new Request('GET', '/'), new Response())); $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient); $httpLoader->setCache($cache); $httpLoader->load('https://www.facebook.com'); }); test( 'when a response is served from cache, the RespondedRequest::isServedFromCache() method returns true,', function (string $loadMethod) { $cache = new FileCache(helper_cachedir()); $userAgent = helper_nonBotUserAgent(); $respondedRequest = new RespondedRequest( new Request( 'GET', 'https://www.example.com/bar', ['Host' => ['www.example.com'], 'User-Agent' => [(string) $userAgent]], ), new Response(body: 'Hi!'), ); $cache->set($respondedRequest->cacheKey(), $respondedRequest); $clientMock = Mockery::mock(Client::class); $clientMock ->shouldReceive('sendRequest') ->once() ->withArgs(function (Request $request) { return (string) $request->getUri() === 'https://www.example.com/foo'; }) ->andReturn(new Response(body: 'Hi!')); $httpLoader = (new HttpLoader($userAgent, $clientMock))->setCache($cache); $response = $httpLoader->{$loadMethod}('https://www.example.com/foo'); /** @var RespondedRequest $response */ expect($response->isServedFromCache())->toBeFalse(); $response = $httpLoader->{$loadMethod}('https://www.example.com/bar'); /** @var RespondedRequest $response */ expect($response->isServedFromCache())->toBeTrue(); }, )->with(['load', 'loadOrFail']); it( 'does not serve a request from the cache, when skipCacheForNextRequest() was called', function (string $loadMethod) { $cache = new FileCache(helper_cachedir()); $userAgent = helper_nonBotUserAgent(); $respondedRequest = new RespondedRequest( new Request( 'GET', 'https://www.example.com/blog/posts', ['Host' => ['www.example.com'], 'User-Agent' => [(string) $userAgent]], ), new Response(body: 'previously cached blog posts'), ); 
$cache->set($respondedRequest->cacheKey(), $respondedRequest);

        $clientMock = Mockery::mock(Client::class);

        // Exactly one real request is allowed: the one made while the cache is skipped.
        $clientMock
            ->shouldReceive('sendRequest')
            ->once()
            ->withArgs(function (Request $request) {
                return (string) $request->getUri() === 'https://www.example.com/blog/posts';
            })
            ->andReturn(new Response(body: 'loaded blog posts'));

        $httpLoader = (new HttpLoader($userAgent, $clientMock))
            ->setCache($cache)
            ->skipCacheForNextRequest();

        $response = $httpLoader->{$loadMethod}('https://www.example.com/blog/posts');

        /** @var RespondedRequest $response */

        expect($response->isServedFromCache())->toBeFalse()
            ->and(Http::getBodyString($response))->toBe('loaded blog posts');

        // Skipping the cache is only effective for loading. It still adds the loaded response to the cache.
        // So on the next request, when not again calling the skip cache method, the cache will return that
        // previously loaded response.
        $response = $httpLoader->{$loadMethod}('https://www.example.com/blog/posts');

        expect($response->isServedFromCache())->toBeTrue()
            ->and(Http::getBodyString($response))->toBe('loaded blog posts');
    },
)->with(['load', 'loadOrFail']);

// The cache returns a plain array instead of a RespondedRequest instance; the loader is expected to
// turn it into a RespondedRequest without contacting the HTTP client.
it('still handles legacy (until v0.7) cached responses', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    $httpClient->shouldNotReceive('sendRequest');

    $cache = Mockery::mock(CacheInterface::class);

    $cache->shouldReceive('has')->once()->andReturn(true);

    $cache->shouldReceive('get')
        ->once()
        ->andReturn([
            'requestMethod' => 'GET',
            'requestUri' => 'https://www.example.com/index',
            'requestHeaders' => ['foo' => ['bar']],
            'requestBody' => 'requestbody',
            'effectiveUri' => 'https://www.example.com/home',
            'responseStatusCode' => 201,
            'responseHeaders' => ['baz' => ['quz']],
            'responseBody' => 'responsebody',
        ]);

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

    $httpLoader->setCache($cache);

    $respondedRequest = $httpLoader->load('https://www.example.com/index');
expect($respondedRequest)->toBeInstanceOf(RespondedRequest::class)
        ->and($respondedRequest?->request->getMethod())->toBe('GET')
        ->and($respondedRequest?->requestedUri())->toBe('https://www.example.com/index')
        ->and($respondedRequest?->request->getHeaders())->toHaveKey('foo')
        ->and($respondedRequest?->request->getBody()->getContents())->toBe('requestbody')
        ->and($respondedRequest?->effectiveUri())->toBe('https://www.example.com/home')
        ->and($respondedRequest?->response->getStatusCode())->toBe(201)
        ->and($respondedRequest?->response->getHeaders())->toHaveKey('baz')
        ->and($respondedRequest?->response->getBody()->getContents())->toBe('responsebody');
});

// A cached 404 must trigger the loader's onError callback when loaded via load().
it('fails when it gets a failed response from cache', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    $cache = Mockery::mock(CacheInterface::class);

    $cache->shouldReceive('has')->once()->andReturn(true);

    $cache->shouldReceive('get')
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', '/'), new Response(404)));

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

    $httpLoader->setCache($cache);

    // Flag flipped by reference from inside the callback so the test can observe the call.
    $onErrorWasCalled = false;

    $httpLoader->onError(function () use (&$onErrorWasCalled) {
        $onErrorWasCalled = true;
    });

    $httpLoader->load('https://www.facebook.com');

    expect($onErrorWasCalled)->toBeTrue();
});

// A cached 404 must make loadOrFail() throw a LoadingException.
it('fails when it gets a failed response from cache in loadOrFail', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    $cache = Mockery::mock(CacheInterface::class);

    $cache->shouldReceive('has')->once()->andReturn(true);

    $cache->shouldReceive('get')
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'facebook'), new Response(404)));

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

    $httpLoader->setCache($cache);

    $httpLoader->loadOrFail('https://www.facebook.com');
})->throws(LoadingException::class);

// On a cache miss the loaded response must be written to the cache exactly once.
it('adds loaded responses to the cache when it has a cache', function ($loadingMethod) {
    $httpClient = Mockery::mock(ClientInterface::class);
$httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response());

    $cache = Mockery::mock(CacheInterface::class);

    $cache->shouldReceive('has')->once()->andReturn(false);

    $cache->shouldReceive('set')->once();

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

    $httpLoader->setCache($cache);

    $httpLoader->{$loadingMethod}('https://laravel.com/');
})->with(['load', 'loadOrFail']);

// The only check here is the mock expectation: sendRequest must be called twice for the same URL,
// first answering 404 and then 200.
test(
    'when a cached response was an error response it retries to load it when retryCachedErrorResponses() was called',
    function (string $loadingMethod) {
        $httpClient = Mockery::mock(ClientInterface::class);

        $httpClient
            ->shouldReceive('sendRequest')
            ->twice()
            ->andReturn(new Response(404), new Response(200));

        $cache = new FileCache(helper_cachedir());

        $httpLoader = helper_getFastLoader(httpClient: $httpClient);

        $httpLoader->setCache($cache);

        $httpLoader->retryCachedErrorResponses();

        // Exceptions are deliberately swallowed. NOTE(review): presumably because loadOrFail() throws
        // for the 404 response; the test only cares about the number of sendRequest calls.
        try {
            $httpLoader->{$loadingMethod}('https://www.example.com/articles/123');
        } catch (Throwable $exception) {
        }

        try {
            $httpLoader->{$loadingMethod}('https://www.example.com/articles/123');
        } catch (Throwable $exception) {
        }
    },
)->with(['load', 'loadOrFail']);

// only([404, 503]): the first (404) result is expected to be loaded again, yielding the 400 response.
// The client mock allows only two requests, so the third load has to return the 400 without another request.
test('retrying cached error responses can be restricted to only certain response status codes', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    $httpClient
        ->shouldReceive('sendRequest')
        ->twice()
        ->andReturn(new Response(404), new Response(400));

    $cache = new FileCache(helper_cachedir());

    $httpLoader = helper_getFastLoader(httpClient: $httpClient);

    $httpLoader->setCache($cache);

    $httpLoader
        ->retryCachedErrorResponses()
        ->only([404, 503]);

    $respondedRequest = $httpLoader->load('https://www.example.com/foo');

    expect($respondedRequest?->response->getStatusCode())->toBe(404);

    $respondedRequest = $httpLoader->load('https://www.example.com/foo');

    expect($respondedRequest?->response->getStatusCode())->toBe(400);

    $respondedRequest = $httpLoader->load('https://www.example.com/foo');
expect($respondedRequest?->response->getStatusCode())->toBe(400);
});

// except([410, 500]): the first (404) result is expected to be loaded again, yielding the 500 response.
// The client mock allows only two requests, so the third load has to return the 500 without another request.
test('certain error status codes can be excluded from being retried', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    $httpClient
        ->shouldReceive('sendRequest')
        ->twice()
        ->andReturn(new Response(404), new Response(500));

    $cache = new FileCache(helper_cachedir());

    $httpLoader = helper_getFastLoader(httpClient: $httpClient);

    $httpLoader->setCache($cache);

    $httpLoader
        ->retryCachedErrorResponses()
        ->except([410, 500]);

    $respondedRequest = $httpLoader->load('https://www.example.com/foo');

    expect($respondedRequest?->response->getStatusCode())->toBe(404);

    $respondedRequest = $httpLoader->load('https://www.example.com/foo');

    expect($respondedRequest?->response->getStatusCode())->toBe(500);

    $respondedRequest = $httpLoader->load('https://www.example.com/foo');

    expect($respondedRequest?->response->getStatusCode())->toBe(500);
});

// The only check is the mock expectation: with writeOnlyCache() the same URL must reach the client twice.
it(
    'adds responses to the cache but doesn\'t try to get them from the cache, when writeOnlyCache() was called',
    function ($loadingMethod) {
        $httpClient = Mockery::mock(ClientInterface::class);

        $httpClient->shouldReceive('sendRequest')->twice()->andReturn(new Response());

        $cache = new FileCache(helper_cachedir());

        $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

        $httpLoader->setCache($cache);

        $httpLoader->writeOnlyCache();

        // Exceptions are deliberately ignored; only the number of sendRequest calls matters here.
        try {
            $httpLoader->{$loadingMethod}('https://www.example.com/articles/123');
        } catch (Throwable $exception) {
        }

        try {
            $httpLoader->{$loadingMethod}('https://www.example.com/articles/123');
        } catch (Throwable $exception) {
        }
    },
)->with(['load', 'loadOrFail']);

// With a urlPathStartsWith('/bar/') cache filter, only the /bar/ response may end up in the cache.
test(
    'When cache filters are defined via the cacheOnlyWhereUrl() method it caches only responses for matching URLs',
    function (string $loadingMethod) {
        $httpClient = Mockery::mock(ClientInterface::class);

        $httpClient
            ->shouldReceive('sendRequest')
            ->twice()
            ->andReturnUsing(function (Request $request) {
                return new Response(200, body: $request->getUri() .
' response');
            });

        $cache = new FileCache(helper_cachedir());

        $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

        $httpLoader->setCache($cache);

        $httpLoader->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'));

        // Path /foo/ does not match the filter: nothing may be stored under its cache key.
        $respondedRequest = $httpLoader->{$loadingMethod}('https://www.example.com/foo/something');

        expect($cache->get($respondedRequest->cacheKey()))->toBeNull();

        // Path /bar/ matches the filter: the response must be retrievable from the cache.
        $respondedRequest = $httpLoader->{$loadingMethod}('https://www.example.com/bar/something');

        expect($cache->get($respondedRequest->cacheKey()))->toBeInstanceOf(RespondedRequest::class);
    },
)->with(['load', 'loadOrFail']);

// Two filters (path prefix and host) are registered; only the URL satisfying both is cached.
test(
    'When multiple cache filters are defined via the cacheOnlyWhereUrl() method, all of them are used',
    function (string $loadingMethod) {
        $httpClient = Mockery::mock(ClientInterface::class);

        $httpClient
            ->shouldReceive('sendRequest')
            ->times(3)
            ->andReturnUsing(function (Request $request) {
                return new Response(200, body: $request->getUri() . ' response');
            });

        $cache = new FileCache(helper_cachedir());

        $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

        $httpLoader->setCache($cache);

        $httpLoader
            ->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'))
            ->cacheOnlyWhereUrl(Filter::urlHost('www.example.com'));

        // Right host, wrong path.
        $respondedRequest = $httpLoader->{$loadingMethod}('https://www.example.com/foo/something');

        expect($cache->get($respondedRequest->cacheKey()))->toBeNull();

        // Right path, wrong host.
        $respondedRequest = $httpLoader->{$loadingMethod}('https://www.crwlr.software/bar/something');

        expect($cache->get($respondedRequest->cacheKey()))->toBeNull();

        // Host and path both match.
        $respondedRequest = $httpLoader->{$loadingMethod}('https://www.example.com/bar/something');

        expect($cache->get($respondedRequest->cacheKey()))->toBeInstanceOf(RespondedRequest::class);
    },
)->with(['load', 'loadOrFail']);

// The client mock set up below redirects /foo/ -> /bar/ -> /baz/.
test(
    'when a request was redirected, only one of the URLs has to match the filters defined via cacheOnlyWhereUrl()',
    function (string $loadingMethod) {
        $httpClient = Mockery::mock(ClientInterface::class);

        $httpClient
->shouldReceive('sendRequest')
            ->andReturnUsing(function (Request $request) {
                $url = (string) $request->getUri();

                $redirectUrl = null;

                // Redirect chain served by the mock: /foo/something -> /bar/something -> /baz/something.
                if ($url === 'https://www.example.com/foo/something') {
                    $redirectUrl = 'https://www.example.com/bar/something';
                } elseif ($url === 'https://www.example.com/bar/something') {
                    $redirectUrl = 'https://www.example.com/baz/something';
                }

                if ($redirectUrl) {
                    return new Response(301, ['Location' => $redirectUrl]);
                }

                return new Response(200, body: $request->getUri() . ' response');
            });

        $cache = new FileCache(helper_cachedir());

        $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

        $httpLoader->setCache($cache);

        $httpLoader->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'));

        // Starts at /foo/ but passes through /bar/ in the redirect chain: expected to be cached.
        $respondedRequest = $httpLoader->{$loadingMethod}('https://www.example.com/foo/something');

        expect($cache->get($respondedRequest->cacheKey()))->toBeInstanceOf(RespondedRequest::class);

        $cache->clear();

        // Starts at /bar/: expected to be cached.
        $respondedRequest = $httpLoader->{$loadingMethod}('https://www.example.com/bar/something');

        expect($cache->get($respondedRequest->cacheKey()))->toBeInstanceOf(RespondedRequest::class);

        $cache->clear();

        // /baz/ is not redirected by the mock and does not match the filter: must not be cached.
        $respondedRequest = $httpLoader->{$loadingMethod}('https://www.example.com/baz/something');

        expect($cache->get($respondedRequest->cacheKey()))->toBeNull();
    },
)->with(['load', 'loadOrFail']);

// Both URLs are put into the cache below, but the client mock expects exactly one real request.
it('uses the cache only for requests that meet the filter criteria', function (string $loadingMethod) {
    $httpClient = Mockery::mock(ClientInterface::class);

    $httpClient
        ->shouldReceive('sendRequest')
        ->once()
        ->andReturnUsing(function (Request $request) {
            return new Response(200, body: $request->getUri() .
' response');
        });

    $userAgent = helper_nonBotUserAgent();

    $cache = new FileCache(helper_cachedir());

    // Pre-populate the cache with responses for both /foo/test and /bar/test.
    $cachedResponse = new RespondedRequest(
        new Request('GET', 'https://www.example.com/foo/test', headers: ['User-Agent' => $userAgent->__toString()]),
        new Response(),
    );

    $cache->set($cachedResponse->cacheKey(), $cachedResponse);

    $cachedResponse = new RespondedRequest(
        new Request('GET', 'https://www.example.com/bar/test', headers: ['User-Agent' => $userAgent->__toString()]),
        new Response(),
    );

    $cache->set($cachedResponse->cacheKey(), $cachedResponse);

    $httpLoader = new HttpLoader($userAgent, $httpClient);

    $httpLoader->setCache($cache);

    $httpLoader->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'));

    // The once() expectation on sendRequest is the assertion: of these two loads only one may
    // reach the client.
    $httpLoader->{$loadingMethod}('https://www.example.com/foo/test');

    $httpLoader->{$loadingMethod}('https://www.example.com/bar/test');
})->with(['load', 'loadOrFail']);

// After addToCache() with a RespondedRequestChild, loading the same URL again must return that
// child instance; the client mock allows only the first, real request.
it('updates an existing cached response', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    $httpClient
        ->shouldReceive('sendRequest')
        ->once()
        ->andReturn(new Response(body: 'hello'));

    $cache = new FileCache(helper_cachedir());

    $cache->clear();

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

    $httpLoader->setCache($cache);

    $response = $httpLoader->load('https://www.example.com/idontknow');

    if (!$response) {
        throw new Exception('failed to get response');
    }

    $extendedResponse = RespondedRequestChild::fromRespondedRequest($response);

    $httpLoader->addToCache($extendedResponse);

    $response = $httpLoader->load('https://www.example.com/idontknow');

    /** @var RespondedRequestChild $response */

    expect($response)->toBeInstanceOf(RespondedRequestChild::class)
        ->and($response->itseme())->toBe('mario');
});

// The cached response created below carries a Set-Cookie header; the client must never be called.
it('does not add cookies to the cookie jar when a response was served from the cache', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    $httpClient->shouldNotReceive('sendRequest');

    $cache = new FileCache(helper_cachedir());

    $httpLoader = new
HttpLoader(helper_nonBotUserAgent(), $httpClient);

    $httpLoader->setCache($cache);

    $respondedRequest = new RespondedRequest(
        new Request(
            'GET',
            'https://www.example.com/wtf',
            ['Host' => ['www.example.com'], 'User-Agent' => [(string) helper_nonBotUserAgent()]],
        ),
        new Response(headers: ['Set-Cookie' => 'foo=bar'], body: 'Wtf!'),
    );

    $cache->set($respondedRequest->cacheKey(), $respondedRequest);

    $httpLoader->load('https://www.example.com/wtf');

    // invade() is used to read the loader's cookieJar property for inspection.
    $cookieJar = invade($httpLoader)->cookieJar;

    /** @var CookieJar $cookieJar */

    $cookies = $cookieJar->allByDomain('example.com');

    expect($cookies)->toHaveCount(0);
});

// Each mock expectation matches one URL together with the exact Cookie header the request must
// carry, i.e. the cookies set by the responses before it.
test('By default it uses the cookie jar and passes on cookies', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    $httpClient->shouldReceive('sendRequest')->withArgs(function (RequestInterface $request) {
        return $request->getUri()->__toString() === 'https://www.crwlr.software/';
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo']]));

    $httpClient->shouldReceive('sendRequest')->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/blog' &&
            $cookiesHeader === ['cookie1=foo'];
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo', 'cookie2=bar']]));

    $httpClient->shouldReceive('sendRequest')->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/contact' &&
            $cookiesHeader === ['cookie1=foo', 'cookie2=bar'];
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo2', 'cookie2=bar2', 'cookie3=baz']]));

    $httpClient->shouldReceive('sendRequest')->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/packages' &&
            $cookiesHeader === ['cookie1=foo2', 'cookie2=bar2', 'cookie3=baz'];
})->andReturn(new Response());

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

    $httpLoader->load('https://www.crwlr.software/');

    $httpLoader->load('https://www.crwlr.software/blog');

    $httpLoader->loadOrFail('https://www.crwlr.software/contact');

    $httpLoader->loadOrFail('https://www.crwlr.software/packages');

    expect(true)->toBeTrue(); // Just here so pest doesn't complain that there is no assertion.
});

// Same request sequence as in the previous test, but after dontUseCookies() every request after the
// first must arrive with an empty Cookie header, even though the responses keep setting cookies.
test('You can turn off using the cookie jar', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    $httpClient->shouldReceive('sendRequest')->withArgs(function (RequestInterface $request) {
        return $request->getUri()->__toString() === 'https://www.crwlr.software/';
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo']]));

    $httpClient->shouldReceive('sendRequest')->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/blog' && $cookiesHeader === [];
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo', 'cookie2=bar']]));

    $httpClient->shouldReceive('sendRequest')->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/contact' && $cookiesHeader === [];
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo2', 'cookie2=bar2', 'cookie3=baz']]));

    $httpClient->shouldReceive('sendRequest')->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/packages' && $cookiesHeader === [];
    })->andReturn(new Response());

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

    $httpLoader->dontUseCookies();

    $httpLoader->load('https://www.crwlr.software/');

    $httpLoader->load('https://www.crwlr.software/blog');

    $httpLoader->loadOrFail('https://www.crwlr.software/contact');
$httpLoader->loadOrFail('https://www.crwlr.software/packages');

    expect(true)->toBeTrue(); // Just here so pest doesn't complain that there is no assertion.
});

================================================
FILE: tests/Loader/Http/Messages/RespondedRequestTest.php
================================================
toBeInstanceOf(RespondedRequest::class); });

// Every status code in the dataset must make the constructor register one redirect.
test('creating with a redirect response adds a redirect uri.', function ($statusCode) {
    $request = new Request('GET', '/');

    $response = new Response($statusCode);

    $respondedRequest = new RespondedRequest($request, $response);

    expect($respondedRequest->redirects())->toHaveCount(1);
})->with([300, 301, 302, 303, 304, 305, 307, 308]);

// Status codes outside the redirect set must leave the redirects list empty.
test('creating with non redirect responses doesn\'t add a redirect uri.', function ($statusCode) {
    $request = new Request('GET', '/');

    $response = new Response($statusCode);

    $respondedRequest = new RespondedRequest($request, $response);

    expect($respondedRequest->redirects())->toHaveCount(0);
})->with([101, 200, 404, 500]);

test('isRedirect returns false when the response is not a redirect', function () {
    $request = new Request('GET', '/');

    $response = new Response(200);

    $respondedRequest = new RespondedRequest($request, $response);

    expect($respondedRequest->isRedirect())->toBeFalse();
});

test('isRedirect returns true when the response is a redirect', function () {
    $request = new Request('GET', '/');

    $response = new Response(301);

    $respondedRequest = new RespondedRequest($request, $response);

    expect($respondedRequest->isRedirect())->toBeTrue();
});

// isRedirect() must reflect the response set last via setResponse(), not the initial one.
test('isRedirect returns true when the last response is a redirect', function () {
    $request = new Request('GET', '/');

    $response = new Response(301);

    $respondedRequest = new RespondedRequest($request, $response);

    $respondedRequest->setResponse(new Response(302));

    expect($respondedRequest->isRedirect())->toBeTrue();
});

test('isRedirect returns false when the last response is not a redirect', function () {
    $request = new Request('GET',
'/');

    $response = new Response(301);

    $respondedRequest = new RespondedRequest($request, $response);

    $respondedRequest->setResponse(new Response(200));

    expect($respondedRequest->isRedirect())->toBeFalse();
});

// requestedUri() keeps reporting the original request URI after a redirect.
test('the requested uri remains the same when the request was redirected.', function () {
    $request = new Request('GET', '/request-uri');

    $response = new Response(301, ['Location' => '/redirect-uri']);

    $respondedRequest = new RespondedRequest($request, $response);

    $respondedRequest->setResponse(new Response(200));

    expect($respondedRequest->requestedUri())->toBe('/request-uri');
});

test('when request was not redirected the effective uri equals the requested uri', function () {
    $request = new Request('GET', '/request-uri');

    $response = new Response(200);

    $respondedRequest = new RespondedRequest($request, $response);

    expect($respondedRequest->effectiveUri())->toBe('/request-uri');
});

// effectiveUri() reports the Location target of the 301 response.
test('when request was redirected the effective uri is the redirect uri', function () {
    $request = new Request('GET', '/request-uri');

    $response = new Response(301, ['Location' => '/redirect-uri']);

    $respondedRequest = new RespondedRequest($request, $response);

    $respondedRequest->setResponse(new Response(200));

    expect($respondedRequest->effectiveUri())->toBe('/redirect-uri');
});

// The chain redirects back to /request-uri once; allUris() must list that URI only a single time.
test('the allUris() method returns all unique URIs', function () {
    $request = new Request('GET', '/request-uri');

    $response = new Response(301, ['Location' => '/redirect-uri']);

    $respondedRequest = new RespondedRequest($request, $response);

    $respondedRequest->setResponse(new Response(301, ['Location' => '/request-uri']));

    $respondedRequest->setResponse(new Response(301, ['Location' => '/another-redirect-uri']));

    $respondedRequest->setResponse(new Response(200));

    expect($respondedRequest->allUris())->toBe([
        '/request-uri',
        '/redirect-uri',
        '/another-redirect-uri',
    ]);
});

// Pins the exact serialized representation; the expected string follows further down.
it('can be serialized', function () {
    $respondedRequest = new RespondedRequest(
        new Request('POST', '/home', ['key' => 'val'], 'bod'),
        new
Response(201, ['k' => 'v'], 'res'),
        [new Screenshot('/path/to/screenshot.png'), new Screenshot('/another/path/to/screenshot.webp')],
    );

    $respondedRequest->addRedirectUri('/index');

    $serialized = serialize($respondedRequest);

    // Nine entries are expected: method, URI, headers and body of the request, the effective URI,
    // status code, headers and body of the response, and the screenshots as plain path strings.
    expect($serialized)->toBe(
        'O:51:"Crwlr\\Crawler\\Loader\\Http\\Messages\\RespondedRequest":9:{s:13:"requestMethod";s:4:"POST";s:10:' .
        '"requestUri";s:5:"/home";s:14:"requestHeaders";a:1:{s:3:"key";a:1:{i:0;s:3:"val";}}s:11:"requestBody";' .
        's:3:"bod";s:12:"effectiveUri";s:6:"/index";s:18:"responseStatusCode";i:201;s:15:"responseHeaders";a:1:{' .
        's:1:"k";a:1:{i:0;s:1:"v";}}s:12:"responseBody";s:3:"res";s:11:"screenshots";a:2:{i:0;' .
        's:23:"/path/to/screenshot.png";i:1;s:32:"/another/path/to/screenshot.webp";}}',
    );
});

// Backwards compatibility: a payload with only eight entries (no "screenshots") must still unserialize.
test('an old serialized instance without screenshots array can be unserialized', function () {
    $serialized = 'O:51:"Crwlr\Crawler\Loader\Http\Messages\RespondedRequest":8:{s:13:"requestMethod";s:4:"POST";' .
        's:10:"requestUri";s:5:"/home";s:14:"requestHeaders";a:1:{s:3:"key";a:1:{i:0;s:3:"val";}}s:11:"requestBody";' .
        's:3:"bod";s:12:"effectiveUri";s:6:"/index";s:18:"responseStatusCode";i:201;s:15:"responseHeaders";a:1:{' .
's:1:"k";a:1:{i:0;s:1:"v";}}s:12:"responseBody";s:3:"res";}';

    $respondedRequest = unserialize($serialized);

    /** @var RespondedRequest $respondedRequest */

    expect($respondedRequest)->toBeInstanceOf(RespondedRequest::class)
        ->and($respondedRequest->request->getMethod())->toBe('POST')
        ->and($respondedRequest->request->getUri()->__toString())->toBe('/home')
        ->and($respondedRequest->request->getHeaders())->toBe(['key' => ['val']])
        ->and($respondedRequest->request->getBody()->getContents())->toBe('bod')
        ->and($respondedRequest->effectiveUri())->toBe('/index')
        ->and($respondedRequest->response->getStatusCode())->toBe(201)
        ->and($respondedRequest->response->getHeaders())->toBe(['k' => ['v']])
        ->and($respondedRequest->response->getBody()->getContents())->toBe('res');
});

// Current payload format including screenshots. The screenshot paths are spliced into the payload
// with their byte length (strlen), as the serialize format requires for strings.
test('a serialized instance can be unserialized', function () {
    // We need actual existing file paths for screenshots
    $screenshot1 = helper_testfilesdir('screenshot1.png');

    $screenshot2 = helper_testfilesdir('screenshot2.jpeg');

    $serialized = 'O:51:"Crwlr\Crawler\Loader\Http\Messages\RespondedRequest":9:{s:13:"requestMethod";s:4:"POST";' .
        's:10:"requestUri";s:5:"/home";s:14:"requestHeaders";a:1:{s:3:"key";a:1:{i:0;s:3:"val";}}s:11:"requestBody";' .
        's:3:"bod";s:12:"effectiveUri";s:6:"/index";s:18:"responseStatusCode";i:201;s:15:"responseHeaders";a:1:{' .
        's:1:"k";a:1:{i:0;s:1:"v";}}s:12:"responseBody";s:3:"res";s:11:"screenshots";a:2:{i:0;' .
        's:' . strlen($screenshot1) . ':"' . $screenshot1 . '";i:1;' .
        's:' . strlen($screenshot2) . ':"' . $screenshot2 .
'";}}';

    $respondedRequest = unserialize($serialized);

    /** @var RespondedRequest $respondedRequest */

    expect($respondedRequest)->toBeInstanceOf(RespondedRequest::class)
        ->and($respondedRequest->request->getMethod())->toBe('POST')
        ->and($respondedRequest->request->getUri()->__toString())->toBe('/home')
        ->and($respondedRequest->request->getHeaders())->toBe(['key' => ['val']])
        ->and($respondedRequest->request->getBody()->getContents())->toBe('bod')
        ->and($respondedRequest->effectiveUri())->toBe('/index')
        ->and($respondedRequest->response->getStatusCode())->toBe(201)
        ->and($respondedRequest->response->getHeaders())->toBe(['k' => ['v']])
        ->and($respondedRequest->response->getBody()->getContents())->toBe('res')
        ->and($respondedRequest->screenshots[0]->path)->toBe($screenshot1)
        ->and($respondedRequest->screenshots[1]->path)->toBe($screenshot2);
});

// fromArray() must accept an eight-entry array that has no "screenshots" key.
it('can be created from an old serialized array that was not containing the screenshots array', function () {
    $serialized = 'a:8:{s:13:"requestMethod";s:3:"GET";s:10:"requestUri";s:4:"/foo";s:14:"requestHeaders";a:0:{}s:11:' .
        '"requestBody";s:0:"";s:12:"effectiveUri";s:4:"/bar";s:18:"responseStatusCode";i:200;s:15:"responseHeaders";' .
        'a:0:{}s:12:"responseBody";s:0:"";}';

    $respondedRequest = RespondedRequest::fromArray(unserialize($serialized));

    expect($respondedRequest)->toBeInstanceOf(RespondedRequest::class)
        ->and($respondedRequest->request->getUri()->__toString())->toBe('/foo')
        ->and($respondedRequest->effectiveUri())->toBe('/bar');
});

// fromArray() with a "screenshots" entry whose paths point to files from the test files directory.
it('can be created from a serialized array that is containing the screenshots array', function () {
    // We need actual existing file paths
    $screenshot1 = helper_testfilesdir('screenshot1.png');

    $screenshot2 = helper_testfilesdir('screenshot2.jpeg');

    $serialized = 'a:9:{s:13:"requestMethod";s:3:"GET";s:10:"requestUri";s:4:"/foo";s:14:"requestHeaders";a:0:{}s:11:' .
        '"requestBody";s:0:"";s:12:"effectiveUri";s:4:"/bar";s:18:"responseStatusCode";i:200;s:15:"responseHeaders";' .
'a:0:{}s:12:"responseBody";s:0:"";s:11:"screenshots";a:2:{i:0;' .
        's:' . strlen($screenshot1) . ':"' . $screenshot1 . '";i:1;' .
        's:' . strlen($screenshot2) . ':"' . $screenshot2 . '";}}';

    $respondedRequest = RespondedRequest::fromArray(unserialize($serialized));

    // Both screenshot paths must survive the round trip, in their original order.
    expect($respondedRequest)->toBeInstanceOf(RespondedRequest::class)
        ->and($respondedRequest->request->getUri()->__toString())->toBe('/foo')
        ->and($respondedRequest->effectiveUri())->toBe('/bar')
        ->and($respondedRequest->screenshots[0]->path)->toBe($screenshot1)
        ->and($respondedRequest->screenshots[1]->path)->toBe($screenshot2);
});

// Counterpart to the previous test: here the payload (continued below) references screenshot
// paths under /path/to/, which the test title describes as non-existent.
test(
    'when creating from a serialized array, it checks screenshot paths for existence and throws away screenshots ' .
    'when the files don\'t exist',
    function () {
        $serialized = 'a:9:{s:13:"requestMethod";s:3:"GET";s:10:"requestUri";s:4:"/foo";s:14:"requestHeaders";' .
            'a:0:{}s:11:"requestBody";s:0:"";s:12:"effectiveUri";s:4:"/bar";s:18:"responseStatusCode";i:200;' .
            's:15:"responseHeaders";a:0:{}s:12:"responseBody";s:0:"";s:11:"screenshots";a:2:{i:0;' .
's:24:"/path/to/screenshot1.png";i:1;s:25:"/path/to/screenshot2.jpeg";}}'; $respondedRequest = RespondedRequest::fromArray(unserialize($serialized)); expect($respondedRequest)->toBeInstanceOf(RespondedRequest::class) ->and($respondedRequest->request->getUri()->__toString())->toBe('/foo') ->and($respondedRequest->effectiveUri())->toBe('/bar') ->and($respondedRequest->screenshots)->toHaveCount(0); }, ); it('has a toArrayForResult() method', function () { $respondedRequest = new RespondedRequest( new Request('POST', '/home', ['key' => 'val'], 'bod'), new Response(201, ['k' => 'v'], 'res'), [new Screenshot('/path/to/screenshot.jpg')], ); expect($respondedRequest->toArrayForResult())->toBe([ 'requestMethod' => 'POST', 'requestUri' => '/home', 'requestHeaders' => ['key' => ['val']], 'requestBody' => 'bod', 'effectiveUri' => '/home', 'responseStatusCode' => 201, 'responseHeaders' => ['k' => ['v']], 'responseBody' => 'res', 'screenshots' => ['/path/to/screenshot.jpg'], 'url' => '/home', 'uri' => '/home', 'status' => 201, 'headers' => ['k' => ['v']], 'body' => 'res', ]); }); it('generates a cache key for an instance', function () { $respondedRequest = new RespondedRequest(new Request('GET', '/foo/bar'), new Response()); expect($respondedRequest->cacheKey())->toBe('27ca75942fb28ed0d8fb3f9b077dd582'); }); ================================================ FILE: tests/Loader/Http/Politeness/RobotsTxtHandlerTest.php ================================================ shouldReceive('sendRequest')->withArgs(function (RequestInterface $request) { return str_ends_with($request->getUri()->__toString(), '/robots.txt'); })->andReturn(new Response(200, [], Utils::streamFor($robotsTxtContent))); } return new HttpLoader($userAgent, $httpClient); } /** @var TestCase $this */ test('route is disallowed when it\'s disallowed for my user agent', function () { $robotsTxt = <<isAllowed('https://www.example.com/foo/bar'))->toBeFalse(); }); test('route is disallowed when it\'s disallowed for all user 
agents', function () { $robotsTxt = <<isAllowed('https://www.example.com/foo/bar'))->toBeFalse(); }); test( 'route is not disallowed when it\'s disallowed for all user agents but my user agent is not a BotUserAgent', function () { $robotsTxt = <<isAllowed('https://www.example.com/foo/bar'))->toBeTrue(); }, ); test( 'route is not disallowed when it\'s disallowed for all user agent but I want to ignore wildcard rules', function () { $robotsTxt = <<ignoreWildcardRules(); expect($robotsTxt->isAllowed('https://www.example.com/foo/bar'))->toBeTrue(); }, ); it('gets all the sitemap URLs from robots.txt', function () { $robotsTxt = <<getSitemaps('https://www.example.com/home'))->toBe([ 'https://www.example.com/sitemap.xml', 'https://www.example.com/sitemap2.xml', 'https://www.example.com/sitemap3.xml', ]); }); it('fails silently when parsing fails', function () { $robotsTxt = <<isAllowed('https://www.example.com/anything'))->toBeTrue(); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput)->toContain('Failed to parse robots.txt'); }); ================================================ FILE: tests/Loader/Http/Politeness/ThrottlerTest.php ================================================ waitAtLeast(Microseconds::fromSeconds(0.001)); $throttler->trackRequestStartFor($url); usleep(Microseconds::fromSeconds(0.1)->value); $throttler->trackRequestEndFor($url); $requestEndTime = Microseconds::fromSeconds(microtime(true)); $throttler->waitForGo($url); $readyForNextRequest = Microseconds::fromSeconds(microtime(true)); $diff = $readyForNextRequest->subtract($requestEndTime); expect($diff->value)->toBeGreaterThan(100000) ->and($diff->value)->toBeLessThan(220000); // A bit more than * 2.0 because other things happening also take time. 
});

// With a default-constructed Throttler, waitForGo() must block for more than 250000 microseconds
// after the end of the previous request to the same URL.
it('waits min 0.25s by default', function () {
    $url = Url::parsePsr7('https://www.example.com');

    $throttler = new Throttler();

    $throttler->trackRequestStartFor($url);

    $throttler->trackRequestEndFor($url);

    $requestEndTime = Microseconds::fromSeconds(microtime(true));

    $throttler->waitForGo($url);

    $readyForNextRequest = Microseconds::fromSeconds(microtime(true));

    $diff = $readyForNextRequest->subtract($requestEndTime);

    expect($diff->value)->toBeGreaterThan(250000);
});

// The request takes about 0.1s (usleep). With MultipleOf(10) to MultipleOf(20) the wait would
// presumably be a multiple of that duration, so the 0.1s maximum has to cap it.
it('respects the max wait time you set', function () {
    $url = Url::parsePsr7('https://www.example.com');

    $throttler = new Throttler();

    $throttler
        ->waitBetween(new MultipleOf(10), new MultipleOf(20))
        ->waitAtMax(Microseconds::fromSeconds(0.1));

    $throttler->trackRequestStartFor($url);

    usleep(Microseconds::fromSeconds(0.1)->value);

    $throttler->trackRequestEndFor($url);

    $requestEndTime = Microseconds::fromSeconds(microtime(true));

    $throttler->waitForGo($url);

    $readyForNextRequest = Microseconds::fromSeconds(microtime(true));

    $diff = $readyForNextRequest->subtract($requestEndTime);

    expect($diff->value)->toBeLessThan(110000); // A bit more than * 1.0 because other things happening also take time.
}); it('waits only if there was already a request to the same domain', function () { $url = Url::parsePsr7('https://www.example.com'); $throttler = new Throttler(); $throttler ->waitBetween(new MultipleOf(10), new MultipleOf(20)) ->waitAtMax(Microseconds::fromSeconds(0.1)); $throttler->trackRequestStartFor($url); usleep(Microseconds::fromSeconds(0.01)->value); $throttler->trackRequestEndFor($url); $requestEndTime = Microseconds::fromSeconds(microtime(true)); $throttler->waitForGo(Url::parsePsr7('https://www.crwlr.software')); $readyForNextRequest = Microseconds::fromSeconds(microtime(true)); $diff = $readyForNextRequest->subtract($requestEndTime); expect($diff->value)->toBeLessThan(1000); }); it('throws an exception if you try to set different types for from and to', function () { new Throttler(Microseconds::fromSeconds(0.1), new MultipleOf(0.5)); })->throws(InvalidArgumentException::class); it('throws an exception if you try to set the from value bigger than the to value with Microseconds', function () { new Throttler(Microseconds::fromSeconds(2.0), Microseconds::fromSeconds(1.0)); })->throws(InvalidArgumentException::class); it('throws an exception if you try to set the from value bigger than the to value with MultipleOf', function () { new Throttler(new MultipleOf(1.0), new MultipleOf(0.9)); })->throws(InvalidArgumentException::class); it('does not throw an exception when from and to values are equal', function () { new Throttler(Microseconds::fromSeconds(2.0), Microseconds::fromSeconds(2.0)); new Throttler(new MultipleOf(1.0), new MultipleOf(1.0)); expect(true)->toBeTrue(); }); test('internal _requestToUrlWasStarted returns false when _internalTrackStartFor was not called', function () { $url = Url::parsePsr7('https://www.example.com'); $throttler = new Throttler(); $throttler ->waitBetween(Microseconds::fromSeconds(0.001), Microseconds::fromSeconds(0.002)) ->waitAtMax(Microseconds::fromSeconds(0.002)); 
expect(invade($throttler)->_requestToUrlWasStarted($url))->toBeFalse(); $throttler->trackRequestEndFor($url); // To check if no error/exception occurs when start was not called before. }); test('internal _requestToUrlWasStarted returns true when _internalTrackStartFor was called', function () { $url = Url::parsePsr7('https://www.example.com'); $throttler = new Throttler(); $throttler ->waitBetween(Microseconds::fromSeconds(0.001), Microseconds::fromSeconds(0.002)) ->waitAtMax(Microseconds::fromSeconds(0.002)); $throttler->trackRequestStartFor($url); $invadedThrottler = invade($throttler); expect($invadedThrottler->_requestToUrlWasStarted($url))->toBeTrue(); // And after end of the request is tracked, it should return false again. $throttler->trackRequestEndFor($url); expect($invadedThrottler->_requestToUrlWasStarted($url))->toBeFalse(); }); ================================================ FILE: tests/Loader/Http/Politeness/TimingUnits/MultipleOfTest.php ================================================ calc(Microseconds::fromSeconds(1.23)) ->toSeconds(), )->toBe(9.7047); }); ================================================ FILE: tests/Loader/Http/ProxyManagerTest.php ================================================ hasOnlySingleProxy()) ->toBeTrue() ->and($manager->hasMultipleProxies()) ->toBeFalse(); $manager = new ProxyManager(['http://127.0.0.1:8001', 'http://127.0.0.1:8002']); expect($manager->hasOnlySingleProxy()) ->toBeFalse() ->and($manager->hasMultipleProxies()) ->toBeTrue(); }); it('returns the proxy when only one is defined', function () { $manager = new ProxyManager(['http://127.0.0.1:8003']); expect($manager->getProxy()) ->toBe('http://127.0.0.1:8003') ->and($manager->getProxy()) ->toBe('http://127.0.0.1:8003'); }); it('rotates the proxies when multiple are defined', function () { $manager = new ProxyManager(['http://127.0.0.1:8001', 'http://127.0.0.1:8002', 'http://127.0.0.1:8003']); expect($manager->getProxy()) ->toBe('http://127.0.0.1:8001') 
->and($manager->getProxy()) ->toBe('http://127.0.0.1:8002') ->and($manager->getProxy()) ->toBe('http://127.0.0.1:8003') ->and($manager->getProxy()) ->toBe('http://127.0.0.1:8001'); }); ================================================ FILE: tests/Loader/LoaderTest.php ================================================ hookName === 'afterLoad') { $this->callHook('beforeLoad'); // Loader won't run afterLoad when beforeLoad wasn't called. } $this->callHook($this->hookName); return 'something'; } public function loadOrFail(mixed $subject): mixed { return 'something'; } }; $callback1Called = false; $loader->{$hookName}(function () use (&$callback1Called) { $callback1Called = true; }); $callback2Called = false; $loader->{$hookName}(function () use (&$callback2Called) { $callback2Called = true; }); $callback3Called = false; $loader->{$hookName}(function () use (&$callback3Called) { $callback3Called = true; }); $loader->load('something'); expect($callback1Called)->toBeTrue() ->and($callback2Called)->toBeTrue() ->and($callback3Called)->toBeTrue(); })->with([ 'beforeLoad', 'onCacheHit', 'onSuccess', 'onError', 'afterLoad', ]); it('does not call the afterLoad hook when beforeLoad was not called before it', function () { $logger = new DummyLogger(); $loader = new class (new BotUserAgent('FooBot'), $logger) extends Loader { public function load(mixed $subject): mixed { $this->callHook('afterLoad'); return 'something'; } public function loadOrFail(mixed $subject): mixed { return 'something'; } }; $callbackCalled = false; $loader->afterLoad(function () use (&$callbackCalled) { $callbackCalled = true; }); $loader->load('something'); expect($callbackCalled)->toBeFalse() ->and($logger->messages[0]['message'])->toStartWith( 'The afterLoad hook was called without a preceding call to the beforeLoad hook.', ); }); it('calls the afterLoad hook when beforeLoad was called before it', function () { $logger = new DummyLogger(); $loader = new class (new BotUserAgent('FooBot'), $logger) extends 
Loader { public function load(mixed $subject): mixed { $this->callHook('beforeLoad'); $this->callHook('afterLoad'); return 'something'; } public function loadOrFail(mixed $subject): mixed { return 'something'; } }; $callbackCalled = false; $loader->afterLoad(function () use (&$callbackCalled) { $callbackCalled = true; }); $loader->load('something'); expect($callbackCalled)->toBeTrue() ->and($logger->messages)->toHaveCount(0); }); test('You can set a cache and use it in the load function', function () { $loader = new class (new BotUserAgent('FooBot')) extends Loader { public function load(mixed $subject): string { $this->cache?->get('foo'); return 'something'; } public function loadOrFail(mixed $subject): mixed { return 'something'; } }; $cache = Mockery::mock(CacheInterface::class); $cache->shouldReceive('get')->with('foo')->once(); $loader->setCache($cache); $loader->load('something'); }); ================================================ FILE: tests/Logger/CliLoggerTest.php ================================================ log('info', 'Some log message.'); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('Some log message.'); }); test('It prints the log level', function () { $logger = new CliLogger(); $logger->log('alert', 'Everybody panic!'); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('[ALERT]'); }); test('It starts with printing the time', function () { $logger = new CliLogger(); $logger->log('warning', 'Warn about something.'); $this->expectOutputRegex('/^\d\d:\d\d:\d\d:\d\d\d\d\d\d/'); }); test('It has methods for all the log levels', function ($logLevel) { $logger = new CliLogger(); $logger->{$logLevel}('Some message'); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('Some message'); expect($output)->toContain('[' . strtoupper($logLevel) . 
']'); })->with([ 'emergency', 'alert', 'critical', 'error', 'warning', 'notice', 'info', 'debug', ]); ================================================ FILE: tests/Logger/PreStepInvocationLoggerTest.php ================================================ info('test'); $logger->warning('foo'); $logger->error('some error'); expect($logger->messages)->toHaveCount(3) ->and($logger->messages[0]['level'])->toBe('info') ->and($logger->messages[0]['message'])->toBe('test') ->and($logger->messages[1]['level'])->toBe('warning') ->and($logger->messages[1]['message'])->toBe('foo') ->and($logger->messages[2]['level'])->toBe('error') ->and($logger->messages[2]['message'])->toBe('some error'); }); it('passes log messages to another logger', function () { $logger = new PreStepInvocationLogger(); $logger->info('test'); $logger->warning('foo'); $logger->error('some error'); $anotherLogger = new DummyLogger(); $logger->passToOtherLogger($anotherLogger); expect($anotherLogger->messages)->toHaveCount(3) ->and($anotherLogger->messages[0]['level'])->toBe('info') ->and($anotherLogger->messages[0]['message'])->toBe('test') ->and($anotherLogger->messages[1]['level'])->toBe('warning') ->and($anotherLogger->messages[1]['message'])->toBe('foo') ->and($anotherLogger->messages[2]['level'])->toBe('error') ->and($anotherLogger->messages[2]['message'])->toBe('some error'); }); ================================================ FILE: tests/Pest.php ================================================ group('integration') ->beforeEach(function () { if (!isset(TestServerProcess::$process)) { TestServerProcess::$process = Process::fromShellCommandline( 'php -S localhost:8000 ' . __DIR__ . 
'/_Integration/Server.php', ); TestServerProcess::$process->start(); /* 100 ms pause right after starting the process; presumably gives the server time to come up before the first request (confirm). */ usleep(100000); } }) /* afterAll hook: stops the server process (passing 3 and SIGINT to stop()) and resets the static handle to null. */ ->afterAll(function () { TestServerProcess::$process?->stop(3, SIGINT); TestServerProcess::$process = null; }) ->in('_Integration');
/**
 * Debug helper: writes a var_export() representation of $var to the PHP error log.
 */
function helper_dump(mixed $var): void { error_log(var_export($var, true)); }
/**
 * Debug helper: prints $var with var_dump(), calls ob_end_flush() and then terminates the script with exit.
 */
function helper_dieDump(mixed $var): void { var_dump($var); ob_end_flush(); exit; }
/**
 * Returns an anonymous step that ignores its input and yields the given $value exactly once.
 *
 * Its outputType() is AssociativeArrayOrObject when OutputTypeHelper::isAssociativeArrayOrObject() accepts the
 * value, and Scalar otherwise.
 */
function helper_getValueReturningStep(mixed $value): Step { return new class ($value) extends Step { public function __construct(private mixed $value) {} protected function invoke(mixed $input): Generator { yield $this->value; } public function outputType(): StepOutputType { return OutputTypeHelper::isAssociativeArrayOrObject($this->value) ? StepOutputType::AssociativeArrayOrObject : StepOutputType::Scalar; } }; }
/**
 * Returns an anonymous step that yields its input unchanged as its only output.
 */
function helper_getInputReturningStep(): Step { return new class extends Step { protected function invoke(mixed $input): Generator { yield $input; } }; }
/**
 * Returns an anonymous step that yields its input plus one.
 */
function helper_getNumberIncrementingStep(): Step { return new class extends Step { protected function invoke(mixed $input): Generator { yield $input + 1; } }; }
/**
 * Returns an anonymous step that yields eight number words, one output each. The list intentionally contains
 * duplicates ('two' twice, 'three' three times).
 */
function helper_getStepYieldingMultipleNumbers(): Step { return new class extends Step { protected function invoke(mixed $input): Generator { foreach (['one', 'two', 'two', 'three', 'four', 'three', 'five', 'three'] as $number) { yield $number; } } }; }
/**
 * Returns an anonymous step that yields one array per number word (same list, with duplicates, as above).
 *
 * Each array has the keys 'number' and 'foo'. 'foo' is 'bar', and only when the input is exactly true, the list
 * index is appended after a space.
 */
function helper_getStepYieldingMultipleArraysWithNumber(): Step { return new class extends Step { protected function invoke(mixed $input): Generator { foreach (['one', 'two', 'two', 'three', 'four', 'three', 'five', 'three'] as $key => $number) { yield ['number' => $number, 'foo' => 'bar' . ($input === true ? ' ' .
$key : '')]; } } }; }
/**
 * Returns an anonymous step that yields a single stdClass object (built with helper_getStdClassWithData()).
 *
 * The object has the properties 'number' (the given $number) and 'foo'. 'foo' is 'bar', and when the step input
 * is an integer, that integer is appended after a space.
 */
function helper_getStepYieldingObjectWithNumber(int $number): Step { return new class ($number) extends Step { public function __construct(private int $number) {} protected function invoke(mixed $input): Generator { yield helper_getStdClassWithData( ['number' => $this->number, 'foo' => 'bar' . (is_int($input) ? ' ' . $input : '')], ); } }; }
/**
 * Returns an anonymous step that yields one stdClass object per number word of a fixed list of eight words
 * (containing duplicates).
 *
 * Each object has the properties 'number' and 'foo'. 'foo' is 'bar', and only when the input is exactly true, the
 * list index is appended after a space.
 */
function helper_getStepYieldingMultipleObjectsWithNumber(): Step { return new class extends Step { protected function invoke(mixed $input): Generator { foreach (['one', 'two', 'two', 'three', 'four', 'three', 'five', 'three'] as $key => $number) { yield helper_getStdClassWithData( ['number' => $number, 'foo' => 'bar' . ($input === true ? ' ' . $key : '')], ); } } }; }
/**
 * Returns an anonymous step that iterates over its input and yields every element as a separate output.
 */
function helper_getStepYieldingInputArrayAsSeparateOutputs(): Step { return new class extends Step { protected function invoke(mixed $input): Generator { foreach ($input as $output) { yield $output; } } }; }
/**
 * Returns an anonymous step that uses the LoadingStep trait and always yields the string 'yo'.
 */
function helper_getLoadingStep(): Step { return new class extends Step { /** * @use LoadingStep */ use LoadingStep; protected function invoke(mixed $input): Generator { yield 'yo'; } }; }
/**
 * Builds a 200 response whose body is a minimal robots.txt with one "User-agent: FooBot" group and one Disallow
 * rule.
 *
 * With a (truthy) $forDomain the disallowed path is '/' . $forDomain . '/secret', otherwise it is 'secret'.
 * NOTE(review): the default path 'secret' has no leading slash, unlike the $forDomain variant - confirm that this
 * is intended.
 */
function helper_getDummyRobotsTxtResponse(?string $forDomain = null): Response { return new Response( 200, [], "User-agent: FooBot\n" . "Disallow: " . ($forDomain ? '/' . $forDomain . 
'/secret' : 'secret'), ); } /** * @param iterable $iterable * @return void */ function helper_traverseIterable(iterable $iterable): void { foreach ($iterable as $key => $value) { // just traverse } } /** * @param mixed[] $array * @return Generator */ function helper_arrayToGenerator(array $array): Generator { foreach ($array as $element) { yield $element; } } /** * @param Generator $generator * @return mixed[] */ function helper_generatorToArray(Generator $generator): array { $array = []; foreach ($generator as $value) { $array[] = $value; } return $array; } /** * @return Output[] */ function helper_invokeStepWithInput(StepInterface $step, mixed $input = null): array { return helper_generatorToArray($step->invokeStep(new Input($input ?? 'anything'))); } function helper_getStepFilesContent(string $filePathInFilesFolder): string { $content = file_get_contents(__DIR__ . '/Steps/_Files/' . $filePathInFilesFolder); if ($content === false) { return ''; } return $content; } /** * @param mixed[] $data */ function helper_getStdClassWithData(array $data): stdClass { $object = new stdClass(); foreach ($data as $key => $value) { $object->{$key} = $value; } return $object; } function helper_getSimpleListHtml(): string { return <<
  • one
  • two
  • three
  • four
  • HTML; }
/**
 * Creates an HttpLoader whose throttling is configured with very small values, so tests don't spend time waiting.
 *
 * Falls back to UserAgent::mozilla5CompatibleBrowser() when no user agent is given. The throttler is set to
 * waitBetween(MultipleOf(0.0001), MultipleOf(0.0002)) and waitAtLeast(0.0001 seconds).
 */
function helper_getFastLoader( ?UserAgentInterface $userAgent = null, ?LoggerInterface $logger = null, ?ClientInterface $httpClient = null, ): HttpLoader { $loader = new HttpLoader($userAgent ?? UserAgent::mozilla5CompatibleBrowser(), $httpClient, $logger); $loader->throttle() ->waitBetween(new MultipleOf(0.0001), new MultipleOf(0.0002)) ->waitAtLeast(Microseconds::fromSeconds(0.0001)); return $loader; }
/**
 * Returns an anonymous HttpCrawler with the user agent 'TestBot' and a loader created by helper_getFastLoader().
 */
function helper_getFastCrawler(): HttpCrawler { return new class extends HttpCrawler { protected function userAgent(): UserAgentInterface { return new UserAgent('TestBot'); } protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface { return helper_getFastLoader($userAgent, $logger); } }; }
/**
 * Returns a plain UserAgent (not a BotUserAgent) with a browser-like user agent string.
 */
function helper_nonBotUserAgent(): UserAgent { return new UserAgent('Mozilla/5.0 (compatible; FooBot)'); }
/**
 * Returns a Throttler constructed with the same tiny values that helper_getFastLoader() configures:
 * MultipleOf(0.0001), MultipleOf(0.0002) and 0.0001 seconds.
 */
function helper_getMinThrottler(): Throttler { return new Throttler(new MultipleOf(0.0001), new MultipleOf(0.0002), Microseconds::fromSeconds(0.0001)); }
/**
 * Builds a RespondedRequest from a freshly created Request and Response.
 *
 * A body stream (Utils::streamFor()) is only attached to the request/response when the corresponding body argument
 * is not null; otherwise the message is created without an explicit body.
 *
 * @param array $requestHeaders
 * @param array $responseHeaders
 */
function helper_getRespondedRequest( string $method = 'GET', string $url = 'https://www.example.com/foo', array $requestHeaders = [], ?string $requestBody = null, int $statusCode = 200, array $responseHeaders = [], ?string $responseBody = null, ): RespondedRequest { if ($requestBody !== null) { $request = new Request($method, $url, $requestHeaders, Utils::streamFor($requestBody)); } else { $request = new Request($method, $url, $requestHeaders); } if ($responseBody !== null) { $response = new Response($statusCode, $responseHeaders, body: Utils::streamFor($responseBody)); } else { $response = new Response($statusCode, $responseHeaders); } return new RespondedRequest($request, $response); }
/**
 * Returns the path of the cache directory used by the tests (<this dir>/_Temp/_cachedir).
 *
 * When $inDir is given, it is appended to that path; a '/' separator is inserted unless $inDir already starts
 * with one.
 */
function helper_cachedir(?string $inDir = null): string { $path = __DIR__ . '/_Temp/_cachedir'; if ($inDir !== null) { return $path . (str_starts_with($inDir, '/') ? $inDir : '/' . 
$inDir); } return $path; }
/**
 * Empties the test cache directory (see helper_resetTempDir() for what exactly is removed).
 */
function helper_resetCacheDir(): void { helper_resetTempDir(helper_cachedir()); }
/**
 * Returns the path of the storage directory used by the tests (<this dir>/_Temp/_storagedir).
 *
 * When $inDir is given, it is appended to that path; a '/' separator is inserted unless $inDir already starts
 * with one.
 */
function helper_storagedir(?string $inDir = null): string { $path = __DIR__ . '/_Temp/_storagedir'; if ($inDir !== null) { return $path . (str_starts_with($inDir, '/') ? $inDir : '/' . $inDir); } return $path; }
/**
 * Empties the test storage directory (see helper_resetTempDir() for what exactly is removed).
 */
function helper_resetStorageDir(): void { helper_resetTempDir(helper_storagedir()); }
/**
 * Deletes the entries directly inside $dirPath, except '.', '..' and '.gitkeep'.
 *
 * Nothing is deleted when scandir() does not return an array. Errors from unlink() are suppressed with @, so
 * entries that can't be unlinked (e.g. sub-directories) are silently left in place; the function does not recurse.
 */
function helper_resetTempDir(string $dirPath): void { $files = scandir($dirPath); if (is_array($files)) { foreach ($files as $file) { if ($file === '.' || $file === '..' || $file === '.gitkeep') { continue; } @unlink($dirPath . '/' . $file); } } }
/**
 * Returns the path of the directory for files created by tests (<this dir>/_Temp/_testfilesdir).
 *
 * When $inDir is given, it is appended to that path; a '/' separator is inserted unless $inDir already starts
 * with one.
 */
function helper_testfilesdir(?string $inDir = null): string { $path = __DIR__ . '/_Temp/_testfilesdir'; if ($inDir !== null) { return $path . (str_starts_with($inDir, '/') ? $inDir : '/' . $inDir); } return $path; }
================================================ FILE: tests/ResultTest.php ================================================ set('title', 'PHP Web Developer'); expect($result->get('title'))->toBe('PHP Web Developer'); }); test('You can set multiple values for a property', function () { $result = new Result(); $result->set('location', 'Linz'); expect($result->get('location'))->toBe('Linz'); $result->set('location', 'Wien'); expect($result->get('location'))->toBe(['Linz', 'Wien']); }); test('The get method has a default value that you can set yourself', function () { $result = new Result(); expect($result->get('foo'))->toBeNull() ->and($result->get('foo', '123'))->toBe('123'); }); test('You can convert it to a plain array', function () { $result = new Result(); $result->set('title', 'PHP Web Developer (w/m/x)'); $result->set('location', 'Linz'); $result->set('location', 'Wien'); expect($result->toArray())->toBe([ 'title' => 'PHP Web Developer (w/m/x)', 'location' => ['Linz', 'Wien'], ]); }); test('Converting to an array, also converts all objects at any level in the array to arrays', function () { $result = new
Result(); $result->set('foo', 'one'); $result->set( 'bar', helper_getStdClassWithData([ 'a' => 'b', 'c' => helper_getStdClassWithData(['d' => 'e', 'f' => 'g']), ]), ); $resultArray = $result->toArray(); expect($resultArray)->toBe([ 'foo' => 'one', 'bar' => [ 'a' => 'b', 'c' => ['d' => 'e', 'f' => 'g'], ], ]); }); test( 'when the only element of the output array is some unnamed property, but the value is an array with keys, ' . 'it returns only that child array', function () { $result = new Result(); $result->set('unnamed', new RespondedRequest( new Request('GET', 'https://www.example.com/foo'), new Response(200, [], 'Hello World!'), [new Screenshot('/path/to/screenshot.png')], )); $resultArray = $result->toArray(); expect($resultArray)->toBeArray() ->and(count($resultArray))->toBeGreaterThanOrEqual(14) ->and($resultArray['url'])->toBe('https://www.example.com/foo') ->and($resultArray['status'])->toBe(200) ->and($resultArray['body'])->toBe('Hello World!') ->and($resultArray['screenshots'][0])->toBe('/path/to/screenshot.png'); }, ); test( 'when the only element of the output array is an unnamed property, with a scalar value, it returns the unnamed key', function () { $result = new Result(); $result->set('unnamed', 'foo'); $resultArray = $result->toArray(); expect($resultArray)->toBe(['unnamed' => 'foo']); }, ); test('when you add something with empty string as key it creates a name with incrementing number', function () { $result = new Result(); $result->set('', 'foo'); expect($result->get('unnamed1'))->toBe('foo'); $result->set('', 'bar'); expect($result->get('unnamed2'))->toBe('bar'); $result->set('', 'baz'); expect($result->get('unnamed3'))->toBe('baz'); }); test('you can create a new instance from another instance', function () { $instance1 = new Result(); $instance1->set('foo', 'bar'); $instance2 = new Result($instance1); expect($instance1->get('foo'))->toBe('bar') ->and($instance2->get('foo'))->toBe('bar'); $instance2->set('baz', 'quz'); 
expect($instance1->get('baz'))->toBeNull() ->and($instance2->get('baz'))->toBe('quz'); }); test('it makes a proper array of arrays if you repeatedly add (associative) arrays with the same key', function () { $result = new Result(); $result->set('foo', ['bar' => 'one', 'baz' => 'two']); expect($result->get('foo'))->toBe(['bar' => 'one', 'baz' => 'two']); $result->set('foo', ['bar' => 'three', 'baz' => 'four']); expect($result->get('foo'))->toBe([ ['bar' => 'one', 'baz' => 'two'], ['bar' => 'three', 'baz' => 'four'], ]); }); ================================================ FILE: tests/Steps/BaseStepTest.php ================================================ passesAllFilters = $this->passesAllFilters($input->get()); yield new Output('yo'); } } /** @var TestCase $this */ test('You can set a filter and passesAllFilters() tells if an output value passes that filter', function () { $step = new TestStep(); $step->where(Filter::equal('hello')); helper_invokeStepWithInput($step, new Input('hello')); expect($step->passesAllFilters)->toBeTrue(); helper_invokeStepWithInput($step, new Input('hola')); expect($step->passesAllFilters)->toBeFalse(); }); test('You can set multiple filters and passesAllFilters() tells if an output value passes that filters', function () { $step = new TestStep(); $step->where(Filter::stringContains('foo')) ->where(Filter::equal('boo foo too')) ->where(Filter::notEqual('pew foo tew')); helper_invokeStepWithInput($step, new Input('boo foo too')); expect($step->passesAllFilters)->toBeTrue(); helper_invokeStepWithInput($step, new Input('foo something')); expect($step->passesAllFilters)->toBeFalse(); helper_invokeStepWithInput($step, new Input('pew foo tew')); expect($step->passesAllFilters)->toBeFalse(); }); test( 'you can link filters using orWhere and passesAllFilters() is true when one of those filters evaluates to true', function () { $step = new TestStep(); $step->where(Filter::stringStartsWith('foo')) ->orWhere(Filter::stringStartsWith('bar')) 
->orWhere(Filter::stringEndsWith('foo')); helper_invokeStepWithInput($step, new Input('foo bar baz')); expect($step->passesAllFilters)->toBeTrue(); helper_invokeStepWithInput($step, new Input('bar foo baz')); expect($step->passesAllFilters)->toBeTrue(); helper_invokeStepWithInput($step, new Input('bar baz foo')); expect($step->passesAllFilters)->toBeTrue(); helper_invokeStepWithInput($step, new Input('funky town')); expect($step->passesAllFilters)->toBeFalse(); }, ); it('uses a key from an array when providing a key to the filter() method', function () { $step = new TestStep(); $step->where('vendor', Filter::equal('crwlr')); helper_invokeStepWithInput($step, new Input(['vendor' => 'crwlr', 'package' => 'url'])); expect($step->passesAllFilters)->toBeTrue(); helper_invokeStepWithInput($step, new Input(['vendor' => 'illuminate', 'package' => 'support'])); expect($step->passesAllFilters)->toBeFalse(); }); it('uses a key from an object when providing a key to the filter() method', function () { $step = new TestStep(); $step->where('vendor', Filter::equal('crwlr')); helper_invokeStepWithInput($step, new Input( helper_getStdClassWithData(['vendor' => 'crwlr', 'package' => 'url']), )); expect($step->passesAllFilters)->toBeTrue(); helper_invokeStepWithInput($step, new Input( helper_getStdClassWithData(['vendor' => 'illuminate', 'package' => 'support']), )); expect($step->passesAllFilters)->toBeFalse(); }); it('filters using a custom Closure filter', function () { $step = new TestStep(); $step->where('bar', Filter::custom(function (mixed $value) { return in_array($value, ['one', 'two', 'three'], true); })); helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => 'two']); expect($step->passesAllFilters)->toBeTrue(); helper_invokeStepWithInput($step, ['foo' => 'three', 'bar' => 'four']); expect($step->passesAllFilters)->toBeFalse(); }); it('throws an exception when you provide a string as first argument to filter but no second argument', function () { $step = new 
TestStep(); $step->where('test'); })->throws(InvalidArgumentException::class); it('removes an UTF-8 byte order mark from the beginning of a string', function () { $step = new class extends Step { protected function invoke(mixed $input): Generator { yield $input; } protected function validateAndSanitizeInput(mixed $input): mixed { return parent::validateAndSanitizeStringOrHttpResponse($input); } }; $stringWithBom = helper_getStepFilesContent('Xml/rss-with-bom.xml'); $response = new RespondedRequest( new Request('GET', 'https://www.example.com/rss'), new Response(body: $stringWithBom), ); $outputs = helper_invokeStepWithInput($step, $response); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBeString() ->and(substr($outputs[0]->get(), 0, 5))->toBe('toHaveCount(1) ->and($outputs[0]->get())->toBeString() ->and(substr($outputs[0]->get(), 0, 5))->toBe('addLogger(new PreStepInvocationLogger()); $this->logger?->info('test'); $this->logger?->warning('foo'); } protected function invoke(mixed $input): Generator { yield $input; } }; $crawlerLogger = new DummyLogger(); $step->addLogger($crawlerLogger); expect($crawlerLogger->messages)->toHaveCount(2) ->and($crawlerLogger->messages[0]['level'])->toBe('info') ->and($crawlerLogger->messages[0]['message'])->toBe('test') ->and($crawlerLogger->messages[1]['level'])->toBe('warning') ->and($crawlerLogger->messages[1]['message'])->toBe('foo'); }, ); it( 'when using a PreStepInvocationLogger, the later created logger is also passed to refiners, so its log messages ' . 
'won\'t be lost', function () { $step = new class extends Step { public function __construct() { $this->addLogger(new PreStepInvocationLogger()); $this->logger?->info('test'); } protected function invoke(mixed $input): Generator { yield $input; } }; $step->refineOutput('foo', StringRefiner::replace('foo', 'bar')); $logger = new DummyLogger(); $step->addLogger($logger); helper_invokeStepWithInput($step, ['foo' => 1.2]); expect($logger->messages)->toHaveCount(2) ->and($logger->messages[1]['message'])->toBe( 'Refiner StringRefiner::replace() can\'t be applied to value of type double', ); }, ); /* ----------------------------- validateBeforeRun() ----------------------------- */ it( 'throws an exception in validateBeforeRun() when output type is scalar and keep() was used but not keepAs()', function () { $step = new class extends Step { protected function invoke(mixed $input): Generator { yield $input; } public function outputType(): StepOutputType { return StepOutputType::Scalar; } }; $step->keep()->validateBeforeRun(Http::get()); }, )->throws(PreRunValidationException::class); it( 'logs a warning in validateBeforeRun() when output type is mixed and keep() was used but not keepAs()', function () { class SomeDemoStep extends Step { protected function invoke(mixed $input): Generator { yield $input; } } $step = new SomeDemoStep(); $step->addLogger(new CliLogger())->keep()->validateBeforeRun(Http::get()); expect($this->getActualOutputForAssertion()) ->toContain('The tests\Steps\SomeDemoStep step potentially yields scalar value outputs'); }, ); test( 'the warning message, when output type is mixed and keep() was used but not keepAs() with an anonymous step ' . 'class, extending a step that isn\'t one of the abstract classes Step or BaseStep, contains the parent step ' . 
'class', function () { class ParentStepClass extends Step { protected function invoke(mixed $input): Generator { yield $input; } } $step = new class extends ParentStepClass {}; $step->addLogger(new CliLogger())->keep()->validateBeforeRun(Http::get()); expect($this->getActualOutputForAssertion()) ->toContain( 'An anonymous class step, that is extending the tests\\Steps\\ParentStepClass step potentially ' . 'yields scalar value outputs', ); }, ); test( 'the warning message, when output type is mixed and keep() was used but not keepAs() with an anonymous step ' . 'class, extending one of the abstract classes Step or BaseStep, only mentions that it is an anonymous step class', function (string $extendClass) { $step = null; if ($extendClass === Step::class) { $step = new class extends Step { protected function invoke(mixed $input): Generator { yield $input; } }; } elseif ($extendClass === BaseStep::class) { $step = new class extends BaseStep { protected function invoke(mixed $input): Generator { yield $input; } public function invokeStep(Input $input): Generator { yield from $this->invoke($input); } }; } if ($step === null) { throw new Exception('Invalid $extendClass parameter'); } $step->addLogger(new CliLogger())->keep()->validateBeforeRun(Http::get()); expect($this->getActualOutputForAssertion()) ->toContain( 'An anonymous class step potentially yields scalar value outputs', ); }, )->with([ [Step::class], [BaseStep::class], ]);
/*
 * Verifies that no "potentially yields scalar value outputs" warning is logged when keepAs() is used.
 *
 * The step created by helper_getInputReturningStep() is an anonymous class, so a warning for it would start with
 * "An anonymous class step ..." and never mention SomeDemoStep. The assertion therefore checks for the part of
 * the message that all variants of the warning share; checking for the SomeDemoStep wording could never fail.
 */
it('does not throw an exception or log a warning when output type is scalar and keepAs() was called', function () {
    helper_getInputReturningStep()->addLogger(new CliLogger())->keepAs('foo')->validateBeforeRun(Http::get());

    expect($this->getActualOutputForAssertion())
        ->not()
        ->toContain('potentially yields scalar value outputs');
});
it('does not throw an exception or log a warning when output type is scalar and outputKey() was called', function () { helper_getInputReturningStep()->addLogger(new
CliLogger())->outputKey('foo')->validateBeforeRun(Http::get());

    /*
     * The step under test is an anonymous class, so a warning for it would read "An anonymous class step
     * potentially yields scalar value outputs ...". Asserting on the wording shared by all variants of the
     * warning makes this negative check meaningful; the SomeDemoStep wording could never appear here.
     */
    expect($this->getActualOutputForAssertion())
        ->not()
        ->toContain('potentially yields scalar value outputs');
});
it('throws an exception when keepFromInput() was called and initial inputs contain a scalar value', function () { Http::get() ->keepFromInput() ->validateBeforeRun([ ['foo' => 'bar', 'baz' => 'quz'], 'scalar', ]); })->throws(PreRunValidationException::class); it('does not throw an exception when keepFromInput() was called and initial inputs are associative array', function () { Http::get() ->keepFromInput() ->validateBeforeRun([ ['foo' => 'one'], ['foo' => 'two'], ]); })->throwsNoExceptions(); it('logs an error when initial inputs are empty', function () { Http::get() ->addLogger(new CliLogger()) ->validateBeforeRun([]); expect($this->getActualOutputForAssertion()) ->toContain('You did not provide any initial inputs for your crawler.'); }); it('throws an exception when keepFromInput() was called and previous step yields scalar outputs', function () { Http::get() ->keepFromInput() ->validateBeforeRun(Html::getLink('.link')); })->throws(PreRunValidationException::class); it('does not throw an exception when keepInputAs() was called and previous step yields scalar outputs', function () { Http::get() ->keepInputAs('link') ->validateBeforeRun(Html::getLink('.link')); })->throwsNoExceptions(); it('logs a warning, when keepFromInput() was called and previous step yields mixed outputs', function () { $stepWithMixedOutputType = new class extends Step { protected function invoke(mixed $input): Generator { yield 'yo'; } public function outputType(): StepOutputType { return StepOutputType::Mixed; } }; Http::get() ->keepFromInput() ->addLogger(new CliLogger()) ->validateBeforeRun($stepWithMixedOutputType); expect($this->getActualOutputForAssertion()) ->toContain('potentially yields scalar value outputs ') ->toContain('the next step can not keep it by using
keepFromInput()'); }); test( 'the warning message, when keepFromInput() was called and previous step yields mixed outputs with an anonymous ' . 'step class, extending a step that isn\'t one of the abstract classes Step or BaseStep, contains the parent step ' . 'class', function () { class ParentStepClassTwo extends Step { protected function invoke(mixed $input): Generator { yield 'yo'; } public function outputType(): StepOutputType { return StepOutputType::Mixed; } } $stepWithMixedOutputType = new class extends ParentStepClassTwo {}; Http::get() ->keepFromInput() ->addLogger(new CliLogger()) ->validateBeforeRun($stepWithMixedOutputType); expect($this->getActualOutputForAssertion()) ->toContain( 'An anonymous class step, that is extending the tests\\Steps\\ParentStepClassTwo step potentially ' . 'yields scalar value outputs', ); }, ); test( 'the warning message, when keepFromInput() was called and previous step yields mixed outputs with an anonymous ' . 'step class, extending one of the abstract classes Step or BaseStep, only mentions that it is an anonymous step ' . 
'class',
    function (string $extendClass) {
        // Build one anonymous step per data set entry; any other class name is rejected right away.
        $stepWithMixedOutputType = match ($extendClass) {
            Step::class => new class extends Step {
                protected function invoke(mixed $input): Generator
                {
                    yield 'yo';
                }

                public function outputType(): StepOutputType
                {
                    return StepOutputType::Mixed;
                }
            },
            // The BaseStep variant additionally defines invokeStep(), delegating to invoke().
            BaseStep::class => new class extends BaseStep {
                protected function invoke(mixed $input): Generator
                {
                    yield 'yo';
                }

                public function outputType(): StepOutputType
                {
                    return StepOutputType::Mixed;
                }

                public function invokeStep(Input $input): Generator
                {
                    yield from $this->invoke($input);
                }
            },
            default => throw new Exception('Invalid $extendClass parameter'),
        };

        Http::get()
            ->keepFromInput()
            ->addLogger(new CliLogger())
            ->validateBeforeRun($stepWithMixedOutputType);

        $loggedOutput = $this->getActualOutputForAssertion();

        expect($loggedOutput)->toContain('An anonymous class step potentially yields scalar value outputs');
    },
)->with([
    [Step::class],
    [BaseStep::class],
]);

/* ----------------------------- keep() ----------------------------- */

it('adds all from array output to the keep array in the output object, when keep() is called', function () {
    $outputs = helper_invokeStepWithInput(
        helper_getInputReturningStep()->keep(),
        ['foo' => 'one', 'bar' => 'two'],
    );

    expect($outputs[0]->keep)->toBe(['foo' => 'one', 'bar' => 'two']);
});

it('adds all from object output to the keep array in the output object, when keep() is called', function () {
    // Output object exposing its data via toArray().
    $outputObject = new class {
        /**
         * @return array<string, string>
         */
        public function toArray(): array
        {
            return ['key' => 'value', 'key2' => 'value2'];
        }
    };

    $outputs = helper_invokeStepWithInput(helper_getInputReturningStep()->keep(), $outputObject);

    expect($outputs[0]->keep)->toBe(['key' => 'value', 'key2' => 'value2']);
});

it('adds a key from array output to the keep array in the output, when keep() was called with a string', function () {
    $step =
helper_getInputReturningStep()->keep('bar');

    $outputs = helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => 'two']);

    expect($outputs[0]->keep)->toBe(['bar' => 'two']);
});

it('adds multiple keys to the keep array in the output, when keep() was called with an array', function () {
    $outputs = helper_invokeStepWithInput(
        helper_getInputReturningStep()->keep(['foo', 'baz']),
        ['foo' => 'one', 'bar' => 'two', 'baz' => 'three'],
    );

    expect($outputs[0]->keep)->toBe(['foo' => 'one', 'baz' => 'three']);
});

it('maps output data to the keep array in the output, when keep() was called with an associative array', function () {
    // String keys in the keep() argument become the key names in the kept data.
    $outputs = helper_invokeStepWithInput(
        helper_getInputReturningStep()->keep(['foo', 'mappedKey' => 'baz']),
        ['foo' => 'one', 'bar' => 'two', 'baz' => 'three'],
    );

    expect($outputs[0]->keep)->toBe(['foo' => 'one', 'mappedKey' => 'three']);
});

it('logs an error when output is scalar value and keep was used, and adds the value with an unnamed key', function () {
    $outputs = helper_invokeStepWithInput(
        helper_getInputReturningStep()
            ->addLogger(new CliLogger())
            ->keep(),
        'hello',
    );

    expect($outputs[0]->keep)->toBe(['unnamed1' => 'hello']);

    expect($this->getActualOutputForAssertion())
        ->toContain('yielded an output that is neither an associative array, nor an object');
});

it('repeatedly adds properties with unnamed keys with increasing numbers', function () {
    // The input already carries 'unnamed1'; scalar input and scalar output are expected to follow as 2 and 3.
    $outputs = helper_invokeStepWithInput(
        helper_getValueReturningStep('world')
            ->keepFromInput()
            ->keep(),
        new Input('hello', keep: ['unnamed1' => 'servus']),
    );

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->keep)->toBe(['unnamed1' => 'servus', 'unnamed2' => 'hello', 'unnamed3' => 'world']);
});

/* ----------------------------- keepAs() ----------------------------- */

it('adds scalar value output with the defined key to keep output data, when keepAs() was used', function () {
    $step = helper_getInputReturningStep()
        ->keepAs('greeting');

    $outputs =
helper_invokeStepWithInput($step, 'hello');

    expect($outputs[0]->keep)->toBe(['greeting' => 'hello']);
});

it('adds array output with the defined key to keep output data, when keepAs() was used', function () {
    $step = helper_getInputReturningStep()
        ->keepAs('test');

    $outputs = helper_invokeStepWithInput($step, ['foo' => 'bar']);

    // The whole output array is expected below the key passed to keepAs().
    expect($outputs[0]->keep)->toBe(['test' => ['foo' => 'bar']]);
});

/* ----------------------------- keepFromInput() ----------------------------- */

it('adds all from array input to the keep array in the output object, when keepFromInput() is called', function () {
    $step = helper_getValueReturningStep('foo')->keepFromInput();

    $outputs = helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => 'two']);

    expect($outputs[0]->keep)->toBe(['foo' => 'one', 'bar' => 'two']);
});

it('adds all from object input to the keep array in the output object, when keepFromInput() is called', function () {
    $step = helper_getValueReturningStep('foo')->keepFromInput();

    // Input object exposing its data via toArray().
    $inputObject = new class {
        /**
         * @return array<string, string>
         */
        public function toArray(): array
        {
            return ['key' => 'value', 'key2' => 'value2'];
        }
    };

    $outputs = helper_invokeStepWithInput($step, $inputObject);

    expect($outputs[0]->keep)->toBe(['key' => 'value', 'key2' => 'value2']);
});

it(
    'adds a key from array input to the keep array in the output, when keepFromInput() was called with a string',
    function () {
        $step = helper_getValueReturningStep('foo')->keepFromInput('bar');

        $outputs = helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => 'two']);

        expect($outputs[0]->keep)->toBe(['bar' => 'two']);
    },
);

it(
    'adds multiple keys from the input to the keep array in the output, when keepFromInput() was called with an array',
    function () {
        $step = helper_getValueReturningStep('foo')->keepFromInput(['foo', 'baz']);

        $outputs = helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => 'two', 'baz' => 'three']);

        expect($outputs[0]->keep)->toBe(['foo' => 'one', 'baz' => 'three']);
    },
);

it(
    'maps input data to the keep array in the output, when keepFromInput() was called with an associative array',
    function () {
        // String keys in the keepFromInput() argument become the key names in the kept data.
        $step = helper_getValueReturningStep('foo')->keepFromInput(['foo', 'mappedKey' => 'baz']);

        $outputs = helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => 'two', 'baz' => 'three']);

        expect($outputs[0]->keep)->toBe(['foo' => 'one', 'mappedKey' => 'three']);
    },
);

it('logs an error when input is scalar value and keep was used, and adds the value with an unnamed key', function () {
    $step = helper_getValueReturningStep('foo')
        ->addLogger(new CliLogger())
        ->keepFromInput();

    $outputs = helper_invokeStepWithInput($step, 'hey');

    expect($outputs[0]->keep)->toBe(['unnamed1' => 'hey'])
        ->and($this->getActualOutputForAssertion())
        ->toContain('received an input that is neither an associative array, nor an object');
});

/* ----------------------------- keepInputAs() ----------------------------- */

it('adds scalar value input with the defined key to keep output data, when keepInputAs() was used', function () {
    $step = helper_getValueReturningStep('yo')
        ->keepInputAs('greeting');

    $outputs = helper_invokeStepWithInput($step, 'hello');

    expect($outputs[0]->keep)->toBe(['greeting' => 'hello']);
});

// The description previously said keepAs(), although this test exercises keepInputAs().
it('adds array input with the defined key to keep output data, when keepInputAs() was used', function () {
    $step = helper_getValueReturningStep('yay')
        ->keepInputAs('test');

    $outputs = helper_invokeStepWithInput($step, ['foo' => 'bar']);

    // The whole input array is expected below the key passed to keepInputAs().
    expect($outputs[0]->keep)->toBe(['test' => ['foo' => 'bar']]);
});

/* ------------------------ combinations of keep calls ------------------------ */

it('makes an array of values when the same key should be kept from input and output', function () {
    $step = helper_getValueReturningStep(['foo' => 'one', 'bar' => 'two'])
        ->keepFromInput('foo')
        ->keep(['foo', 'bar']);

    $outputs = helper_invokeStepWithInput($step, ['foo' => 'bar']);

    // 'foo' exists in input ('bar') and output ('one'): both values are expected, input value first.
    expect($outputs[0]->keep)->toBe(['foo' => ['bar', 'one'], 'bar' => 'two']);
});

test('same key in input and output, but they are mapped to different keys for keep data', function () {
    $step = helper_getValueReturningStep(['foo' => 'one', 'bar' => 'two'])
        ->keepFromInput(['inputFoo' => 'foo'])
        ->keep(['foo', 'bar']);

    $outputs = helper_invokeStepWithInput($step, ['foo' => 'bar']);

    expect($outputs[0]->keep)->toBe(['inputFoo' => 'bar', 'foo' => 'one', 'bar' => 'two']);
});

it('merges data for the same key recursively', function () {
    $step = helper_getValueReturningStep(['foo' => ['one', 'two'], 'bar' => 'two'])
        ->keepFromInput('foo')
        ->keep(['foo', 'bar']);

    // 'foo' comes from three places here: previously kept data, the input value and the output value.
    $outputs = helper_invokeStepWithInput(
        $step,
        new Input(['foo' => ['bar', 'baz']], keep: ['foo' => 'test']),
    );

    expect($outputs[0]->keep)->toBe(['foo' => ['test', 'bar', 'baz', 'one', 'two'], 'bar' => 'two']);
});

/* ----------------------------- keepsAnything() ----------------------------- */

test(
    'keepsAnything() returns true when one of keep(), keepAs(), keepFromInput() or keepInputAs() was called',
    function (bool $callKeep, bool $callKeepAs, bool $callKeepFromInput, bool $callKeepInputAs, bool $expected) {
        $step = helper_getInputReturningStep();

        if ($callKeep) {
            $step->keep();
        }

        if ($callKeepAs) {
            $step->keepAs('foo');
        }

        if ($callKeepFromInput) {
            $step->keepFromInput();
        }

        if ($callKeepInputAs) {
            $step->keepInputAs('bar');
        }

        expect($step->keepsAnything())->toBe($expected);
    },
)->with([
    // Columns: call keep(), call keepAs(), call keepFromInput(), call keepInputAs(), expected result.
    [false, false, false, false, false],
    [true, false, false, false, true],
    [false, true, false, false, true],
    [false, false, true, false, true],
    [false, false, false, true, true],
]);

test(
    'keepsAnythingFromInputData() returns true when one of keepFromInput() or keepInputAs() was called',
    function (bool $callKeep, bool $callKeepAs, bool $callKeepFromInput, bool $callKeepInputAs, bool $expected) {
        $step = helper_getInputReturningStep();

        if ($callKeep) {
            $step->keep();
        }

        if ($callKeepAs) {
            $step->keepAs('foo');
        }

        if ($callKeepFromInput) {
            $step->keepFromInput();
        }

        if ($callKeepInputAs) {
            $step->keepInputAs('bar');
        }
expect($step->keepsAnythingFromInputData())->toBe($expected);
    },
)->with([
    [false, false, false, false, false],
    [true, false, false, false, false],
    [false, true, false, false, false],
    [false, false, true, false, true],
    [false, false, false, true, true],
]);

test(
    'keepsAnythingFromOutputData() returns true when one of keep() or keepAs() was called',
    function (bool $callKeep, bool $callKeepAs, bool $callKeepFromInput, bool $callKeepInputAs, bool $expected) {
        $step = helper_getInputReturningStep();

        // Each flag triggers exactly one of the four keep methods on the step.
        if ($callKeep) {
            $step->keep();
        }

        if ($callKeepAs) {
            $step->keepAs('foo');
        }

        if ($callKeepFromInput) {
            $step->keepFromInput();
        }

        if ($callKeepInputAs) {
            $step->keepInputAs('bar');
        }

        $keepsFromOutput = $step->keepsAnythingFromOutputData();

        expect($keepsFromOutput)->toBe($expected);
    },
)->with([
    // Columns: call keep(), call keepAs(), call keepFromInput(), call keepInputAs(), expected result.
    [false, false, false, false, false],
    [true, false, false, false, true],
    [false, true, false, false, true],
    [false, false, true, false, false],
    [false, false, false, true, false],
]);

/* ----------------------------- sub crawlers ----------------------------- */

it('logs an error message when a sub crawler is defined and step has no reference to a parent crawler', function () {
    $step = helper_getInputReturningStep()->addLogger(new CliLogger());

    // No setParentCrawler() call here, the step only gets the sub crawler definition.
    $step->subCrawlerFor('bar', fn(Crawler $crawler) => $crawler->addStep(Http::get()));

    helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => ['https://www.example.com']]);

    $loggedOutput = $this->getActualOutputForAssertion();

    expect($loggedOutput)->toContain(
        'Can\'t make sub crawler, because the step has no reference to the parent crawler.',
    );
});

it('logs an error message when a sub crawler is defined and output is scalar value', function () {
    $step = helper_getInputReturningStep()->addLogger(new CliLogger());

    $step->setParentCrawler(HttpCrawler::make()->withUserAgent('Test'));

    $step->subCrawlerFor('bar', fn(Crawler $crawler) => $crawler->addStep(Http::get()));

    helper_invokeStepWithInput($step, 'foo');

    expect($this->getActualOutputForAssertion())
        ->toContain('The sub 
crawler feature works only with outputs that are associative arrays'); }); it('runs a sub crawler for a certain output property', function () { $step = helper_getInputReturningStep()->addLogger(new CliLogger()); $step->setParentCrawler(HttpCrawler::make()->withUserAgent('Test')); $step->subCrawlerFor('bar', function (Crawler $crawler) { return $crawler->addStep(Html::root()->extract(['title' => 'h1'])); }); $results = helper_invokeStepWithInput($step, [ 'foo' => 'hey', 'bar' => '

    Hello World!

    ', ]); expect($results)->toHaveCount(1) ->and($results[0]->get())->toBe(['foo' => 'hey', 'bar' => ['title' => 'Hello World!']]); }); test('when a sub crawler returns multiple results, they are an array in the parent output', function () { $step = helper_getInputReturningStep()->addLogger(new CliLogger()); $step->setParentCrawler(HttpCrawler::make()->withUserAgent('Test')); $step->subCrawlerFor('bar', function (Crawler $crawler) { return $crawler->addStep(Html::each('.item')->extract(['title' => 'h3'])); }); $html = <<

    one

    two

    three

    HTML; $results = helper_invokeStepWithInput($step, ['foo' => 'hey', 'bar' => $html, 'baz' => 'yo']); expect($results)->toHaveCount(1) ->and($results[0]->get()) ->toBe([ 'foo' => 'hey', 'bar' => [ ['title' => 'one'], ['title' => 'two'], ['title' => 'three'], ], 'baz' => 'yo', ]); }); it('runs a sub crawler with multiple inputs, when defined property is array', function () { $step = helper_getInputReturningStep()->addLogger(new CliLogger()); $step->setParentCrawler(HttpCrawler::make()->withUserAgent('Test')); $step->subCrawlerFor('bar', function (Crawler $crawler) { return $crawler->addStep(Html::root()->extract(['title' => 'h1'])); }); $results = helper_invokeStepWithInput($step, [ 'foo' => 'hey', 'bar' => [ '

    No. 1

    ', '

    No. 2

    ', '

    No. 3

    ', ], 'baz' => 'yo', ]); expect($results)->toHaveCount(1) ->and($results[0]->get()) ->toBe([ 'foo' => 'hey', 'bar' => [ ['title' => 'No. 1'], ['title' => 'No. 2'], ['title' => 'No. 3'], ], 'baz' => 'yo', ]); }); it('does not run a sub crawler, when output does not contain the defined key', function () { $step = helper_getInputReturningStep()->addLogger(new CliLogger()); $step->setParentCrawler(HttpCrawler::make()->withUserAgent('Test')); $step->subCrawlerFor('bar', function (Crawler $crawler) { return $crawler->addStep(Html::root()->extract(['title' => 'h1'])); }); $results = helper_invokeStepWithInput($step, ['foo' => 'hey', 'baz' => 'ho']); expect($results)->toHaveCount(1) ->and($results[0]->get())->toBe(['foo' => 'hey', 'baz' => 'ho']); }); ================================================ FILE: tests/Steps/CsvTest.php ================================================ toHaveCount(4) ->and($outputs[0]->get())->toBe(['id' => '123', 'domain' => 'crwl.io', 'url' => 'https://www.crwl.io']) ->and($outputs[1]->get())->toBe(['id' => '234', 'domain' => 'example.com', 'url' => 'https://www.example.com']) ->and($outputs[2]->get())->toBe(['id' => '345', 'domain' => 'otsch.codes', 'url' => 'https://www.otsch.codes']) ->and($outputs[3]->get())->toBe( ['id' => '456', 'domain' => 'crwlr.software', 'url' => 'https://www.crwlr.software'], ); }); it('maps a file', function () { $outputs = helper_invokeStepWithInput(Csv::parseFile(['id', 'name', 'homepage']), helper_csvFilePath('basic.csv')); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['id' => '123', 'name' => 'Otsch', 'homepage' => 'https://www.otsch.codes']) ->and($outputs[1]->get())->toBe(['id' => '234', 'name' => 'John Doe', 'homepage' => 'https://www.john.doe']) ->and($outputs[2]->get())->toBe(['id' => '345', 'name' => 'Jane Doe', 'homepage' => 'https://www.jane.doe']); }); it('works with a RespondedRequest as input', function () { $body = <<toHaveCount(2) ->and($outputs[0]->get())->toBe(['id' => '123', 
'name' => 'John Doe', 'phone' => '+431234567']) ->and($outputs[1]->get())->toBe(['id' => '234', 'name' => 'Jane Doe', 'phone' => '+432345678']); }); it('works with an object having a __toString method', function () { $object = new class { public function __toString(): string { return <<toHaveCount(2) ->and($outputs[0]->get())->toBe(['id' => '123', 'name' => 'Max Mustermann', 'phone' => '+431234567']) ->and($outputs[1]->get())->toBe(['id' => '234', 'name' => 'Julia Musterfrau', 'phone' => '+432345678']); }); it('logs an error message for other inputs', function (string $method, mixed $input) { $logger = new DummyLogger(); $step = ($method === 'string' ? Csv::parseString(['column']) : Csv::parseFile(['column']))->addLogger($logger); helper_traverseIterable($step->invokeStep(new Input($input))); $logMessages = $logger->messages; expect($logMessages)->not->toBeEmpty() ->and($logMessages[0]['message'])->toStartWith( 'The Crwlr\\Crawler\\Steps\\Csv step was called with input that it can not work with: ', ) ->and($logMessages[0]['message'])->toEndWith('. The invalid input is of type ' . gettype($input) . 
'.'); })->with([ ['string', 123], ['string', new stdClass()], ['string', 12.345], ['string', true], ['string', null], ['file', 123], ['file', new stdClass()], ['file', 12.345], ['file', true], ['file', null], ]); it('can map columns using numerical array keys for the columns', function () { $string = << 'domain', 3 => 'description']), $string); expect($outputs)->toHaveCount(2) ->and($outputs[0]->get())->toBe([ 'domain' => 'crwlr.software', 'description' => 'PHP Web Crawling and Scraping Library', ]) ->and($outputs[1]->get())->toBe(['domain' => 'otsch.codes', 'description' => 'I am Otsch, I code']); }); it('can map columns using numerical array keys for the columns when parsing file', function () { $outputs = helper_invokeStepWithInput( Csv::parseFile([1 => 'name', 2 => 'homepage']), helper_csvFilePath('basic.csv'), ); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['name' => 'Otsch', 'homepage' => 'https://www.otsch.codes']) ->and($outputs[1]->get())->toBe(['name' => 'John Doe', 'homepage' => 'https://www.john.doe']) ->and($outputs[2]->get())->toBe(['name' => 'Jane Doe', 'homepage' => 'https://www.jane.doe']); }); it('can map columns using null for columns to skip', function () { $string = <<toHaveCount(3) ->and($outputs[0]->get())->toBe(['make' => 'Ford', 'price' => '3000.00']) ->and($outputs[1]->get())->toBe(['make' => 'Chevy', 'price' => '4900.00']) ->and($outputs[2]->get())->toBe(['make' => 'Chevy', 'price' => '5000.00']); }); it('can map columns using null for columns to skip when parsing file', function () { $outputs = helper_invokeStepWithInput(Csv::parseFile(['id', null, 'homepage']), helper_csvFilePath('basic.csv')); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['id' => '123', 'homepage' => 'https://www.otsch.codes']) ->and($outputs[1]->get())->toBe(['id' => '234', 'homepage' => 'https://www.john.doe']) ->and($outputs[2]->get())->toBe(['id' => '345', 'homepage' => 'https://www.jane.doe']); }); it('uses the values from 
the first line as output keys when no column mapping defined', function () { $string = <<skipFirstLine(), $string); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['id' => '1', 'title' => 'Raspberry Pi Zero 2 W', 'price' => '16.99']); }); it('uses the values from the first line as output keys when no column mapping defined when parsing file', function () { $outputs = helper_invokeStepWithInput( Csv::parseFile()->skipFirstLine(), helper_csvFilePath('with-column-headlines.csv'), ); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe([ 'Stunde' => '1', 'Montag' => 'Mathematik', 'Dienstag' => 'Deutsch', 'Mittwoch' => 'Englisch', 'Donnerstag' => 'Erdkunde', 'Freitag' => 'Politik', ]); }); it('skips the first line when defined via method call to skipFirstLine method', function () { $string = <<skipFirstLine(); $outputs = helper_invokeStepWithInput($step, $string); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['make' => 'Ford', 'price' => '3000.00']); }); it('skips the first line when parsing file when defined via method call to skipFirstLine method', function () { $step = Csv::parseFile([1 => 'fach-erste', 2 => 'fach-zweite']) ->skipFirstLine(); $outputs = helper_invokeStepWithInput($step, helper_csvFilePath('with-column-headlines.csv')); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['fach-erste' => 'Mathematik', 'fach-zweite' => 'Deutsch']) ->and($outputs[1]->get())->toBe(['fach-erste' => 'Sport', 'fach-zweite' => 'Deutsch']) ->and($outputs[2]->get())->toBe(['fach-erste' => 'Sport', 'fach-zweite' => 'Religion (ev., kath.)']); }); it('skips the first line when defined via constructor param', function () { $string = <<toHaveCount(1) ->and($outputs[0]->get())->toBe(['make' => 'Ford', 'price' => '3000.00']); }); it('skips the first line when parsing file when defined via constructor param', function () { $outputs = helper_invokeStepWithInput( Csv::parseFile([1 => 'fach-erste', 3 => 'fach-dritte'], true), 
helper_csvFilePath('with-column-headlines.csv'), ); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['fach-erste' => 'Mathematik', 'fach-dritte' => 'Englisch']) ->and($outputs[1]->get())->toBe(['fach-erste' => 'Sport', 'fach-dritte' => 'Englisch']) ->and($outputs[2]->get())->toBe(['fach-erste' => 'Sport', 'fach-dritte' => 'Kunst']); }); it('uses a different separator when you set one', function () { $string = << 'username', 2 => 'firstname', 3 => 'surname']) ->separator('|'); $outputs = helper_invokeStepWithInput($step, $string); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['username' => 'CoDerOtsch', 'firstname' => 'Christian', 'surname' => 'Olear']) ->and($outputs[1]->get())->toBe(['username' => 'g3n1u5', 'firstname' => 'Albert', 'surname' => 'Einstein']) ->and($outputs[2]->get())->toBe(['username' => 'sWiFtY', 'firstname' => 'Taylor', 'surname' => 'Swift']); }); it('uses a different separator when you set one, when parsing a file', function () { $step = Csv::parseFile([1 => 'username', 4 => 'age']) ->separator('*'); $outputs = helper_invokeStepWithInput($step, helper_csvFilePath('separator.csv')); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['username' => 'CoDerOtsch', 'age' => '35']) ->and($outputs[1]->get())->toBe(['username' => 'g3n1u5', 'age' => '143']) ->and($outputs[2]->get())->toBe(['username' => 'sWiFtY', 'age' => '32']); }); it('throws an InvalidArgumentException when you try to set a multi character separator', function () { Csv::parseString([])->separator('***'); })->throws(InvalidArgumentException::class); it('uses a different enclosure when you set one', function () { $string = << 'meal', 2 => 'price']) ->enclosure('/'); $outputs = helper_invokeStepWithInput($step, $string); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['meal' => 'Fritattensuppe', 'price' => '3.9']) ->and($outputs[1]->get())->toBe(['meal' => 'Wiener Schnitzel vom Schwein', 'price' => '12.7']) 
->and($outputs[2]->get())->toBe(['meal' => 'Semmelknödel mit Schwammerlsauce', 'price' => '9.5']); }); it('uses a different enclosure when you set one, when parsing a file', function () { $step = Csv::parseFile([1 => 'meal', 2 => 'price']) ->enclosure('?'); $outputs = helper_invokeStepWithInput($step, helper_csvFilePath('enclosure.csv')); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['meal' => 'Kräftige Rindsuppe', 'price' => '4.5']) ->and($outputs[1]->get())->toBe(['meal' => 'Crispy Chicken Burger', 'price' => '12']) ->and($outputs[2]->get())->toBe(['meal' => 'Duett von Saibling und Forelle', 'price' => '21']); }); it('uses a different escape character when you set one', function () { $string = << 'escaped']) ->escape('&'); $outputs = helper_invokeStepWithInput($step, $string); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe(['escaped' => 'test &"escape&" test']); }); it('uses a different escape character when you set one, when parsing a file', function () { $step = Csv::parseFile([1 => 'escaped']) ->escape('%'); $outputs = helper_invokeStepWithInput($step, helper_csvFilePath('escape.csv')); expect($outputs)->toHaveCount(2) ->and($outputs[0]->get())->toBe(['escaped' => 'test %"escape%" test']) ->and($outputs[1]->get())->toBe(['escaped' => 'foo %"escape%" bar %"baz%" lorem']); }); it('filters rows', function () { $string = << 'isPremium']) ->skipFirstLine() ->where('isPremium', Filter::equal('1')); $outputs = helper_invokeStepWithInput($step, $string); expect($outputs)->toHaveCount(2) ->and($outputs[0]->get())->toBe(['id' => '123', 'isPremium' => '1']) ->and($outputs[1]->get())->toBe(['id' => '124', 'isPremium' => '1']); }); it('filters rows when parsing a file', function () { $step = Csv::parseFile(['Stunde', 'Fach']) ->skipFirstLine() ->where('Fach', Filter::equal('Sport')); $outputs = helper_invokeStepWithInput($step, helper_csvFilePath('with-column-headlines.csv')); expect($outputs)->toHaveCount(2) 
->and($outputs[0]->get())->toBe(['Stunde' => '2', 'Fach' => 'Sport']) ->and($outputs[1]->get())->toBe(['Stunde' => '3', 'Fach' => 'Sport']); }); it('filters rows by multiple filters', function () { $string = << 'isVip', 4 => 'isQueenBandMember']) ->skipFirstLine() ->where('isVip', Filter::equal('1')) ->where('isQueenBandMember', Filter::equal('1')); $outputs = helper_invokeStepWithInput($step, $string); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe(['id' => '123', 'isVip' => '1', 'isQueenBandMember' => '1']); }); it('filters rows by multiple filters when parsing a file', function () { $step = Csv::parseFile(['Stunde', 'Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag']) ->skipFirstLine() ->where('Montag', Filter::equal('Sport')) ->where('Donnerstag', Filter::equal('Sport')); $outputs = helper_invokeStepWithInput($step, helper_csvFilePath('with-column-headlines.csv')); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe([ 'Stunde' => '2', 'Montag' => 'Sport', 'Dienstag' => 'Deutsch', 'Mittwoch' => 'Englisch', 'Donnerstag' => 'Sport', 'Freitag' => 'Geschichte', ]); }); it('filters rows with a StringCheck filter', function () { $string = <<skipFirstLine() ->where('firstname', Filter::stringContains('Christian')); $outputs = helper_invokeStepWithInput($step, $string); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe(['id' => '123', 'firstname' => 'Christian']) ->and($outputs[1]->get())->toBe(['id' => '124', 'firstname' => 'Christian Anton']) ->and($outputs[2]->get())->toBe(['id' => '125', 'firstname' => 'Another Christian']); }); ================================================ FILE: tests/Steps/Dom/HtmlDocumentTest.php ================================================ foohello'; $document = new HtmlDocument($html); expect($document->getBaseHref())->toBe('/foo/bar'); }); it('gets the href of the first base tag in the document', function () { $html = << foo hey HTML; $document = new HtmlDocument($html); 
expect($document->getBaseHref())->toBe('/foo'); }); test('getBaseHref() returns null if the document does not contain a base tag', function () { $html = 'foohey'; $document = new HtmlDocument($html); expect($document->getBaseHref())->toBeNull(); }); test('the querySelector() method returns an HtmlElement object', function () { $html = 'foo
    hello
    '; $document = new HtmlDocument($html); expect($document->querySelector('.element'))->toBeInstanceOf(HtmlElement::class); }); test('the querySelectorAll() method returns a NodeList of HtmlElement objects', function () { $html = 'foo
    • foo
    • bar
    '; $document = new HtmlDocument($html); $nodeList = $document->querySelectorAll('ul li'); expect($nodeList)->toBeInstanceOf(NodeList::class); $anyNodesChecked = false; foreach ($nodeList as $node) { expect($node)->toBeInstanceOf(HtmlElement::class); $anyNodesChecked = true; } expect($anyNodesChecked)->toBeTrue(); }); test('the queryXPath() method returns a NodeList of HtmlElement objects', function () { $html = 'foo
    • foo
    • bar
    '; $document = new HtmlDocument($html); $nodeList = $document->queryXPath('//ul/li'); expect($nodeList)->toBeInstanceOf(NodeList::class); $anyNodesChecked = false; foreach ($nodeList as $node) { expect($node)->toBeInstanceOf(HtmlElement::class); $anyNodesChecked = true; } expect($anyNodesChecked)->toBeTrue(); }); ================================================ FILE: tests/Steps/Dom/HtmlElementTest.php ================================================
    HTML; $document = new HtmlDocument($html); $wrapperElement = $document->querySelector('#wrapper'); expect($wrapperElement)->toBeInstanceOf(HtmlElement::class) ->and($wrapperElement?->querySelector('.element'))->toBeInstanceOf(HtmlElement::class); }); test('child nodes selected via querySelectorAll() are HtmlElement instances', function () { $html = <<
    foo
    bar
    HTML; $document = new HtmlDocument($html); $wrapperElement = $document->querySelector('#wrapper'); expect($wrapperElement)->toBeInstanceOf(HtmlElement::class); $childNodeList = $wrapperElement?->querySelectorAll('.element'); expect($childNodeList)->toBeInstanceOf(NodeList::class) ->and($childNodeList?->count())->toBe(2) ->and($childNodeList?->first())->toBeInstanceOf(HtmlElement::class) ->and($childNodeList?->last())->toBeInstanceOf(HtmlElement::class); }); test('child nodes selected via queryXPath() are HtmlElement instances', function () { $html = <<
    foo
    bar
    HTML; $document = new HtmlDocument($html); $wrapperElement = $document->queryXPath('//*[@id="wrapper"]')->first(); expect($wrapperElement)->toBeInstanceOf(HtmlElement::class); $childNodeList = $wrapperElement?->queryXPath('//*[contains(@class, "element")]'); expect($childNodeList)->toBeInstanceOf(NodeList::class) ->and($childNodeList?->count())->toBe(2) ->and($childNodeList?->first())->toBeInstanceOf(HtmlElement::class) ->and($childNodeList?->first()?->text())->toBe('foo') ->and($childNodeList?->last())->toBeInstanceOf(HtmlElement::class) ->and($childNodeList?->last()?->text())->toBe('bar'); }); it('gets the node name', function () { $html = <<
    HTML; $document = new HtmlDocument($html); $node = $document->querySelector('.element'); expect($node?->nodeName())->toBe('div') ->and($node?->querySelector('.child')?->nodeName())->toBe('span'); }); it('gets the text of a node', function () { $html = <<
    bli bla blub
    HTML; $document = new HtmlDocument($html); $node = $document->querySelector('.element'); expect($node?->text())->toBe('bli bla blub'); }); it('gets the outer HTML of a node', function () { $html = <<
    bli bla blub
    HTML; $document = new HtmlDocument($html); $node = $document->querySelector('.element'); expect($node?->outerHtml())->toBe( '
    ' . PHP_EOL . ' bli bla blub' . PHP_EOL . '
    ', ); }); it('gets the inner HTML of a node', function () { $html = <<
    bli bla blub
    HTML; $document = new HtmlDocument($html); $node = $document->querySelector('.element'); expect($node?->innerHtml())->toBe( PHP_EOL . ' bli bla blub' . PHP_EOL, ); }); it('gets an attribute from a node', function () { $html = << Link HTML; $document = new HtmlDocument($html); $node = $document->querySelector('.element'); expect($node?->getAttribute('href'))->toBe('/foo/bar'); }); ================================================ FILE: tests/Steps/Dom/NodeListTest.php ================================================
    • foo
    • bar
    • baz
    HTML; $crawler = new Crawler($html); $filtered = $crawler->filter('ul li'); $nodeList = new NodeList( $filtered, function (object $node): HtmlElement { /** @var \Dom\Node|DOMNode|Crawler $node */ return new HtmlElement($node); }, ); expect($nodeList->count())->toBe(3) ->and($nodeList->first()?->text())->toBe('foo') ->and($nodeList->nth(2)?->text())->toBe('bar') ->and($nodeList->last()?->text())->toBe('baz') ->and($nodeList->each(fn($node) => $node->text()))->toBe(['foo', 'bar', 'baz']); }); it('can be constructed from a \Dom\NodeList instance', function () { $html = <<
    • foo
    • bar
    • baz
    HTML; $document = \Dom\HTMLDocument::createFromString($html, LIBXML_NOERROR); $nodeList = new NodeList( $document->querySelectorAll('ul li'), function (object $node): HtmlElement { /** @var \Dom\Node|DOMNode|Crawler $node */ return new HtmlElement($node); }, ); expect($nodeList->count())->toBe(3) ->and($nodeList->first()?->text())->toBe('foo') ->and($nodeList->nth(2)?->text())->toBe('bar') ->and($nodeList->last()?->text())->toBe('baz') ->and($nodeList->each(fn($node) => $node->text()))->toBe(['foo', 'bar', 'baz']); })->group('php84'); it('can be instantiated from an array of Nodes (object instances from this library)', function () { $html = <<
    foo
    bar
    baz
    HTML; $document = new HtmlDocument($html); $array = []; foreach ($document->querySelectorAll('.list .element') as $node) { $array[] = $node; } $newNodeList = new NodeList($array); expect($newNodeList->count())->toBe(3) ->and($newNodeList->first()?->text())->toBe('foo') ->and($newNodeList->last()?->text())->toBe('baz') ->and($newNodeList->nth(2)?->text())->toBe('bar'); }); it('gets the count of the node list', function () { $html = << Foo
    • foo
    • bar
    • baz
    HTML; $document = new HtmlDocument($html); expect($document->querySelectorAll('ul li')->count())->toBe(3); }); it('can be iterated and the elements are instances of Crwlr\Crawler\Steps\Dom\Node', function () { $html = << Foo
    • foo
    • bar
    • baz
    HTML; $document = new HtmlDocument($html); $iteratesAnyNodes = false; foreach ($document->querySelectorAll('ul li') as $node) { expect($node)->toBeInstanceOf(Node::class); $iteratesAnyNodes = true; } expect($iteratesAnyNodes)->toBeTrue(); }); it( 'can be iterated with the each() method and return values are returned as an array from the each() call', function () { $html = <<
    foo
    bar
    baz
    quz
    HTML; $document = new HtmlDocument($html); $result = $document->querySelectorAll('.list .element')->each(function ($node) { return $node->text() . ' check'; }); expect($result)->toBe([ 'foo check', 'bar check', 'baz check', 'quz check', ]); }, ); test('an empty NodeList can be iterated', function () { $html = << Foo
    • foo
    • bar
    • baz
    HTML; $document = new HtmlDocument($html); $iteratesAnyNodes = false; foreach ($document->querySelectorAll('ul lulu') as $node) { $iteratesAnyNodes = true; } expect($iteratesAnyNodes)->toBeFalse(); }); it('returns the first, last and nth element of the NodeList', function () { $html = <<
    foo
    bar
    baz
    quz
    HTML; $document = new HtmlDocument($html); $list = $document->querySelectorAll('.list .element'); expect($list->first())->toBeInstanceOf(HtmlElement::class) ->and($list->first()?->text())->toBe('foo') ->and($list->nth(2))->toBeInstanceOf(HtmlElement::class) ->and($list->nth(2)?->text())->toBe('bar') ->and($list->nth(3))->toBeInstanceOf(HtmlElement::class) ->and($list->nth(3)?->text())->toBe('baz') ->and($list->last())->toBeInstanceOf(HtmlElement::class) ->and($list->last()?->text())->toBe('quz'); }); ================================================ FILE: tests/Steps/Dom/NodeTest.php ================================================ filter($selectNode)->first(); } /** * @throws Exception */ function helper_getLegacyDomNodeInstanceFromSource(string $source, string $selectNode = 'body'): DOMNode { $node = (new Crawler($source))->filter($selectNode)->first()->getNode(0); if (!$node) { throw new Exception('Can\'t get legacy node'); } return $node; } function helper_getPhp84HtmlDomNodeInstanceFromSource(string $source, string $selectNode = 'body'): Element { $node = HTMLDocument::createFromString($source, HTML_NO_DEFAULT_NS | LIBXML_NOERROR)->querySelector($selectNode); /** @var Element $node */ return $node; } function helper_getPhp84XmlDomNodeInstanceFromSource(string $source, string $selectNode = 'body'): Element { $node = XMLDocument::createFromString($source, LIBXML_NOERROR)->querySelector($selectNode); /** @var Element $node */ return $node; } /** * @param \Dom\Node|Element|DOMNode|Crawler $originalNode */ function helper_getAbstractNodeInstance(object $originalNode, bool $html = true): HtmlNodeStub|XmlNodeStub { if ($html) { return new HtmlNodeStub($originalNode); } return new XmlNodeStub($originalNode); } /* ----------------------------- Instantiation ----------------------------- */ it('can be created from a \DOM\Node instance', function () { $xml = << 1Foo XML; $domNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'); 
expect($domNode)->toBeInstanceOf(\Dom\Node::class); $node = new class ($domNode) extends Node { protected function makeChildNodeInstance(object $node): Node { return new XmlElement($node); } }; expect($node)->toBeInstanceOf(Node::class) ->and($node->text())->toBe('1Foo'); })->group('php84'); it('can be instantiated from a symfony Crawler instance', function () { $xml = << 1Foo XML; $crawler = helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item'); expect($crawler)->toBeInstanceOf(Crawler::class); $node = new class ($crawler) extends Node { protected function makeChildNodeInstance(object $node): Node { return new XmlElement($node); } }; expect($node)->toBeInstanceOf(Node::class) ->and($node->text())->toBe('1Foo'); }); it('can be instantiated from a DOMNode instance', function () { $xml = << 1Foo XML; $domNode = helper_getLegacyDomNodeInstanceFromSource($xml, 'items item'); expect($domNode)->toBeInstanceOf(DOMNode::class); $node = new class ($domNode) extends Node { protected function makeChildNodeInstance(object $node): Node { return new XmlElement($node); } }; expect($node)->toBeInstanceOf(Node::class) ->and($node->text())->toBe('1Foo'); }); /* ----------------------------- querySelector(All)() ----------------------------- */ $html = << Foo

    Title

    HTML; it('selects an element within a node via querySelector()', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); $selectedNode = $node->querySelector('.foo h1'); expect($selectedNode)->toBeInstanceOf(Node::class) ->and($selectedNode?->text())->toBe('Title'); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($html)], [helper_getLegacyDomNodeInstanceFromSource($html)], ]); it('selects an element within a node via querySelector() in PHP >= 8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); $node = helper_getAbstractNodeInstance($originalNode); $selectedNode = $node->querySelector('.foo h1'); expect($selectedNode)->toBeInstanceOf(Node::class) ->and($selectedNode?->text())->toBe('Title'); })->group('php84'); $html = << Bar

    Foo

    Bar

    HTML; test( 'querySelector() selects the first element within a node, when multiple nodes match a selector', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); $selectedNode = $node->querySelector('.foo h2'); expect($selectedNode)->toBeInstanceOf(Node::class) ->and($selectedNode?->text())->toBe('Foo'); }, )->with([ [helper_getSymfonyCrawlerInstanceFromSource($html)], [helper_getLegacyDomNodeInstanceFromSource($html)], ]); it( 'selects the first element within a node using querySelector(), when multiple nodes match a selector in PHP >= 8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); $node = helper_getAbstractNodeInstance($originalNode); $selectedNode = $node->querySelector('.foo h2'); expect($selectedNode)->toBeInstanceOf(Node::class) ->and($selectedNode?->text())->toBe('Foo'); }, )->group('php84'); $html = << Foo yo HTML; it('returns null when the selector passed to querySelector() matches nothing', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); $selectedNode = $node->querySelector('.foo h2'); expect($selectedNode)->toBeNull(); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($html)], [helper_getLegacyDomNodeInstanceFromSource($html)], ]); it('returns null when the selector passed to querySelector() matches nothing in PHP >= 8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); $node = helper_getAbstractNodeInstance($originalNode); $selectedNode = $node->querySelector('.foo h2'); expect($selectedNode)->toBeNull(); })->group('php84'); $xml = << 1Foo 2Bar 3Baz XML; it('selects all elements within a node, matching a selector using querySelectorAll()', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); $selected = 
$node->querySelectorAll('items item title'); expect($selected)->toBeInstanceOf(NodeList::class) ->and($selected->count())->toBe(3) ->and($selected->first()?->text())->toBe('Foo') ->and($selected->nth(2)?->text())->toBe('Bar') ->and($selected->last()?->text())->toBe('Baz'); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($xml, 'feed')], [helper_getLegacyDomNodeInstanceFromSource($xml, 'feed')], ]); it( 'selects all elements within a node, matching a selector using querySelectorAll() in PHP >= 8.4', function () use ($xml) { $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'feed'); $node = helper_getAbstractNodeInstance($originalNode); $selected = $node->querySelectorAll('items item title'); expect($selected)->toBeInstanceOf(NodeList::class) ->and($selected->count())->toBe(3) ->and($selected->first()?->text())->toBe('Foo') ->and($selected->nth(2)?->text())->toBe('Bar') ->and($selected->last()?->text())->toBe('Baz'); }, )->group('php84'); $xml = << 123 XML; it( 'gets an empty NodeList when nothing matches the selector passed to querySelectorAll()', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); $selected = $node->querySelectorAll('items item author'); expect($selected)->toBeInstanceOf(NodeList::class) ->and($selected->count())->toBe(0); }, )->with([ [helper_getSymfonyCrawlerInstanceFromSource($xml, 'feed')], [helper_getLegacyDomNodeInstanceFromSource($xml, 'feed')], ]); it( 'gets an empty NodeList when nothing matches the selector passed to querySelectorAll() in PHP >= 8.4', function () use ($xml) { $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'feed'); $node = helper_getAbstractNodeInstance($originalNode); $selected = $node->querySelectorAll('items item author'); expect($selected)->toBeInstanceOf(NodeList::class) ->and($selected->count())->toBe(0); }, )->group('php84'); $html = << Lorem Ipsum
    • hip
    • hop
    • hooray
    HTML;

/* ----------------------------- queryXPath() ----------------------------- */

// Each behaviour below is covered twice: once via a dataset that wraps the shared $html fixture with the
// Symfony Crawler / legacy DOMNode helpers, and once in the `php84` group using the PHP 8.4 DOM helper.

it(
    'selects all elements within a node, matching an XPath query using queryXPath()',
    function (object $originalNode) {
        /** @var Crawler|DOMNode $originalNode */
        $node = helper_getAbstractNodeInstance($originalNode);

        $selected = $node->queryXPath('//ul/li');

        expect($selected)->toBeInstanceOf(NodeList::class)
            ->and($selected->count())->toBe(3)
            ->and($selected->first()?->text())->toBe('hip')
            ->and($selected->nth(2)?->text())->toBe('hop')
            ->and($selected->last()?->text())->toBe('hooray');
    },
)->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html)],
    [helper_getLegacyDomNodeInstanceFromSource($html)],
]);

it(
    'selects all elements within a node, matching an XPath query using queryXPath() in PHP >= 8.4',
    function () use ($html) {
        $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html);

        $node = helper_getAbstractNodeInstance($originalNode);

        $selected = $node->queryXPath('//ul/li');

        expect($selected)->toBeInstanceOf(NodeList::class)
            ->and($selected->count())->toBe(3)
            ->and($selected->first()?->text())->toBe('hip')
            ->and($selected->nth(2)?->text())->toBe('hop')
            ->and($selected->last()?->text())->toBe('hooray');
    },
)->group('php84');

it(
    'gets an empty NodeList when nothing matches the selector passed to queryXPath()',
    function (object $originalNode) {
        /** @var Crawler|DOMNode $originalNode */
        $node = helper_getAbstractNodeInstance($originalNode);

        // No <strong> exists below the list items, so the query must yield an empty (but non-null) NodeList.
        $selected = $node->queryXPath('//ul/li/strong');

        expect($selected)->toBeInstanceOf(NodeList::class)
            ->and($selected->count())->toBe(0);
    },
)->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html)],
    [helper_getLegacyDomNodeInstanceFromSource($html)],
]);

// Description fixed: it previously read "PHP => 8.4", unlike every other test in the `php84` group.
it(
    'gets an empty NodeList when nothing matches the selector passed to queryXPath() in PHP >= 8.4',
    function () use ($html) {
        $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html);

        $node = helper_getAbstractNodeInstance($originalNode);

        $selected = $node->queryXPath('//ul/li/strong');

        expect($selected)->toBeInstanceOf(NodeList::class)
            ->and($selected->count())->toBe(0);
    },
)->group('php84');

/* ----------------------------- removeNodesMatchingSelector() ----------------------------- */

$html = <<
    • foo
    • bar
    • baz
    • quz
    • lorem
    HTML; it('removes all nodes that match a given CSS selector', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); $node->removeNodesMatchingSelector('#list .remove'); $sourceAfterRemoval = $node->outer(); expect($sourceAfterRemoval)->toContain('
  • bar
  • ') ->toContain('
  • baz
  • ') ->not()->toContain('
  • ') ->not()->toContain('foo') ->not()->toContain('quz'); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($html)], [helper_getLegacyDomNodeInstanceFromSource($html)], ]); it('removes all nodes that match a given CSS selector in PHP >= 8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); $node = helper_getAbstractNodeInstance($originalNode); $node->removeNodesMatchingSelector('#list .remove'); $sourceAfterRemoval = $node->outer(); expect($sourceAfterRemoval)->toContain('
  • bar
  • ') ->toContain('
  • baz
  • ') ->not()->toContain('
  • ') ->not()->toContain('foo') ->not()->toContain('quz'); })->group('php84'); $xml = << 1 foo lorem 2 bar ipsum 3 baz dolor XML; it('removes all nodes that match a given CSS selector from XML', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode, false); $node->removeNodesMatchingSelector('feed items item title'); $sourceAfterRemoval = $node->outer(); expect($sourceAfterRemoval)->toContain('') ->toContain('') ->not()->toContain(''); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($xml, 'feed')], ]); it('removes all nodes that match a given CSS selector from XML in PHP >= 8.4', function () use ($xml) { $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'feed'); $node = helper_getAbstractNodeInstance($originalNode); $node->removeNodesMatchingSelector('feed items item title'); $sourceAfterRemoval = $node->outer(); expect($sourceAfterRemoval)->toContain('<id>') ->toContain('<description>') ->not()->toContain('<title>'); })->group('php84'); /* ----------------------------- removeNodesMatchingXPath() ----------------------------- */ $html = <<<HTML <!doctype html> <html> <head></head> <body> <ul id="list"> <li class="remove">foo</li> <li>bar</li> <li>baz</li> <li class="remove">quz</li> <li>lorem</li> </ul> </body> </html> HTML; it('removes all nodes that match a given XPath query', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); $node->removeNodesMatchingXPath('//li[contains(@class, \'remove\')]'); $sourceAfterRemoval = $node->outer(); expect($sourceAfterRemoval)->toContain('<li>bar</li>') ->toContain('<li>baz</li>') ->not()->toContain('<li class="remove">') ->not()->toContain('foo') ->not()->toContain('quz'); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($html)], [helper_getLegacyDomNodeInstanceFromSource($html)], ]); it('removes all nodes that match a given XPath query in PHP >= 
8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); $node = helper_getAbstractNodeInstance($originalNode); $node->removeNodesMatchingXPath('//li[contains(@class, \'remove\')]'); $sourceAfterRemoval = $node->outer(); expect($sourceAfterRemoval)->toContain('<li>bar</li>') ->toContain('<li>baz</li>') ->not()->toContain('<li class="remove">') ->not()->toContain('foo') ->not()->toContain('quz'); })->group('php84'); $xml = <<<XML <?xml version="1.0" encoding="utf-8"?> <feed> <items> <item> <id>1</id> <title>foo lorem 2 bar ipsum 3 baz dolor XML; it('removes all nodes that match a given XPath query from XML', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); $node->removeNodesMatchingXPath('//feed/items/item/title'); $sourceAfterRemoval = $node->outer(); expect($sourceAfterRemoval)->toContain('') ->toContain('') ->not()->toContain(''); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($xml, 'feed')], ]); it('removes all nodes that match a given XPath query from XML in PHP >= 8.4', function () use ($xml) { $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'feed'); $node = helper_getAbstractNodeInstance($originalNode); $node->removeNodesMatchingXPath('//feed/items/item/title'); $sourceAfterRemoval = $node->outer(); expect($sourceAfterRemoval)->toContain('<id>') ->toContain('<description>') ->not()->toContain('<title>'); })->group('php84'); /* ----------------------------- getAttribute() ----------------------------- */ $html = <<<HTML <!doctype html> <html> <head><title>Foo
    HTML; it('gets the value of an attribute', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); expect($node->getAttribute('data-test'))->toBe('hi'); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($html, '.element')], [helper_getLegacyDomNodeInstanceFromSource($html, '.element')], ]); it('gets the value of an attribute in PHP >= 8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, '.element'); $node = helper_getAbstractNodeInstance($originalNode); expect($node->getAttribute('data-test'))->toBe('hi'); })->group('php84'); $html = << Foo
    HTML; it('returns null when an attribute does not exist', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); expect($node->getAttribute('data-test'))->toBeNull(); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($html, '.element')], [helper_getLegacyDomNodeInstanceFromSource($html, '.element')], ]); it('returns null when an attribute does not exist in PHP >= 8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, '.element'); $node = helper_getAbstractNodeInstance($originalNode); expect($node->getAttribute('data-test'))->toBeNull(); })->group('php84'); /* ----------------------------- nodeName() ----------------------------- */ it('gets the name of a node', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); expect($node->nodeName())->toBe('div'); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($html, '.element')], [helper_getLegacyDomNodeInstanceFromSource($html, '.element')], ]); it('gets the name of a node in PHP >= 8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, '.element'); $node = helper_getAbstractNodeInstance($originalNode); expect($node->nodeName())->toBe('div'); })->group('php84'); $html = << Bar

    Title

    Lorem ipsum.

    HTML; /* ----------------------------- text() ----------------------------- */ it('gets the text content of an HTML node', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); expect($node->text())->toBe('Title Lorem ipsum.'); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($html, 'article')], [helper_getLegacyDomNodeInstanceFromSource($html, 'article')], ]); it('gets the text content of an HTML node in PHP >= 8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, 'article'); $node = helper_getAbstractNodeInstance($originalNode); expect($node->text())->toBe('Title Lorem ipsum.'); })->group('php84'); /* ----------------------------- innerSource() ----------------------------- */ it('gets the inner source of an HTML node', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); expect($node->inner())->toBe('

    Title

    Lorem ipsum.

    '); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($html, 'article')], [helper_getLegacyDomNodeInstanceFromSource($html, 'article')], ]); it('gets the inner source of an HTML node in PHP >= 8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, 'article'); $node = helper_getAbstractNodeInstance($originalNode); expect($node->inner())->toBe('

    Title

    Lorem ipsum.

    '); })->group('php84'); /* ----------------------------- outerSource() ----------------------------- */ it('gets the outer source of an HTML node', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); expect($node->outer())->toBe('

    Title

    Lorem ipsum.

    '); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($html, 'article')], [helper_getLegacyDomNodeInstanceFromSource($html, 'article')], ]); it('gets the outer source of an HTML node in PHP >= 8.4', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, 'article'); $node = helper_getAbstractNodeInstance($originalNode); expect($node->outer())->toBe('

    Title

    Lorem ipsum.

    '); })->group('php84'); $xml = << 1 Lorem Ipsum XML; /* ----------------------------- text() ----------------------------- */ it('gets the text content of an XML node', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); expect($node->text())->toBe('1 Lorem Ipsum'); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item')], [helper_getLegacyDomNodeInstanceFromSource($xml, 'items item')], ]); it('gets the text content of an XML node in PHP >= 8.4', function () use ($xml) { $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'); $node = helper_getAbstractNodeInstance($originalNode); expect($node->text())->toBe('1 Lorem Ipsum'); })->group('php84'); /* ----------------------------- innerSource() XML ----------------------------- */ it('gets the inner source of an XML node', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); expect($node->inner())->toBe(' 1 Lorem Ipsum '); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item')], [helper_getLegacyDomNodeInstanceFromSource($xml, 'items item')], ]); it('gets the inner source of an XML node in PHP >= 8.4', function () use ($xml) { $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'); $node = helper_getAbstractNodeInstance($originalNode); expect($node->inner())->toBe(' 1 Lorem Ipsum '); })->group('php84'); /* ----------------------------- outerSource() XML ----------------------------- */ it('gets the outer source of an XML node', function (object $originalNode) { /** @var Crawler|DOMNode $originalNode */ $node = helper_getAbstractNodeInstance($originalNode); expect($node->outer())->toBe(' 1 Lorem Ipsum '); })->with([ [helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item')], [helper_getLegacyDomNodeInstanceFromSource($xml, 'items item')], ]); it('gets the outer source 
of an XML node in PHP >= 8.4', function () use ($xml) { $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'); $node = helper_getAbstractNodeInstance($originalNode); expect($node->outer())->toBe(' 1 Lorem Ipsum '); })->group('php84'); $html = << Bar
    • one
    • foo
    HTML; /* ------------ :has() :not() CSS pseudo class selectors in PHP 8.4 ------------- */ it('selects elements using a CSS selector containing the :has() pseudo class', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); $node = helper_getAbstractNodeInstance($originalNode); $selected = $node->querySelector('ul:has(.foo)'); expect($selected)->toBeInstanceOf(HtmlElement::class) ->and($selected?->text())->toBe('one'); })->group('php84'); it('selects elements using a CSS selector containing the :not() pseudo class', function () use ($html) { $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); $node = helper_getAbstractNodeInstance($originalNode); $selected = $node->querySelector('ul:not(:has(.foo))'); expect($selected)->toBeInstanceOf(HtmlElement::class) ->and($selected?->text())->toBe('foo'); })->group('php84'); ================================================ FILE: tests/Steps/Dom/XmlDocumentTest.php ================================================ 1 XML; $document = new XmlDocument($xml); expect($document->querySelector('feed items item'))->toBeInstanceOf(XmlElement::class); }); test('the querySelectorAll() method returns a NodeList of XmlElement objects', function () { $xml = << 123 XML; $document = new XmlDocument($xml); $nodeList = $document->querySelectorAll('feed items item'); expect($nodeList)->toBeInstanceOf(NodeList::class); $anyNodesChecked = false; foreach ($nodeList as $node) { expect($node)->toBeInstanceOf(XmlElement::class); $anyNodesChecked = true; } expect($anyNodesChecked)->toBeTrue(); }); test('the queryXPath() method returns a NodeList of XmlElement objects', function () { $xml = << 123 XML; $document = new XmlDocument($xml); $nodeList = $document->queryXPath('//feed/items/item'); expect($nodeList)->toBeInstanceOf(NodeList::class); $anyNodesChecked = false; foreach ($nodeList as $node) { expect($node)->toBeInstanceOf(XmlElement::class); $anyNodesChecked = true; } 
expect($anyNodesChecked)->toBeTrue(); }); it('is able to parse documents containing characters that aren\'t valid within XML documents', function (string $char) { $xml = << <![CDATA[foo - {$char} - bar]]> XML; $document = new XmlDocument($xml); $titles = $document->querySelectorAll('channel item title'); expect($titles)->toBeInstanceOf(NodeList::class) ->and($titles->count())->toBe(1) ->and($titles->first()?->text())->toStartWith('foo - ') ->and($titles->first()?->text())->toEndWith(' - bar'); })->with([ [mb_chr(0)], [mb_chr(6)], [mb_chr(12)], [mb_chr(20)], [mb_chr(31)], [mb_chr(128)], [mb_chr(157)], [mb_chr(195)], [mb_chr(253)], ]); ================================================ FILE: tests/Steps/Dom/XmlElementTest.php ================================================ foo foo abc-123 2024-11-07T11:00:31Z Foo bar baz! https://www.example.com/item-1?utm_source=foo&utm_medium=feed-xml test abc-124 2024-12-04T22:43:14Z Lorem Ipsum! https://www.example.com/item-2?utm_source=foo&utm_medium=feed-xml heyho XML;

// Elements reached through an XmlElement must themselves be XmlElement instances.
// The descriptions previously said "HtmlElement", contradicting the assertions below.
test('child nodes selected via querySelector() are XmlElement instances', function () use ($xml) {
    $document = new XmlDocument($xml);

    $wrapperElement = $document->querySelector('feed');

    expect($wrapperElement)->toBeInstanceOf(XmlElement::class)
        ->and($wrapperElement?->querySelector('items item'))->toBeInstanceOf(XmlElement::class);
});

test('child nodes selected via querySelectorAll() are XmlElement instances', function () use ($xml) {
    $document = new XmlDocument($xml);

    $wrapperElement = $document->querySelector('feed');

    expect($wrapperElement)->toBeInstanceOf(XmlElement::class);

    $childNodeList = $wrapperElement?->querySelectorAll('items item');

    // Both the first and the last entry of the resulting list are checked for the wrapper type.
    expect($childNodeList)->toBeInstanceOf(NodeList::class)
        ->and($childNodeList?->count())->toBe(2)
        ->and($childNodeList?->first())->toBeInstanceOf(XmlElement::class)
        ->and($childNodeList?->last())->toBeInstanceOf(XmlElement::class);
});

it('gets the node name', function () use ($xml) { $document = new XmlDocument($xml);
$node = $document->querySelector('feed'); expect($node?->nodeName())->toBe('feed') ->and($node?->querySelector('items item')?->nodeName())->toBe('item'); }); it('gets the text of a node', function () use ($xml) { $document = new XmlDocument($xml); $node = $document->querySelector('feed items item:nth-child(2) foo'); expect($node?->text())->toBe('heyho'); }); it('gets the outer XML of a node', function () use ($xml) { $document = new XmlDocument($xml); $node = $document->querySelector('feed items item foo baRbaz'); expect($node?->outerXml())->toBe('test'); }); it('gets the inner XML of a node', function () use ($xml) { $document = new XmlDocument($xml); $node = $document->querySelector('feed items item foo'); expect($node?->innerXml())->toBe(' test '); }); it('gets an attribute from a node', function () use ($xml) { $document = new XmlDocument($xml); $node = $document->querySelector('feed items item:first-child title'); expect($node?->getAttribute('lang'))->toBe('en'); }); ================================================ FILE: tests/Steps/Dom/_Stubs/HtmlNodeStub.php ================================================ innerSource(); } public function outer(): string { return $this->outerSource(); } protected function makeChildNodeInstance(object $node): Node { return new HtmlElement($node); } } ================================================ FILE: tests/Steps/Dom/_Stubs/XmlNodeStub.php ================================================ innerSource(); } public function outer(): string { return $this->outerSource(); } protected function makeChildNodeInstance(object $node): Node { return new XmlElement($node); } } ================================================ FILE: tests/Steps/DomTest.php ================================================

    Überschrift

    '; $output = helper_invokeStepWithInput(helper_getDomStepInstance()::root(), $html); expect($output[0]->get())->toBe([]); }); test('ResponseInterface is a valid input', function () { $output = helper_invokeStepWithInput(helper_getDomStepInstance()::root(), new Response()); expect($output[0]->get())->toBe([]); }); test('RespondedRequest is a valid input', function () { $output = helper_invokeStepWithInput( helper_getDomStepInstance()::root(), new RespondedRequest(new Request('GET', '/'), new Response()), ); expect($output[0]->get())->toBe([]); }); test('For other inputs an error message is logged', function (mixed $input) { $logger = new DummyLogger(); helper_traverseIterable(helper_getDomStepInstance()::root()->addLogger($logger)->invokeStep(new Input($input))); expect($logger->messages)->not->toBeEmpty() ->and($logger->messages[0]['message'])->toStartWith('A step was called with input that it can not work with: ') ->and($logger->messages[0]['message'])->toEndWith('. The invalid input is of type ' . gettype($input) . 
'.'); })->with([ [123], [123.456], [new stdClass()], ]); it('outputs a single string when argument for extract is a selector string matching only one element', function () { $outputs = helper_invokeStepWithInput( helper_getDomStepInstance()::root()->extract('.list .item:first-child .match'), helper_getStepFilesContent('Html/basic.html'), ); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe('match 2'); }); it('outputs multiple strings when argument for extract is a selector string matching multiple elements', function () { $outputs = helper_invokeStepWithInput( helper_getDomStepInstance()::root()->extract('.match'), helper_getStepFilesContent('Html/basic.html'), ); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe('match 1') ->and($outputs[2]->get())->toBe('match 3'); }); it('also takes a DomQuery instance as argument for extract', function () { $outputs = helper_invokeStepWithInput( helper_getDomStepInstance()::root()->extract(Dom::cssSelector('.list .item:first-child .match')), helper_getStepFilesContent('Html/basic.html'), ); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe('match 2'); }); test('Extracting with single selector also works with each', function () { $outputs = helper_invokeStepWithInput( helper_getDomStepInstance()::each('.list .item')->extract('.match'), helper_getStepFilesContent('Html/basic.html'), ); expect($outputs)->toHaveCount(2) ->and($outputs[0]->get())->toBe('match 2') ->and($outputs[1]->get())->toBe('match 3'); }); test('Extracting with single selector also works with first', function () { $outputs = helper_invokeStepWithInput( helper_getDomStepInstance()::first('.list .item')->extract('.match'), helper_getStepFilesContent('Html/basic.html'), ); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe('match 2'); }); test('Extracting with single selector also works with last', function () { $outputs = helper_invokeStepWithInput( helper_getDomStepInstance()::last('.list 
.item')->extract('.match'), helper_getStepFilesContent('Html/basic.html'), ); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe('match 3'); }); test('Extracting with single selector that doesn\'t match anything doesn\'t yield any output', function () { $outputs = helper_invokeStepWithInput( helper_getDomStepInstance()::last('.list .item')->extract('.m\ätch'), helper_getStepFilesContent('Html/basic.html'), ); expect($outputs)->toHaveCount(0); }); it('extracts one result from the root node when the root method is used', function () { $output = helper_invokeStepWithInput( helper_getDomStepInstance()::root()->extract(['matches' => '.match']), helper_getStepFilesContent('Html/basic.html'), ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe(['matches' => ['match 1', 'match 2', 'match 3']]); }); it('extracts each matching result when the each method is used', function () { $output = helper_invokeStepWithInput( helper_getDomStepInstance()::each('.list .item')->extract(['match' => '.match']), helper_getStepFilesContent('Html/basic.html'), ); expect($output)->toHaveCount(2) ->and($output[0]->get())->toBe(['match' => 'match 2']) ->and($output[1]->get())->toBe(['match' => 'match 3']); }); it('logs a warning, when the each() method is used with an empty selector', function (string|DomQuery $selector) { $logger = new DummyLogger(); $step = helper_getDomStepInstance()::each($selector)->extract(['match' => '.match']); $step->addLogger($logger); $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html')); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe(['match' => ['match 1', 'match 2', 'match 3']]) ->and($logger->messages[0]['level'])->toBe('warning') ->and($logger->messages[0]['message']) ->toStartWith('The selector you provided for the ‘each’ option is empty.'); })->with([ [''], [Dom::cssSelector('')], [Dom::xPath('')], ]); it('extracts the first matching result when the first method is used', 
function () {
    // With first() only the first element matched by the selector is used as the extraction base.
    $step = helper_getDomStepInstance()::first('.list .item')->extract(['match' => '.match']);

    $results = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['match' => 'match 2']);
});

// An empty first selector still yields one output containing all matches,
// and the first logged message is a warning about the empty selector.
it('logs a warning, when the first() method is used with an empty selector', function (string|DomQuery $selector) {
    $log = new DummyLogger();

    $domStep = helper_getDomStepInstance()::first($selector)->extract(['match' => '.match']);

    $domStep->addLogger($log);

    $results = helper_invokeStepWithInput($domStep, helper_getStepFilesContent('Html/basic.html'));

    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['match' => ['match 1', 'match 2', 'match 3']]);

    $firstMessage = $log->messages[0];

    expect($firstMessage['level'])->toBe('warning');
    expect($firstMessage['message'])->toStartWith('The selector you provided for the ‘first’ option is empty.');
})->with([
    [''],
    [Dom::cssSelector('')],
    [Dom::xPath('')],
]);

// With last() only the last element matched by the selector is used as the extraction base.
it('extracts the last matching result when the last method is used', function () {
    $step = helper_getDomStepInstance()::last('.list .item')->extract(['match' => '.match']);

    $results = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['match' => 'match 3']);
});

// An empty last selector still yields one output containing all matches,
// and the first logged message is a warning about the empty selector.
it('logs a warning, when the last() method is used with an empty selector', function (string|DomQuery $selector) {
    $log = new DummyLogger();

    $domStep = helper_getDomStepInstance()::last($selector)->extract(['match' => '.match']);

    $domStep->addLogger($log);

    $results = helper_invokeStepWithInput($domStep, helper_getStepFilesContent('Html/basic.html'));

    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['match' => ['match 1', 'match 2', 'match 3']]);

    $firstMessage = $log->messages[0];

    expect($firstMessage['level'])->toBe('warning');
    expect($firstMessage['message'])->toStartWith('The selector you provided for the ‘last’ option is empty.');
})->with([
    [''],
    [Dom::cssSelector('')],
[Dom::xPath('')],
]);

// An each selector without any match produces no output.
it('doesn\'t yield any output when the each selector doesn\'t match anything', function () {
    $step = helper_getDomStepInstance()::each('.list .ytem')->extract(['match' => '.match']);

    $results = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($results)->toHaveCount(0);
});

// A first selector without any match produces no output.
it('doesn\'t yield any output when the first selector doesn\'t match anything', function () {
    $step = helper_getDomStepInstance()::first('.list .ytem')->extract(['match' => '.match']);

    $results = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($results)->toHaveCount(0);
});

// A last selector without any match produces no output.
it('doesn\'t yield any output when the last selector doesn\'t match anything', function () {
    $step = helper_getDomStepInstance()::last('.list .otem')->extract(['match' => '.match']);

    $results = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($results)->toHaveCount(0);
});

// When the base element is found but one mapping selector matches nothing,
// the key stays in the output with a null value.
it('returns an array with null values when selectors in an extract array mapping don\'t match anything', function () {
    $mapping = ['match' => '.match', 'noMatch' => '.doesntMatch'];

    $step = helper_getDomStepInstance()::last('.list .item')->extract($mapping);

    $results = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($results)->toHaveCount(1);
    expect($results[0]->get())->toBe(['match' => 'match 3', 'noMatch' => null]);
});

test('The static cssSelector method returns an instance of CssSelector using the provided selector', function () {
    $selector = Dom::cssSelector('.item');

    expect($selector)->toBeInstanceOf(CssSelector::class);

    // NOTE(review): the document fixture below contains no markup, it looks truncated - confirm against the
    // repository before relying on this test.
    $extracted = $selector->apply(new Dom\HtmlDocument('yes'));

    expect($extracted)->toBe('yes');
});

test('The static xPath method returns an instance of XPathQuery using the provided query', function () {
    $query = Dom::xPath('//item');

    expect($query)->toBeInstanceOf(XPathQuery::class);

    // NOTE(review): the document fixture below contains no markup, it looks truncated - confirm against the
    // repository before relying on this test.
    $extracted = $query->apply(new Dom\XmlDocument('yes'));

    expect($extracted)->toBe('yes');
});

it('uses the keys of the 
provided mapping as keys in the returned output', function () { $output = helper_invokeStepWithInput( helper_getDomStepInstance()::root()->extract(['foo' => '.foo', 'notBar' => '.bar', '.baz']), '

    foo content

    bar content

    baz content

    ', ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe(['foo' => 'foo content', 'notBar' => 'bar content', 0 => 'baz content']); }); it('trims the extracted data', function () { $output = helper_invokeStepWithInput( helper_getDomStepInstance()::root()->extract(['foo' => '.foo']), "

    \n foo content \n \n

    ", ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe(['foo' => 'foo content']); }); it('automatically passes on the base url to dom query instances when the input is a RespondedRequest', function () { $output = helper_invokeStepWithInput( helper_getDomStepInstance()::root()->extract([ 'one' => Dom::cssSelector('#one')->attribute('href')->toAbsoluteUrl(), 'two' => Dom::cssSelector('#two')->link(), ]), new RespondedRequest( new Request('GET', 'https://www.example.com/home'), new Response(body: '

    foo bar yolo

    '), ), ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe([ 'one' => 'https://www.example.com/foo/bar', 'two' => 'https://www.example.com/yo/lo', ]); }); it('removes the fragment part from URLs when the withoutFragment method is called on a DomQuery instance', function () { $body = << one
    two
    three
    four

    HTML; $output = helper_invokeStepWithInput( helper_getDomStepInstance()::root()->extract([ 'one' => Dom::cssSelector('#one')->link(), 'two' => Dom::xPath('//a[@id=\'two\']')->link(), 'three' => Dom::cssSelector('#three')->link()->withoutFragment(), 'four' => Dom::xPath('//a[@id=\'four\']')->link()->withoutFragment(), ]), new RespondedRequest( new Request('GET', 'https://www.example.com/home'), new Response(body: $body), ), ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe([ 'one' => 'https://www.example.com/foo#foo', 'two' => 'https://www.example.com/bar#bar', 'three' => 'https://www.example.com/baz', 'four' => 'https://www.example.com/quz', ]); }); ================================================ FILE: tests/Steps/Filters/ArrayFilterTest.php ================================================ where(Filter::equal('foo')); expect($filter->evaluate($values))->toBe($evaluationResult); })->with([ [['foo', 'bar', 'baz'], true], [['bar', 'baz', 'quz'], false], ]); it('filters a multi-level array by a key of the array elements (which are also arrays)', function () { $values = [ ['foo' => 'one', 'bar' => 'two'], ['foo' => 'two', 'bar' => 'three'], ['foo' => 'three', 'bar' => 'four'], ]; $filter = Filter::arrayHasElement()->where('foo', Filter::equal('four')); expect($filter->evaluate($values))->toBeFalse(); $filter = Filter::arrayHasElement()->where('foo', Filter::equal('two')); expect($filter->evaluate($values))->toBeTrue(); }); it('applies multiple complex filters on a multi-level array', function () { $values = [ [ 'id' => '123', 'name' => 'abc', 'tags' => [ ['type' => 'companyId', 'value' => '123'], ['type' => 'type', 'value' => 'job-ad'], ['type' => 'companyId', 'value' => '125'], ], ], [ 'id' => '124', 'name' => 'abd', 'tags' => [ ['type' => 'companyId', 'value' => '123'], ['type' => 'type', 'value' => 'blog-post'], ['type' => 'author', 'value' => 'John Doe'], ], ], [ 'id' => '125', 'name' => 'abf', 'tags' => [ ['type' => 'companyId', 'value' => 
'123'], ['type' => 'companyId', 'value' => '124'], ['type' => 'type', 'value' => 'job-ad'], ['type' => 'companyId', 'value' => '125'], ], ], ]; $filter = Filter::arrayHasElement() ->where( 'tags', Filter::arrayHasElement() ->where('type', Filter::equal('companyId')) ->where('value', Filter::equal('123')), ) ->where( 'tags', Filter::arrayHasElement() ->where('type', Filter::equal('companyId')) ->where('value', Filter::equal('124')) ->negate(), ) ->where( 'tags', Filter::arrayHasElement() ->where('type', Filter::equal('type')) ->where('value', Filter::equal('job-ad')), ); expect($filter->evaluate($values))->toBeTrue(); $filter = Filter::arrayHasElement() ->where( 'tags', Filter::arrayHasElement() ->where('type', Filter::equal('companyId')) ->where('value', Filter::equal('123')), ) ->where( 'tags', Filter::arrayHasElement() ->where('type', Filter::equal('companyId')) ->where('value', Filter::equal('125')) ->negate(), ) ->where( 'tags', Filter::arrayHasElement() ->where('type', Filter::equal('type')) ->where('value', Filter::equal('job-ad')), ); expect($filter->evaluate($values))->toBeFalse(); }); ================================================ FILE: tests/Steps/Filters/ClosureFilterTest.php ================================================ evaluate('one'))->toBeTrue(); expect($closure->evaluate('four'))->toBeFalse(); }); it('evaluates with a value from an array by key', function () { $closure = new ClosureFilter(function (mixed $value) { return in_array($value, ['one', 'two', 'three'], true); }); $closure->useKey('bar'); expect($closure->evaluate(['foo' => 'one', 'bar' => 'two']))->toBeTrue(); expect($closure->evaluate(['foo' => 'three', 'bar' => 'four']))->toBeFalse(); }); it('compares a value from an object by key', function () { $closure = new ClosureFilter(function (mixed $value) { return in_array($value, ['one', 'two', 'three'], true); }); $closure->useKey('bar'); expect($closure->evaluate(helper_getStdClassWithData(['foo' => 'one', 'bar' => 
'two'])))->toBeTrue(); expect($closure->evaluate(helper_getStdClassWithData(['foo' => 'three', 'bar' => 'four'])))->toBeFalse(); }); ================================================ FILE: tests/Steps/Filters/ComparisonFilterTest.php ================================================ evaluate(4))->toBeTrue() ->and($comparison->evaluate(2))->toBeFalse(); }); it('compares a value from an array by key', function () { $comparison = new ComparisonFilter(ComparisonFilterRule::NotEqual, 'barValue'); $comparison->useKey('bar'); expect($comparison->evaluate(['foo' => 'fooValue', 'bar' => 'barValue']))->toBeFalse() ->and($comparison->evaluate(['foo' => 'fooValue', 'bar' => 'barzValue']))->toBeTrue(); }); it('compares a value from an object by key', function () { $comparison = new ComparisonFilter(ComparisonFilterRule::NotEqual, 'barValue'); $comparison->useKey('bar'); expect($comparison->evaluate(helper_getStdClassWithData(['foo' => 'fooValue', 'bar' => 'barValue'])))->toBeFalse() ->and($comparison->evaluate(helper_getStdClassWithData(['foo' => 'fooValue', 'bar' => 'barzValue'])))->toBeTrue(); }); ================================================ FILE: tests/Steps/Filters/Enums/ComparisonFilterRuleTest.php ================================================ evaluate($value1, $value2))->toBe($expectedResult); })->with([ [true, 1, 1], [true, 'one', 'one'], [true, 1.12, 1.12], [false, 1, 2], [false, 1, '1'], [false, 'one', 'two'], [false, 1.12, 1.122], ]); it('correctly applies not equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) { $comparisonFilterRule = ComparisonFilterRule::NotEqual; expect($comparisonFilterRule->evaluate($value1, $value2))->toBe($expectedResult); })->with([ [false, 1, 1], [false, 'one', 'one'], [false, 1.12, 1.12], [true, 1, 2], [true, 1, '1'], [true, 'one', 'two'], [true, 1.12, 1.122], ]); it('correctly applies greater than operator', function (bool $expectedResult, mixed $value1, mixed $value2) { $comparisonFilterRule = 
ComparisonFilterRule::GreaterThan;

    expect($comparisonFilterRule->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    [true, 1, 0],
    [true, 12, 3],
    [true, 1.12, 1.11],
    [false, 11, 11],
    [false, 0, 1],
    [false, 3.59, 3.591],
    [true, '123', '122'],
    [true, '123', 122],
    [true, 123, '122'],
    [false, '123', '124'],
    [false, '123', 124],
    [false, 123, '124'],
    [true, '123.45', '123.44'],
    [true, '123.45', 123.44],
    [true, 123.45, '123.44'],
    [false, '123.45', '123.46'],
    [false, '123.45', 123.46],
    [false, 123.45, '123.46'],
]);

// Each dataset row is: expected result, left operand, right operand.
// The rows mix ints, floats and numeric strings; equal operands are expected to evaluate to true here.
it('correctly applies greater than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actualResult = ComparisonFilterRule::GreaterThanOrEqual->evaluate($value1, $value2);

    expect($actualResult)->toBe($expectedResult);
})->with([
    [true, 1, 0],
    [true, 12, 3],
    [true, 1.12, 1.11],
    [true, 11, 11],
    [false, 0, 1],
    [false, 3.59, 3.591],
    [true, '123', '122'],
    [true, '123', 122],
    [true, 123, '123'],
    [false, '123', '124'],
    [false, '123', 124],
    [false, 123, '124'],
    [true, '123.45', '123.44'],
    [true, '123.44', 123.44],
    [true, 123.45, '123.44'],
    [false, '123.45', '123.46'],
    [false, '123.45', 123.46],
    [false, 123.45, '123.46'],
]);

// Each dataset row is: expected result, left operand, right operand.
// Equal operands are expected to evaluate to false for the strict less than rule.
it('correctly applies less than operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $actualResult = ComparisonFilterRule::LessThan->evaluate($value1, $value2);

    expect($actualResult)->toBe($expectedResult);
})->with([
    [true, 0, 1],
    [true, 4, 5],
    [true, 5.79, 5.7901],
    [false, 11, 11],
    [false, 1, 0],
    [false, 9.2901, 9.29],
    [true, '123', '124'],
    [true, '123', 124],
    [true, 123, '124'],
    [false, '123', '122'],
    [false, '123', 122],
    [false, 123, '122'],
    [true, '123.45', '123.46'],
    [true, '123.45', 123.46],
    [true, 123.45, '123.46'],
    [false, '123.45', '123.44'],
    [false, '123.45', 123.44],
    [false, 123.45, '123.44'],
]);

it('correctly applies less than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    $comparisonFilterRule =
ComparisonFilterRule::LessThanOrEqual; expect($comparisonFilterRule->evaluate($value1, $value2))->toBe($expectedResult); })->with([ [true, 0, 1], [true, 4, 5], [true, 5.79, 5.7901], [true, 11, 11], [false, 1, 0], [false, 9.2901, 9.29], [true, '123', '124'], [true, '123', 124], [true, 123, '123'], [false, '123', '122'], [false, '123', 122], [false, 123, '122'], [true, '123.45', '123.46'], [true, '123.45', 123.45], [true, 123.45, '123.46'], [false, '123.45', '123.44'], [false, '123.45', 123.44], [false, 123.45, '123.44'], ]); ================================================ FILE: tests/Steps/Filters/Enums/StringFilterRuleTest.php ================================================ evaluate($haystack, $needle))->toBe($expectedResult); })->with([ [true, 'foobarbaz', 'foo'], [true, 'foo bar baz', 'foo'], [true, 'foo bar baz', 'bar'], [true, 'foo bar baz', 'baz'], [false, 'foo bar baz', 'Foo'], ]); it('checks if a string starts with another string', function ( bool $expectedResult, mixed $haystack, mixed $needle, ) { $stringFilterRule = StringFilterRule::StartsWith; expect($stringFilterRule->evaluate($haystack, $needle))->toBe($expectedResult); })->with([ [true, 'foobarbaz', 'foo'], [true, 'foo bar baz', 'foo'], [true, 'foo bar baz', 'foo bar'], [false, 'foo bar baz', 'bar'], [false, 'foo bar baz', 'baz'], [false, 'foo bar baz', 'Foo'], ]); it('checks if a string ends with another string', function ( bool $expectedResult, mixed $haystack, mixed $needle, ) { $stringFilterRule = StringFilterRule::EndsWith; expect($stringFilterRule->evaluate($haystack, $needle))->toBe($expectedResult); })->with([ [true, 'foobarbaz', 'baz'], [true, 'foo bar baz', 'baz'], [true, 'foo bar baz', 'bar baz'], [false, 'foo bar baz', 'bar'], [false, 'foo bar baz', 'foo'], [false, 'foo bar baz', 'Baz'], ]); ================================================ FILE: tests/Steps/Filters/Enums/StringLengthFilterRuleTest.php ================================================ evaluate($value1, 
$value2))->toBe($expectedResult); })->with([ [true, 'foo', 3], [true, 'lorem', 5], [true, 'foo bar', 7], [false, 'bar', 4], [false, 'baz quz', 6], ]); it('correctly applies not equal rule', function (bool $expectedResult, mixed $value1, mixed $value2) { $comparisonFilterRule = StringLengthFilterRule::NotEqual; expect($comparisonFilterRule->evaluate($value1, $value2))->toBe($expectedResult); })->with([ [true, 'foo', 2], [true, 'foo bar', 8], [false, 'foo', 3], [false, 'lorem ipsum', 11], ]); it('correctly applies greater than rule', function (bool $expectedResult, mixed $value1, mixed $value2) { $comparisonFilterRule = StringLengthFilterRule::GreaterThan; expect($comparisonFilterRule->evaluate($value1, $value2))->toBe($expectedResult); })->with([ [true, 'foo', 2], [true, 'foo bar', 6], [false, 'foo', 3], [false, 'foo bar', 7], ]); it('correctly applies greater than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) { $comparisonFilterRule = StringLengthFilterRule::GreaterThanOrEqual; expect($comparisonFilterRule->evaluate($value1, $value2))->toBe($expectedResult); })->with([ [true, 'foo', 2], [true, 'foo', 3], [true, 'foo bar', 6], [true, 'foo bar', 7], [false, 'foo', 4], [false, 'foo bar', 8], ]); it('correctly applies less than operator', function (bool $expectedResult, mixed $value1, mixed $value2) { $comparisonFilterRule = StringLengthFilterRule::LessThan; expect($comparisonFilterRule->evaluate($value1, $value2))->toBe($expectedResult); })->with([ [true, 'foo', 4], [true, 'foo bar', 8], [false, 'foo', 3], [false, 'foo bar', 7], ]); it('correctly applies less than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) { $comparisonFilterRule = StringLengthFilterRule::LessThanOrEqual; expect($comparisonFilterRule->evaluate($value1, $value2))->toBe($expectedResult); })->with([ [true, 'foo', 4], [true, 'foo', 3], [true, 'foo bar', 8], [true, 'foo bar', 7], [false, 'foo', 2], [false, 'foo bar', 6], ]); 
================================================ FILE: tests/Steps/Filters/Enums/UrlFilterRuleTest.php ================================================ evaluate($haystack, $needle))->toBe($expectedResult); })->with([ [true, 'https://www.example.com', 'https'], [true, 'http://www.example.com', 'http'], [true, 'ftp://user:password@example.com:21/path', 'ftp'], [false, 'https://www.example.com', 'http'], ]); it('checks if a URL has a certain host', function (bool $expectedResult, mixed $haystack, mixed $needle) { $urlFilterRule = UrlFilterRule::Host; expect($urlFilterRule->evaluate($haystack, $needle))->toBe($expectedResult); })->with([ [true, 'https://www.example.com', 'www.example.com'], [true, 'https://jobs.example.com', 'jobs.example.com'], [true, 'https://pew.pew.pew.example.com:8080/pew', 'pew.pew.pew.example.com'], [false, 'https://jobs.example.com', 'www.example.com'], ]); it('checks if a URL has a certain domain', function (bool $expectedResult, mixed $haystack, mixed $needle) { $urlFilterRule = UrlFilterRule::Domain; expect($urlFilterRule->evaluate($haystack, $needle))->toBe($expectedResult); })->with([ [true, 'https://www.example.com', 'example.com'], [true, 'https://jobs.example.com', 'example.com'], [true, 'https://pew.pew.pew.example.com:8080/pew', 'example.com'], [false, 'https://www.example.com', 'yolo.com'], [false, 'https://www.example.com', 'www.example.com'], ]); it('checks if a URL has a certain path', function (bool $expectedResult, mixed $haystack, mixed $needle) { $urlFilterRule = UrlFilterRule::Path; expect($urlFilterRule->evaluate($haystack, $needle))->toBe($expectedResult); })->with([ [true, 'https://www.example.com/foo/bar', '/foo/bar'], [false, 'https://www.example.com/foo/bar/baz', '/foo/bar'], ]); it('checks if a URL path starts with a certain path', function (bool $expectedResult, mixed $haystack, mixed $needle) { $urlFilterRule = UrlFilterRule::PathStartsWith; expect($urlFilterRule->evaluate($haystack, $needle))->toBe($expectedResult); 
})->with([ [true, 'https://www.example.com/foo/bar', '/foo/bar'], [true, 'https://www.example.com/foo/bar', '/foo'], [false, 'https://www.example.com/foo/bar', '/bar'], ]); it('checks if a URL path matches a regex pattern', function (bool $expectedResult, mixed $haystack, mixed $needle) { $urlFilterRule = UrlFilterRule::PathMatches; expect($urlFilterRule->evaluate($haystack, $needle))->toBe($expectedResult); })->with([ [true, 'https://www.example.com/foo/bar', '^/foo/'], [true, 'https://www.example.com/56/something/foo', '^/\d{1,5}/[a-z]{1,20}'], [false, 'https://www.example.com/56/some-thing/foo', '^/\d{1,5}/[a-z]{1,20}/'], ]); ================================================ FILE: tests/Steps/Filters/FilterTest.php ================================================ value = $this->getKey($valueInQuestion); return true; } } it('gets a key from an array', function () { $filter = new TestFilter(); $filter->useKey('foo'); $filter->evaluate(['foo' => 'fooValue', 'bar' => 'barValue']); expect($filter->value)->toBe('fooValue'); }); it('gets a key from an object', function () { $filter = new TestFilter(); $filter->useKey('foo'); $filter->evaluate(helper_getStdClassWithData(['foo' => 'fooValue', 'bar' => 'barValue'])); expect($filter->value)->toBe('fooValue'); }); it('throws an exception when the value in question is not array or object when a key to use was defined', function () { $filter = new TestFilter(); $filter->useKey('foo'); $filter->evaluate('foo'); })->throws(InvalidArgumentException::class); it('throws an exception when the key to use is not contained in an array', function () { $filter = new TestFilter(); $filter->useKey('foo'); $filter->evaluate(['bar' => 'barValue', 'baz' => 'bazValue']); })->throws(Exception::class); it('throws an exception when the key to use is not contained in an object', function () { $filter = new TestFilter(); $filter->useKey('foo'); $filter->evaluate(helper_getStdClassWithData(['bar' => 'barValue', 'baz' => 'bazValue'])); 
})->throws(Exception::class); ================================================ FILE: tests/Steps/Filters/NegatedFilterTest.php ================================================ evaluate('foo'))->toBeTrue(); expect($negatedFilter->evaluate('foo'))->toBeFalse(); expect($filter->evaluate('bar'))->toBeFalse(); expect($negatedFilter->evaluate('bar'))->toBeTrue(); }); ================================================ FILE: tests/Steps/Filters/StringFilterTest.php ================================================ evaluate('foo bar baz'))->toBeTrue(); expect($stringCheck->evaluate('lorem ipsum'))->toBeFalse(); }); it('checks a string from an array using a key', function () { $stringCheck = new StringFilter(StringFilterRule::StartsWith, 'waldo'); $stringCheck->useKey('bar'); expect($stringCheck->evaluate(['foo' => 'something', 'bar' => 'waldo check', 'baz' => 'test']))->toBeTrue(); expect($stringCheck->evaluate(['foo' => 'something', 'bar' => 'check waldo', 'baz' => 'test']))->toBeFalse(); }); it('checks a string from an object using a key', function () { $stringCheck = new StringFilter(StringFilterRule::EndsWith, 'waldo'); $stringCheck->useKey('bar'); $object = helper_getStdClassWithData(['foo' => 'something', 'bar' => 'check waldo', 'baz' => 'test']); expect($stringCheck->evaluate($object))->toBeTrue(); $object = helper_getStdClassWithData(['foo' => 'something', 'bar' => 'waldo check', 'baz' => 'test']); expect($stringCheck->evaluate($object))->toBeFalse(); }); ================================================ FILE: tests/Steps/Filters/StringLengthFilterTest.php ================================================ evaluate('foo'))->toBeFalse(); expect($stringCheck->evaluate('lorem ipsum'))->toBeTrue(); }); it('checks a string from an array using a key', function () { $stringCheck = new StringLengthFilter(StringLengthFilterRule::GreaterThan, 10); $stringCheck->useKey('bar'); expect($stringCheck->evaluate(['foo' => 'one', 'bar' => 'two', 'baz' => 'three']))->toBeFalse(); 
expect($stringCheck->evaluate(['foo' => 'one', 'bar' => 'lorem ipsum', 'baz' => 'three']))->toBeTrue(); }); it('checks a string from an object using a key', function () { $stringCheck = new StringLengthFilter(StringLengthFilterRule::GreaterThan, 10); $stringCheck->useKey('bar'); $object = helper_getStdClassWithData(['foo' => 'one', 'bar' => 'two', 'baz' => 'three']); expect($stringCheck->evaluate($object))->toBeFalse(); $object = helper_getStdClassWithData(['foo' => 'one', 'bar' => 'lorem ipsum', 'baz' => 'three']); expect($stringCheck->evaluate($object))->toBeTrue(); }); ================================================ FILE: tests/Steps/Filters/UrlFilterTest.php ================================================ evaluate('https://www.crwlr.software/packages'))->toBeTrue(); expect($urlFilter->evaluate('https://www.example.com/something'))->toBeFalse(); }); it('evaluates an url from an array using a key', function () { $urlFilter = (new UrlFilter(UrlFilterRule::Scheme, 'https'))->useKey('bar'); expect($urlFilter->evaluate(['foo' => 'yo', 'bar' => 'https://www.example.com']))->toBeTrue(); expect($urlFilter->evaluate(['foo' => 'yo', 'bar' => 'http://www.example.com']))->toBeFalse(); }); it('evaluates a string from an object using a key', function () { $urlFilter = (new UrlFilter(UrlFilterRule::PathStartsWith, '/foo'))->useKey('bar'); expect($urlFilter->evaluate( helper_getStdClassWithData(['foo' => 'yo', 'bar' => 'https://www.example.com/foo/bar/baz']), ))->toBeTrue(); expect($urlFilter->evaluate( helper_getStdClassWithData(['foo' => 'yo', 'bar' => 'https://www.example.com/articles/1']), ))->toBeFalse(); }); it('doesnt throw an exception when value is not a valid url', function () { $urlFilter = new UrlFilter(UrlFilterRule::Host, 'invalid'); expect($urlFilter->evaluate('https*://invalid'))->toBeFalse(); }); ================================================ FILE: tests/Steps/GroupTest.php ================================================ addStep($step); } return $group; } 
/**
 * Registers the given callback via updateInputUsingOutput() on each of the provided steps.
 */
function helper_addUpdateInputUsingOutputCallbackToSteps(Closure $callback, Step ...$steps): void
{
    array_walk(
        $steps,
        static function (Step $currentStep) use ($callback): void {
            $currentStep->updateInputUsingOutput($callback);
        },
    );
}

/**
 * Creates a step with a public $called flag that is set to true once invoke() runs.
 */
function helper_getStepThatRemembersIfItWasCalled(): Step
{
    $rememberingStep = new class extends Step {
        public bool $called = false;

        protected function invoke(mixed $input): Generator
        {
            $this->called = true;

            yield 'test';
        }
    };

    return $rememberingStep;
}

// A logger that is already set on the group is handed to a step when it is added.
test('You can add a step and it passes on the logger', function () {
    $group = new Group();

    $group->addLogger(new CliLogger());

    $mockedStep = Mockery::mock(StepInterface::class);

    $mockedStep->shouldReceive('addLogger')->once();

    $mockedStep->shouldNotReceive('setLoader');

    $group->addStep($mockedStep);
});

// Here the logger arrives after the steps, so each step must receive it exactly once.
it('also passes on a new logger to all steps when the logger is added after the steps', function () {
    $firstStep = Mockery::mock(StepInterface::class);

    $secondStep = Mockery::mock(StepInterface::class);

    $firstStep->shouldReceive('addLogger')->once();

    $secondStep->shouldReceive('addLogger')->once();

    $group = new Group();

    $group->addStep($firstStep);

    $group->addStep($secondStep);

    $group->addLogger(new CliLogger());
});

// Logger and loader are both set on the group before the step is added.
it('also passes on the loader to the step when setLoader method exists in step', function () {
    $group = new Group();

    $group->addLogger(new CliLogger());

    $group->setLoader(new HttpLoader(new BotUserAgent('MyBot')));

    $loadingStep = Mockery::mock(helper_getLoadingStep());

    $loadingStep->shouldReceive('addLogger')->once();

    $loadingStep->shouldReceive('setLoader')->once();

    /** @var Step $loadingStep */
    $group->addStep($loadingStep);
});

// The loader is set last, so both previously added steps must receive it.
it('also passes on a new loader to all steps when it is added after the steps', function () {
    $firstLoadingStep = Mockery::mock(helper_getLoadingStep());

    $secondLoadingStep = Mockery::mock(helper_getLoadingStep());

    $firstLoadingStep->shouldReceive('setLoader')->once();

    $secondLoadingStep->shouldReceive('setLoader')->once();

    $group = new Group();

    /** @var Step $firstLoadingStep */
    $group->addStep($firstLoadingStep);

    /** @var Step $secondLoadingStep */
    $group->addStep($secondLoadingStep);

    $group->setLoader(new HttpLoader(new BotUserAgent('MyBot')));
});

test('The factory method returns a Group object instance', function
() {
    expect(Crawler::group())->toBeInstanceOf(Group::class);
});

// After invoking the group once, every child step must report that it was called.
test('You can add multiple steps and invokeStep calls all of them', function () {
    $rememberingSteps = [
        helper_getStepThatRemembersIfItWasCalled(),
        helper_getStepThatRemembersIfItWasCalled(),
        helper_getStepThatRemembersIfItWasCalled(),
    ];

    $group = new Group();

    foreach ($rememberingSteps as $rememberingStep) {
        $group->addStep($rememberingStep);
    }

    helper_invokeStepWithInput($group);

    foreach ($rememberingSteps as $rememberingStep) {
        expect($rememberingStep->called)->toBeTrue(); // @phpstan-ignore-line
    }
});

// Three scalar outputs end up as one list inside a single Output object.
it('combines the outputs of all it\'s steps into one output containing an array', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep('lorem'))
        ->addStep(helper_getValueReturningStep('ipsum'))
        ->addStep(helper_getValueReturningStep('dolor'));

    $combined = helper_invokeStepWithInput($group, 'gogogo');

    expect($combined)->toHaveCount(1);

    $firstOutput = $combined[0];

    expect($firstOutput)->toBeInstanceOf(Output::class)
        ->and($firstOutput->get())->toBe(['lorem', 'ipsum', 'dolor']);
});

// The keys given via outputKey() become the keys of the combined output array.
test(
    'When defining keys for the steps via $step->outputKey(), the combined output array has those keys',
    function () {
        $group = (new Group())
            ->addStep(helper_getValueReturningStep('ich')->outputKey('foo'))
            ->addStep(helper_getValueReturningStep('bin')->outputKey('bar'))
            ->addStep(helper_getValueReturningStep('ein berliner')->outputKey('baz'));

        $combined = helper_invokeStepWithInput($group, 'https://www.gogo.go');

        expect($combined)->toHaveCount(1)
            ->and($combined[0])->toBeInstanceOf(Output::class)
            ->and($combined[0]->get())->toBe(['foo' => 'ich', 'bar' => 'bin', 'baz' => 'ein berliner']);
    },
);

it('merges array outputs with string keys to one array', function () {
    $step1 = helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']);

    $step2
= helper_getValueReturningStep(['baz' => 'bazValue', 'yo' => 'lo']);

    $group = (new Group())
        ->addStep($step1)
        ->addStep($step2);

    $output = helper_invokeStepWithInput($group);

    expect($output)->toHaveCount(1)
        ->and($output[0]->get())->toBe([
            'foo' => 'fooValue',
            'bar' => 'barValue',
            'baz' => 'bazValue',
            'yo' => 'lo',
        ]);
});

it('doesn\'t invoke twice with duplicate inputs when uniqueInput was called', function () {
    $group = helper_addStepsToGroup(
        new Group(),
        helper_getValueReturningStep('one'),
        helper_getValueReturningStep('two'),
    );

    // Before uniqueInputs() is called, the same input produces an output each time.
    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1);

    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1);

    $group->resetAfterRun();

    $group->uniqueInputs();

    // With uniqueInputs() the second identical input yields no output.
    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1);

    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(0);
});

it(
    'doesn\'t invoke twice with array inputs with duplicate keys when uniqueInput was called with that key',
    function () {
        $group = helper_addStepsToGroup(
            new Group(),
            helper_getValueReturningStep('one'),
            helper_getValueReturningStep('two'),
        );

        $invokeWithCharacter = fn(string $character) => helper_invokeStepWithInput(
            $group,
            ['foo' => 'bar', 'bttfc' => $character],
        );

        // Called without a key: the two inputs differ in 'bttfc' and both produce an output.
        $group->uniqueInputs();

        expect($invokeWithCharacter('marty'))->toHaveCount(1);

        expect($invokeWithCharacter('doc'))->toHaveCount(1);

        $group->resetAfterRun();

        // Called with the key 'foo': both inputs share the same 'foo' value, the second yields nothing.
        $group->uniqueInputs('foo');

        expect($invokeWithCharacter('marty'))->toHaveCount(1);

        expect($invokeWithCharacter('doc'))->toHaveCount(0);
    },
);

it(
    'doesn\'t invoke twice with object inputs with duplicate keys when uniqueInput was called with that key',
    function () {
        $step1 = helper_getValueReturningStep('one');

        $step2 =
helper_getValueReturningStep('two');

        $group = helper_addStepsToGroup(new Group(), $step1, $step2);

        $invokeWithCharacter = fn(string $character) => helper_invokeStepWithInput(
            $group,
            helper_getStdClassWithData(['foo' => 'bar', 'bttfc' => $character]),
        );

        // Called without a key: the two objects differ in 'bttfc' and both produce an output.
        $group->uniqueInputs();

        expect($invokeWithCharacter('marty'))->toHaveCount(1);

        expect($invokeWithCharacter('doc'))->toHaveCount(1);

        $group->resetAfterRun();

        // Called with the key 'foo': both objects share the same 'foo' value, the second yields nothing.
        $group->uniqueInputs('foo');

        expect($invokeWithCharacter('marty'))->toHaveCount(1);

        expect($invokeWithCharacter('doc'))->toHaveCount(0);
    },
);

// The first step returns its input, so repeating the input 'foo' repeats the combined output.
it('returns only unique outputs when uniqueOutput was called', function () {
    $group = helper_addStepsToGroup(
        new Group(),
        helper_getInputReturningStep(),
        helper_getValueReturningStep('test'),
    )->uniqueOutputs();

    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1);

    expect(helper_invokeStepWithInput($group, 'bar'))->toHaveCount(1);

    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(0);
});

it('returns only unique outputs when outputs are arrays and uniqueOutput was called', function () {
    $group = helper_addStepsToGroup(
        new Group(),
        helper_getInputReturningStep(),
        helper_getValueReturningStep(['lorem' => 'ipsum']),
    )->uniqueOutputs();

    expect(helper_invokeStepWithInput($group, ['foo' => 'bar']))->toHaveCount(1);

    expect(helper_invokeStepWithInput($group, ['baz' => 'quz']))->toHaveCount(1);

    expect(helper_invokeStepWithInput($group, ['foo' => 'bar']))->toHaveCount(0);
});

it(
    'returns only unique outputs when outputs are arrays and uniqueOutput was called with a key from the output arrays',
    function () {
$group = helper_addStepsToGroup(
            new Group(),
            helper_getInputReturningStep(),
            helper_getValueReturningStep(['lorem' => 'ipsum']),
        )->uniqueOutputs('foo');

        expect(helper_invokeStepWithInput($group, ['foo' => 'bar']))->toHaveCount(1);

        expect(helper_invokeStepWithInput($group, ['foo' => 'baz']))->toHaveCount(1);

        // Same 'foo' value as in the first call, the additional key does not make it unique.
        expect(helper_invokeStepWithInput($group, ['foo' => 'bar', 'something' => 'else']))->toHaveCount(0);
    },
);

it('returns only unique outputs when outputs are objects and uniqueOutput was called', function () {
    $stepYieldingTen = helper_getStepYieldingObjectWithNumber(10);

    $stepYieldingEleven = helper_getStepYieldingObjectWithNumber(11);

    $group = helper_addStepsToGroup(new Group(), $stepYieldingTen, $stepYieldingEleven);

    expect(helper_invokeStepWithInput($group))->toHaveCount(1);

    $group->uniqueOutputs();

    expect(helper_invokeStepWithInput($group))->toHaveCount(1);

    expect(helper_invokeStepWithInput($group))->toHaveCount(0);

    helper_addUpdateInputUsingOutputCallbackToSteps(
        fn(mixed $input) => $input + 1,
        $stepYieldingTen,
        $stepYieldingEleven,
    );

    expect(helper_invokeStepWithInput($group, new Input(1)))->toHaveCount(1);
});

it(
    'returns only unique outputs when outputs are objects and uniqueOutput was called with a property name from the ' .
'output objects',
    function () {
        $stepYieldingTwentyOne = helper_getStepYieldingObjectWithNumber(21);

        $stepYieldingTwentyThree = helper_getStepYieldingObjectWithNumber(23);

        $group = helper_addStepsToGroup(new Group(), $stepYieldingTwentyOne, $stepYieldingTwentyThree);

        expect(helper_invokeStepWithInput($group))->toHaveCount(1);

        $group->resetAfterRun();

        $group->uniqueOutputs('number');

        expect(helper_invokeStepWithInput($group))->toHaveCount(1);

        expect(helper_invokeStepWithInput($group))->toHaveCount(0);

        $group->resetAfterRun();

        helper_addUpdateInputUsingOutputCallbackToSteps(
            fn(mixed $input) => $input + 1,
            $stepYieldingTwentyOne,
            $stepYieldingTwentyThree,
        );

        expect(helper_invokeStepWithInput($group, new Input(1)))->toHaveCount(1);
    },
);

// The middle step is excluded, so its 'bar' key must be missing from the combined output.
it(
    'excludes the output of a step from the combined group output, when the excludeFromGroupOutput() method was called',
    function () {
        $group = helper_addStepsToGroup(
            new Group(),
            helper_getValueReturningStep(['foo' => 'one']),
            helper_getValueReturningStep(['bar' => 'two'])->excludeFromGroupOutput(),
            helper_getValueReturningStep(['baz' => 'three']),
        );

        $combined = helper_invokeStepWithInput($group);

        expect($combined)->toHaveCount(1);

        expect($combined[0]->get())->toBe(['foo' => 'one', 'baz' => 'three']);
    },
);

test('You can update the input for further steps with the output of a step that is before those steps', function () {
    $step1 = helper_getValueReturningStep(' rocks')
        ->updateInputUsingOutput(function (mixed $input, mixed $output) {
            return $input .
$output['foo'];
        });

    $step2 = helper_getInputReturningStep();

    $group = (new Group())
        ->addStep($step1->outputKey('foo'))
        ->addStep($step2->outputKey('bar'));

    $outputs = helper_invokeStepWithInput($group, 'crwlr.software');

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe(['foo' => ' rocks', 'bar' => 'crwlr.software rocks']);
});

// The input returning child step must only see the value behind the 'bar' key.
it('uses a key from array input when defined', function () {
    $group = (new Group())
        ->addStep(helper_getInputReturningStep()->outputKey('test'))
        ->useInputKey('bar');

    $arrayInput = new Input(['foo' => 'fooValue', 'bar' => 'barValue', 'baz' => 'bazValue']);

    $combined = helper_invokeStepWithInput($group, $arrayInput);

    expect($combined)->toHaveCount(1);

    expect($combined[0]->get())->toBe(['test' => 'barValue']);
});

// keepAs() on the group stores the whole combined output array under the given key.
it('keeps the combined output with a certain key when keepAs() is used', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep('foo')->outputKey('key1'))
        ->addStep(helper_getValueReturningStep('bar')->outputKey('key2'))
        ->keepAs('test');

    $combined = helper_invokeStepWithInput($group);

    expect($combined)->toHaveCount(1);

    expect($combined[0]->keep)->toBe(['test' => ['key1' => 'foo', 'key2' => 'bar']]);
});

it('keeps all keys from a combined array output when keep() was called without argument', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']))
        ->addStep(helper_getValueReturningStep(['baz' => 'bazValue', 'yo' => 'lo']))
        ->keep();

    $combined = helper_invokeStepWithInput($group);

    $expectedKeptData = [
        'foo' => 'fooValue',
        'bar' => 'barValue',
        'baz' => 'bazValue',
        'yo' => 'lo',
    ];

    expect($combined)->toHaveCount(1);

    expect($combined[0]->keep)->toBe($expectedKeptData);
});

it('keeps all defined keys from a combined array output when keep() was called with keys', function () {
    $step1 = helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']);

    $step2 = helper_getValueReturningStep(['baz' => 'bazValue', 'yo' => 'lo']);
$group = (new Group())
        ->addStep($step1)
        ->addStep($step2)
        ->keep(['foo', 'baz', 'yo']);

    $output = helper_invokeStepWithInput($group);

    expect($output)->toHaveCount(1)
        ->and($output[0]->keep)->toBe([
            'foo' => 'fooValue',
            'baz' => 'bazValue',
            'yo' => 'lo',
        ]);
});

// keep() is called on the child steps here, not on the group itself.
it('keeps data, when keep() is called on child steps', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue'])->keep('foo'))
        ->addStep(helper_getValueReturningStep(['baz' => 'bazValue', 'quz' => 'quzValue'])->keep(['baz', 'quz']));

    $combined = helper_invokeStepWithInput($group);

    $expectedKeptData = [
        'foo' => 'fooValue',
        'baz' => 'bazValue',
        'quz' => 'quzValue',
    ];

    expect($combined)->toHaveCount(1);

    expect($combined[0]->keep)->toBe($expectedKeptData);
});

// keepAs() on a child step stores that step's whole output (scalar or array) under the given key.
it('keeps data, when keepAs() is called on child steps', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep('fooValue')->keepAs('foo'))
        ->addStep(helper_getValueReturningStep(['bar' => 'barValue', 'baz' => 'bazValue'])->keepAs('quz'));

    $combined = helper_invokeStepWithInput($group);

    $expectedKeptData = [
        'foo' => 'fooValue',
        'quz' => [
            'bar' => 'barValue',
            'baz' => 'bazValue',
        ],
    ];

    expect($combined)->toHaveCount(1);

    expect($combined[0]->keep)->toBe($expectedKeptData);
});

test(
    'when steps yield multiple outputs it combines the first output from first step with first output from second ' .
'step and so on.',
    function () {
        $firstStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield ['one' => 'foo'];

                yield ['two' => 'bar'];
            }
        };

        $secondStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield ['three' => 'baz'];

                yield ['four' => 'quz'];
            }
        };

        $combined = helper_invokeStepWithInput(
            (new Group())->addStep($firstStep)->addStep($secondStep),
        );

        // Outputs are paired by position: first with first, second with second.
        expect($combined)->toHaveCount(2);

        expect($combined[0]->get())->toBe(['one' => 'foo', 'three' => 'baz']);

        expect($combined[1]->get())->toBe(['two' => 'bar', 'four' => 'quz']);
    },
);

it('ignores the key set via outputKey because group step output is always an array', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['one' => 'foo']))
        ->addStep(helper_getValueReturningStep(['two' => 'bar']))
        ->outputKey('baz');

    $combined = helper_invokeStepWithInput($group);

    expect($combined)->toHaveCount(1);

    // No wrapping 'baz' key in the result.
    expect($combined[0]->get())->toBe(['one' => 'foo', 'two' => 'bar']);
});

it(
    'keeps input data when keepFromInput() was called when outputs are combined',
    function () {
        $group = (new Group())
            ->addStep(helper_getValueReturningStep(['foo' => 'one']))
            ->addStep(helper_getValueReturningStep(['bar' => 'two']))
            ->keepFromInput();

        $combined = helper_invokeStepWithInput($group, new Input(['baz' => 'three']));

        expect($combined)->toHaveCount(1);

        $firstOutput = $combined[0];

        expect($firstOutput->get())->toBe(['foo' => 'one', 'bar' => 'two'])
            ->and($firstOutput->keep)->toBe(['baz' => 'three']);
    },
);

it('keeps non array input data in array output with key', function () {
    $step1 = helper_getValueReturningStep(['foo' => 'one']);

    $step2 = helper_getValueReturningStep(['bar' => 'two']);

    $group = (new Group())
        ->addStep($step1)
        ->addStep($step2)
        ->keepInputAs('baz');

    $output = helper_invokeStepWithInput($group, new Input('three'));

    expect($output)->toHaveCount(1)
        ->and($output[0]->get())->toBe(['foo' => 'one', 'bar' => 'two'])
->and($output[0]->keep)->toBe(['baz' => 'three']);
});

// A scalar input kept without an explicit key ends up under 'unnamed1'.
it('keeps a value with unnamed key, when non array input should be kept but no key is defined', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'one']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->keepFromInput();

    $combined = helper_invokeStepWithInput($group, new Input('three'));

    expect($combined)->toHaveCount(1);

    $firstOutput = $combined[0];

    expect($firstOutput->get())->toBe(['foo' => 'one', 'bar' => 'two'])
        ->and($firstOutput->keep)->toBe(['unnamed1' => 'three']);
});

// The scalar output of the first step gets the numeric key 0 in the combined array.
it('contains an element with a numeric key when it contains a step that yields non array output', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep('one'))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']));

    $combined = helper_invokeStepWithInput($group);

    expect($combined)->toHaveCount(1);

    expect($combined[0]->get())->toBe([0 => 'one', 'bar' => 'two']);
});

it('keeps array input data when some output is non array but converted to array using outputKey()', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep('one')->outputKey('foo'))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->keepFromInput();

    $combined = helper_invokeStepWithInput($group, new Input(['baz' => 'three']));

    expect($combined)->toHaveCount(1);

    $firstOutput = $combined[0];

    expect($firstOutput->get())->toBe(['foo' => 'one', 'bar' => 'two'])
        ->and($firstOutput->keep)->toBe(['baz' => 'three']);
});

it(
    'keeps an input value with an unnamed key, when it is a non array value and no key is defined (via keepInputAs())',
    function () {
        $step1 = helper_getValueReturningStep('one');

        $step2 = helper_getValueReturningStep(['bar' => 'two']);

        $group = (new Group())
            ->addStep($step1)
            ->addStep($step2)
            ->keepFromInput();

        $output = helper_invokeStepWithInput($group, new Input('three'));

        expect($output)->toHaveCount(1)
            ->and($output[0]->get())->toBe([0 => 'one',
'bar' => 'two'])
            ->and($output[0]->keep)->toBe(['unnamed1' => 'three']);
    },
);

// Only 'baz' is used as step input, but the kept data must contain the complete input array.
it('keeps the original input data when useInputKey() is used', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'one']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->useInputKey('baz')
        ->keepFromInput();

    $combined = helper_invokeStepWithInput($group, new Input(['baz' => 'three', 'quz' => 'four']));

    expect($combined)->toHaveCount(1);

    $firstOutput = $combined[0];

    expect($firstOutput->get())->toBe(['foo' => 'one', 'bar' => 'two'])
        ->and($firstOutput->keep)->toBe(['baz' => 'three', 'quz' => 'four']);
});

// A Closure refiner without a key receives the whole combined output array.
it('applies a Closure refiner to the steps output', function () {
    $addBazAndMarkBarAsRefined = function (mixed $outputValue) {
        $outputValue['baz'] = 'three';

        $outputValue['bar'] .= ' refined';

        return $outputValue;
    };

    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'one']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->refineOutput($addBazAndMarkBarAsRefined);

    $combined = helper_invokeStepWithInput($group);

    expect($combined[0]->get())->toBe(['foo' => 'one', 'bar' => 'two refined', 'baz' => 'three']);
});

// A refiner registered for the key 'foo' only changes that element of the combined output.
it('applies an instance of the RefinerInterface to the steps output', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'lorem ipsum dolor']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'));

    $combined = helper_invokeStepWithInput($group);

    expect($combined[0]->get())->toBe(['foo' => 'ipsum', 'bar' => 'two']);
});

it('applies multiple refiners to the steps output in the order they\'re added', function () {
    $step1 = helper_getValueReturningStep(['foo' => 'lorem ipsum dolor']);

    $step2 = helper_getValueReturningStep(['bar' => 'two']);

    $group = (new Group())
        ->addStep($step1)
        ->addStep($step2)
        ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'))
        ->refineOutput('bar', fn(mixed
$outputValue) => $outputValue . ' refined');

    $outputs = helper_invokeStepWithInput($group);

    expect($outputs[0]->get())->toBe(['foo' => 'ipsum', 'bar' => 'two refined']);
});

// Both refiners target 'foo'; the second one works on the result of the first.
test('you can apply multiple refiners to the same output array key', function () {
    $step1 = helper_getValueReturningStep(['foo' => 'lorem ipsum dolor']);

    $step2 = helper_getValueReturningStep(['bar' => 'two']);

    $group = (new Group())
        ->addStep($step1)
        ->addStep($step2)
        ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'))
        ->refineOutput('foo', fn(mixed $outputValue) => $outputValue . ' refined');

    $outputs = helper_invokeStepWithInput($group);

    expect($outputs[0]->get())->toBe(['foo' => 'ipsum refined', 'bar' => 'two']);
});

it(
    'uses the original input value when applying a refiner, not only the value of an input array key chosen via ' .
    'useInputKey()',
    function () {
        $step1 = helper_getValueReturningStep(['foo' => 'one']);

        $step2 = helper_getValueReturningStep(['bar' => 'two']);

        // useInputKey() has to be part of the setup, otherwise this test can not tell apart
        // "refiner gets the full original input" from "refiner gets the value of the chosen key",
        // which is exactly the scenario described in the test name.
        $group = (new Group())
            ->addStep($step1)
            ->addStep($step2)
            ->useInputKey('yo')
            ->refineOutput(fn(mixed $outputValue, mixed $originalInputValue) => $originalInputValue);

        $outputs = helper_invokeStepWithInput($group, ['yo' => 'lo']);

        // The refiner returns what it received as original input: the full array, not just 'lo'.
        expect($outputs[0]->get())->toBe(['yo' => 'lo']);
    },
);

// Each child step counts how often its generator ran to completion after yielding one output.
it('stops calling its steps and producing outputs when maxOutputs is reached', function () {
    $step1 = new class extends Step {
        public int $called = 0;

        protected function invoke(mixed $input): Generator
        {
            yield ['foo' => 'one'];

            $this->called++;
        }
    };

    $step2 = new class extends Step {
        public int $called = 0;

        protected function invoke(mixed $input): Generator
        {
            yield ['bar' => 'two'];

            $this->called++;
        }
    };

    $group = (new Group())
        ->addStep($step1)
        ->addStep($step2)
        ->maxOutputs(2);

    // The third invocation exceeds the limit of 2 outputs: no output and no further step calls.
    expect(helper_invokeStepWithInput($group, 'hey'))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, 'ho'))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, 'hey'))->toHaveCount(0)
        ->and($step1->called)->toBe(2)
        ->and($step2->called)->toBe(2);
});

it(
'also stops creating outputs when maxOutputs is reached, when maxOutputs() was called before addStep()',
    function () {
        $firstCountingStep = new class extends Step {
            public int $called = 0;

            protected function invoke(mixed $input): Generator
            {
                yield ['foo' => 'one'];

                $this->called++;
            }
        };

        $secondCountingStep = new class extends Step {
            public int $called = 0;

            protected function invoke(mixed $input): Generator
            {
                yield ['bar' => 'two'];

                $this->called++;
            }
        };

        // Same scenario as the previous test, only the limit is defined before the steps are added.
        $group = (new Group())
            ->maxOutputs(2)
            ->addStep($firstCountingStep)
            ->addStep($secondCountingStep);

        expect(helper_invokeStepWithInput($group, 'hey'))->toHaveCount(1);

        expect(helper_invokeStepWithInput($group, 'ho'))->toHaveCount(1);

        expect(helper_invokeStepWithInput($group, 'hey'))->toHaveCount(0);

        expect($firstCountingStep->called)->toBe(2);

        expect($secondCountingStep->called)->toBe(2);
    },
);
================================================ FILE: tests/Steps/Html/CssSelectorTest.php ================================================ throws(InvalidDomQueryException::class)->with(['.foo;', '.foo:before']); test('The apply method returns a string for a single match', function () { $html = '
    test
    '; expect((new CssSelector('.item'))->apply(new HtmlDocument($html)))->toBe('test'); }); test('The apply method returns an array of strings for multiple matches', function () { $html = '
    test
    test 2 sub
    test 3
    '; expect((new CssSelector('.item'))->apply(new HtmlDocument($html)))->toBe(['test', 'test 2 sub', 'test 3']); }); test('The apply method returns null if nothing matches', function () { $html = '
    test
    '; expect((new CssSelector('.aitem'))->apply(new HtmlDocument($html)))->toBeNull(); }); it('trims whitespace', function () { $html = << test HTML; expect((new CssSelector('.item'))->apply(new HtmlDocument($html)))->toBe('test'); }); it('contains inner tags when the html method is called', function () { $html = '
    test sub
    '; expect((new CssSelector('.item'))->html()->apply(new HtmlDocument($html)))->toBe('test sub'); }); it('contains also the outer tag when the outerHtml method is called', function () { $html = '
    test sub
    '; expect((new CssSelector('.item'))->outerHtml()->apply(new HtmlDocument($html))) ->toBe('
    test sub
    '); }); it('returns formatted text when formattedText() is called', function () { $html = '

    headline

    paragraph

    • item 1
    • item 2
    '; expect((new CssSelector('#a'))->formattedText()->apply(new HtmlDocument($html))) ->toBe(<<removeConverter('ul'); expect((new CssSelector('#a'))->formattedText($converter)->apply(new HtmlDocument($html))) ->toBe(<<attribute('data-attr')->apply(new HtmlDocument($html)))->toBe('content'); }); test('getting an attribute value returns an empty string when the attribute does not exist', function () { $html = '
    test
    '; expect((new CssSelector('.item'))->attribute('foo')->apply(new HtmlDocument($html)))->toBe(''); }); it('turns the value into an absolute url when toAbsoluteUrl() is called', function () { $html = 'getting started'; $document = new HtmlDocument($html); $selector = new CssSelector('a'); $selector->setBaseUrl('https://www.crwlr.software/') ->attribute('href'); expect($selector->apply($document))->toBe('/packages/crawler/v0.4/getting-started'); $selector->toAbsoluteUrl(); expect($selector->apply($document))->toBe('https://www.crwlr.software/packages/crawler/v0.4/getting-started'); }); it( 'turns the value into the correct absolute url when toAbsoluteUrl() is called and the HTML contains a base tag', function () { $html = << link HTML; $document = new HtmlDocument($html); $selector = new CssSelector('a'); $selector->setBaseUrl('https://www.example.com/a/b') ->attribute('href'); expect($selector->apply($document))->toBe('e'); $selector->toAbsoluteUrl(); expect($selector->apply($document))->toBe('https://www.example.com/c/e'); }, ); it('gets an absolute link from the href attribute of a link element, when the link() method is called', function () { $html = ''; $document = new HtmlDocument($html); $selector = new CssSelector('#foo .bar'); $selector->setBaseUrl('https://www.example.com/'); expect($selector->apply($document))->toBe('Foo'); $selector->link(); expect($selector->apply($document))->toBe('https://www.example.com/foo/bar'); }); it('gets only the first matching element when the first() method is called', function () { $selector = (new CssSelector('#list .item'))->first(); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('one'); }); it('gets only the last matching element when the last() method is called', function () { $selector = (new CssSelector('#list .item'))->last(); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('four'); }); it('gets only the nth matching element when the nth() method is 
called', function () { $selector = (new CssSelector('#list .item'))->nth(3); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('three'); }); it('returns null when no nth matching element exists', function () { $selector = (new CssSelector('#list .item'))->nth(5); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBeNull(); }); it('gets only even matching elements when the even() method is called', function () { $selector = (new CssSelector('#list .item'))->even(); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe(['two', 'four']); }); it('gets only odd matching elements when the odd() method is called', function () { $selector = (new CssSelector('#list .item'))->odd(); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe(['one', 'three']); }); ================================================ FILE: tests/Steps/Html/Exceptions/InvalidDomQueryExceptionTest.php ================================================ getDomQuery()) ->toBe('.foo:before') ->and($exception->getMessage()) ->toBe('error'); }); it('can be created from a symfony SyntaxErrorException', function () { $exception = InvalidDomQueryException::fromSymfonyException('.foo;', new SyntaxErrorException('error message')); expect($exception->getDomQuery()) ->toBe('.foo;') ->and($exception->getMessage()) ->toBe('error message'); }); it('can be created from a message and a query', function () { $exception = InvalidDomQueryException::make('message', '.foo > .bar;'); expect($exception->getDomQuery()) ->toBe('.foo > .bar;') ->and($exception->getMessage()) ->toBe('message'); }); ================================================ FILE: tests/Steps/Html/GetLinkTest.php ================================================ link'), )); expect($link)->toHaveCount(1) ->and($link[0]->get())->toBe('https://www.crwl.io/blog'); }); it('logs an error message when fed with invalid input', function () { $logger = new DummyLogger(); $step = 
(new GetLink())->addLogger($logger); helper_traverseIterable($step->invokeStep(new Input(new Response()))); expect($logger->messages)->not->toBeEmpty() ->and($logger->messages[0]['message'])->toBe( 'The Crwlr\Crawler\Steps\Html\GetLink step was called with input that it can not work with: Input must ' . 'be an instance of RespondedRequest.', ); }); test('When called without selector it just returns the first link', function () { $step = (new GetLink()); $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.crwlr.software/packages/url/'), new Response( 200, [], '', ), )); expect($link[0]->get())->toBe('https://www.crwlr.software/packages/url/v0.1'); }); test('When passing a CSS selector it selects the first matching link', function () { $step = (new GetLink('.matchingLink')); $responseHtml = << Jobs Numbers Products HTML; $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.foo.bar/company/about'), new Response(200, [], $responseHtml), )); expect($link[0]->get())->toBe('https://www.foo.bar/company/jobs'); }); test('When selector matches on a non-link element it\'s ignored', function () { $step = (new GetLink('.link')); $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], 'not a linklink'), )); expect($link)->toHaveCount(1) ->and($link[0]->get())->toBe('https://www.otsch.codes/foo'); }); it('finds only links on the same domain when onSameDomain() was called', function () { $html = <<link1 link2 HTML; $step = (new GetLink())->onSameDomain(); $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($link)->toHaveCount(1) ->and($link[0]->get())->toBe('https://blog.otsch.codes/articles'); }); it('doesn\'t find a link on the same domain when notOnSameDomain() was called', function () { $html = <<link1 link2 HTML; 
$step = (new GetLink())->notOnSameDomain(); $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($link)->toHaveCount(1) ->and($link[0]->get())->toBe('https://www.crwlr.software/packages'); }); it('finds only links from domains the onDomain() method was called with', function () { $html = <<link1 link2 link3 link4 HTML; $step = (new GetLink())->onDomain('example.com'); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.example.com'); }); test('onDomain() also takes an array of domains', function () { $html = <<link1 link2 HTML; $step = (new GetLink())->onDomain(['otsch.codes', 'example.com']); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.otsch.codes/contact'); $html = <<link1 link2 HTML; $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.example.com/foo'); }); test('onDomain() can be called multiple times and merges all domains it was called with', function () { $html = <<link1 HTML; $step = (new GetLink())->onDomain('crwl.io'); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(0); $step->onDomain(['otsch.codes', 'crwlr.software']); $html = <<link1 link2 HTML; $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) 
->and($links[0]->get())->toBe('https://www.crwl.io'); $html = <<link1 link2 HTML; $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.otsch.codes/contact'); }); it('finds only links on the same host when onSameHost() was called', function () { $html = <<link1 link2 link3 HTML; $step = (new GetLink())->onSameHost(); $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($link)->toHaveCount(1) ->and($link[0]->get())->toBe('https://www.otsch.codes/contact'); }); it('doesn\'t find a link on the same host when notOnSameHost() was called', function () { $html = <<link1 link2 HTML; $step = (new GetLink())->notOnSameHost(); $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($link)->toHaveCount(1) ->and($link[0]->get())->toBe('https://jobs.otsch.codes'); }); it('finds only links from hosts the onHost() method was called with', function () { $html = <<link1 link2 link3 link4 HTML; $step = (new GetLink())->onHost('www.example.com'); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.example.com'); }); test('onHost() also takes an array of hosts', function () { $html = <<link1 link2 HTML; $step = (new GetLink())->onHost(['www.otsch.codes', 'blog.example.com']); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.otsch.codes/contact'); $html = <<link1 link2 link3 HTML; $links = 
helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://blog.example.com/articles/1'); }); test('onHost() can be called multiple times and merges all hosts it was called with', function () { $html = <<link1 HTML; $step = (new GetLink())->onHost('www.crwl.io'); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(0); $step->onHost(['www.otsch.codes', 'www.crwlr.software']); $html = <<link1 HTML; $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.crwl.io'); $html = <<link1 link2 HTML; $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.otsch.codes/blog'); }); it('works correctly when HTML contains a base tag', function () { $html = << link HTML; $step = (new GetLink()); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.example.com/a/b'), new Response(200, [], $html), )); expect($links[0]->get())->toBe('https://www.example.com/c/e'); }); it('throws away the URL fragment part when withoutFragment() was called', function () { $html = << link HTML; $step = (new GetLink()); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/foo/baz'), new Response(200, [], $html), ); $links = helper_invokeStepWithInput($step, $respondedRequest); expect($links[0]->get())->toBe('https://www.example.com/foo/bar#fragment'); $step->withoutFragment(); $links = helper_invokeStepWithInput($step, $respondedRequest); 
expect($links[0]->get())->toBe('https://www.example.com/foo/bar'); }); it('ignores special non HTTP links', function () { $html = << mailto link javascript link phone link link HTML; $step = (new GetLink()); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/home'), new Response(200, [], $html), ); $links = helper_invokeStepWithInput($step, $respondedRequest); expect($links[0]->get())->toBe('https://www.example.com/foo/bar'); }); ================================================ FILE: tests/Steps/Html/GetLinksTest.php ================================================ link'), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.example.com/blog'); }); it('logs an error message when fed with invalid input', function () { $logger = new DummyLogger(); $step = (new GetLinks())->addLogger($logger); helper_traverseIterable($step->invokeStep(new Input(new stdClass()))); expect($logger->messages)->not->toBeEmpty() ->and($logger->messages[0]['message'])->toBe( 'The Crwlr\Crawler\Steps\Html\GetLinks step was called with input that it can not work with: Input must ' . 
'be an instance of RespondedRequest.', ); }); test('When called without selector it just gets all links', function () { $step = (new GetLinks()); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.crwlr.software/packages/url/'), new Response( 200, [], '', ), )); expect($links[0]->get())->toBe('https://www.crwlr.software/packages/url/v0.1') ->and($links[1]->get())->toBe('https://www.crwlr.software/packages/url/v1.0') ->and($links[2]->get())->toBe('https://www.crwlr.software/packages/url/v1.1'); }); test('When passing a CSS selector it only selects matching links', function () { $step = (new GetLinks('.matchingLink')); $responseHtml = << Jobs Numbers Products Team HTML; $outputs = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.example.com/company/about'), new Response(200, [], $responseHtml), )); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe('https://www.example.com/company/jobs') ->and($outputs[1]->get())->toBe('https://www.example.com/company/numbers') ->and($outputs[2]->get())->toBe('https://www.example.com/team'); }); test('When selector matches on a non-link element it\'s ignored', function () { $step = (new GetLinks('.link')); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], 'FooBar'), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.otsch.codes/foo'); }); it('finds only links on the same domain when onSameDomain() was called', function () { $html = <<link1 link2 link3 HTML; $step = (new GetLinks())->onSameDomain(); $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($link)->toHaveCount(2) ->and($link[0]->get())->toBe('https://blog.otsch.codes/articles') ->and($link[1]->get())->toBe('https://www.otsch.codes/blog'); }); it('doesn\'t find links on 
the same domain when notOnSameDomain() was called', function () { $html = <<link1 link2 link3 HTML; $step = (new GetLinks())->notOnSameDomain(); $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($link)->toHaveCount(2) ->and($link[0]->get())->toBe('https://www.crwlr.software/packages') ->and($link[1]->get())->toBe('https://www.example.com/foo'); }); it('finds only links from domains the onDomain() method was called with', function () { $html = <<link1 link2 link3 link4 HTML; $step = (new GetLinks())->onDomain('crwlr.software'); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(2) ->and($links[0]->get())->toBe('https://www.crwlr.software/packages') ->and($links[1]->get())->toBe('https://www.crwlr.software/blog'); }); test('onDomain() also takes an array of domains', function () { $html = <<link1 link2 link3 HTML; $step = (new GetLinks())->onDomain(['otsch.codes', 'crwlr.software']); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(2) ->and($links[0]->get())->toBe('https://www.otsch.codes/contact') ->and($links[1]->get())->toBe('https://www.crwlr.software/packages'); }); test('onDomain() can be called multiple times and merges all domains it was called with', function () { $html = <<link1 link2 link3 HTML; $step = (new GetLinks())->onDomain('crwl.io'); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(0); $step->onDomain(['otsch.codes', 'crwlr.software']); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); 
expect($links)->toHaveCount(2); $step->onDomain('example.com'); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(3); }); it('finds only links on the same host when onSameHost() was called', function () { $html = <<link1 link2 link3 link4 HTML; $step = (new GetLinks())->onSameHost(); $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($link)->toHaveCount(2) ->and($link[0]->get())->toBe('https://www.otsch.codes/contact') ->and($link[1]->get())->toBe('https://www.otsch.codes/blog'); }); it('doesn\'t find links on the same host when notOnSameHost() was called', function () { $html = <<link1 link2 link3 HTML; $step = (new GetLinks())->notOnSameHost(); $link = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($link)->toHaveCount(2) ->and($link[0]->get())->toBe('https://jobs.otsch.codes') ->and($link[1]->get())->toBe('https://www.crwlr.software/packages'); }); it('finds only links from hosts the onHost() method was called with', function () { $html = <<link1 link2 link3 link4 HTML; $step = (new GetLinks())->onHost('www.crwlr.software'); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(2) ->and($links[0]->get())->toBe('https://www.crwlr.software/packages') ->and($links[1]->get())->toBe('https://www.crwlr.software/packages/crawler/v0.4/getting-started'); }); test('onHost() also takes an array of hosts', function () { $html = <<link1 link2 HTML; $step = (new GetLinks())->onHost(['www.otsch.codes', 'blog.example.com']); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new 
Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.otsch.codes/contact'); $html = <<link1 link2 link3 HTML; $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(2) ->and($links[1]->get())->toBe('https://blog.example.com/articles/1'); }); test('onHost() can be called multiple times and merges all hosts it was called with', function () { $html = <<link1 HTML; $step = (new GetLinks())->onHost('www.crwl.io'); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(0); $step->onHost(['www.otsch.codes', 'www.crwlr.software']); $html = <<link1 HTML; $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(1) ->and($links[0]->get())->toBe('https://www.crwl.io'); $html = <<link1 link2 HTML; $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.otsch.codes'), new Response(200, [], $html), )); expect($links)->toHaveCount(2) ->and($links[0]->get())->toBe('https://www.otsch.codes/blog') ->and($links[1]->get())->toBe('https://www.crwl.io'); }); it('works correctly when HTML contains a base tag', function () { $html = << link link2 link3 HTML; $step = (new GetLinks()); $links = helper_invokeStepWithInput($step, new RespondedRequest( new Request('GET', 'https://www.example.com/a/b'), new Response(200, [], $html), )); expect($links[0]->get())->toBe('https://www.example.com/c/e') ->and($links[1]->get())->toBe('https://www.example.com/f/g') ->and($links[2]->get())->toBe('https://www.example.com/c/h'); }); it('throws away the URL fragment part when withoutFragment() was called', function () { $html = << link
    another link
    HTML; $step = (new GetLinks()); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/foo/baz'), new Response(200, [], $html), ); $links = helper_invokeStepWithInput($step, $respondedRequest); expect($links[0]->get())->toBe('https://www.example.com/foo/bar#fragment') ->and($links[1]->get())->toBe('https://www.example.com/baz#quz-fragment'); $step->withoutFragment(); $links = helper_invokeStepWithInput($step, $respondedRequest); expect($links[0]->get())->toBe('https://www.example.com/foo/bar') ->and($links[1]->get())->toBe('https://www.example.com/baz'); }); it('ignores special non HTTP links', function () { $html = << mailto link link one javascript link link two phone link link three HTML; $step = (new GetLinks()); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/home'), new Response(200, [], $html), ); $links = helper_invokeStepWithInput($step, $respondedRequest); expect($links)->toHaveCount(3) ->and($links[0]->get())->toBe('https://www.example.com/one') ->and($links[1]->get())->toBe('https://www.example.com/two') ->and($links[2]->get())->toBe('https://www.example.com/three'); }); ================================================ FILE: tests/Steps/Html/MetaDataTest.php ================================================ Hello World! HTML; $outputs = helper_invokeStepWithInput(new MetaData(), $html); expect($outputs[0]->get())->toBe(['title' => '']); }); it('returns an array with the title and all meta tags having a name or property attribute', function () { $html = << Hello World! Hello World! HTML; $outputs = helper_invokeStepWithInput(new MetaData(), $html); expect($outputs[0]->get())->toBe([ 'title' => 'Hello World!', 'description' => 'This is a page saying: Hello World!', 'keywords' => 'lorem, ipsum, hello, world', 'og:title' => 'Hello World!', 'og:type' => 'website', ]); }); it('returns only the meta tags defined via the only() method', function () { $html = << Hello World! Hello World! 
HTML; $outputs = helper_invokeStepWithInput(Html::metaData()->only(['description', 'og:title']), $html); expect($outputs[0]->get())->toBe([ 'description' => 'This is a page saying: Hello World!', 'og:title' => 'Hello World!', ]); }); ================================================ FILE: tests/Steps/Html/SchemaOrgTest.php ================================================ Foo Bar

    Baz

    Other content

    HTML; } function helper_schemaOrgExampleMultipleObjects(): string { return << Foo Bar

    Some Article

    This is some article about something.

    HTML; } it('extracts schema.org data in JSON-LD format from an HTML document', function () { $html = helper_schemaOrgExampleOneJobPostingInBody(); $outputs = helper_invokeStepWithInput(Html::schemaOrg(), $html); expect($outputs)->toHaveCount(1); expect($outputs[0]->get())->toBeInstanceOf(JobPosting::class); }); it('converts the spatie schema.org objects to arrays when calling the toArray() method', function () { $html = helper_schemaOrgExampleOneJobPostingInBody(); $outputs = helper_invokeStepWithInput(Html::schemaOrg()->toArray(), $html); expect($outputs)->toHaveCount(1); expect($outputs[0]->get())->toBeArray(); expect($outputs[0]->get()['hiringOrganization'])->toBeArray(); expect($outputs[0]->get()['hiringOrganization'])->toHaveKey('name'); expect($outputs[0]->get()['hiringOrganization']['name'])->toBe('Foo Ltd.'); }); it('gets all the schema.org objects contained in a document', function () { $html = helper_schemaOrgExampleMultipleObjects(); $outputs = helper_invokeStepWithInput(Html::schemaOrg(), $html); expect($outputs)->toHaveCount(3); }); it('gets only schema.org objects of a certain type if you use the onlyType method', function () { $html = helper_schemaOrgExampleMultipleObjects(); $outputs = helper_invokeStepWithInput( Html::schemaOrg()->onlyType('Article'), $html, ); expect($outputs)->toHaveCount(1); expect($outputs[0]->get())->toBeInstanceOf(Article::class); }); it('also finds schema.org objects of a certain type in children of another schema.org object', function () { $html = helper_schemaOrgExampleMultipleObjects(); $outputs = helper_invokeStepWithInput( Html::schemaOrg()->onlyType('Organization'), $html, ); expect($outputs)->toHaveCount(2); expect($outputs[0]->get()->getProperty('name'))->toBe('Example Company'); expect($outputs[1]->get()->getProperty('name'))->toBe('Some Organization, Inc.'); }); it('extracts certain data from schema.org objects when using the extract() method', function () { $html = helper_schemaOrgExampleMultipleObjects(); 
$outputs = helper_invokeStepWithInput( Html::schemaOrg()->onlyType('Article')->extract(['url', 'headline', 'publisher' => 'publisher.name']), $html, ); expect($outputs)->toHaveCount(1); expect($outputs[0]->get())->toBe([ 'url' => 'https://de.example.org/articles/some', 'headline' => 'This is some article about something.', 'publisher' => 'Some Organization, Inc.', ]); }); test('If an object doesn\'t contain a property from the extract mapping, it\'s just null in the output', function () { $html = helper_schemaOrgExampleMultipleObjects(); $outputs = helper_invokeStepWithInput( Html::schemaOrg()->onlyType('Article')->extract(['url', 'headline', 'alternativeHeadline']), $html, ); expect($outputs)->toHaveCount(1); expect($outputs[0]->get())->toBe([ 'url' => 'https://de.example.org/articles/some', 'headline' => 'This is some article about something.', 'alternativeHeadline' => null, ]); }); ================================================ FILE: tests/Steps/Html/XPathQueryTest.php ================================================ throws(InvalidDomQueryException::class); test('The apply method returns a string for a single match', function () { $xml = 'test'; expect((new XPathQuery('//item'))->apply(new XmlDocument($xml)))->toBe('test'); }); test('The apply method returns an array of strings for multiple matches', function () { $html = 'testtest 2 subtest 3'; expect((new XPathQuery('//item'))->apply(new HtmlDocument($html)))->toBe(['test', 'test 2 sub', 'test 3']); }); test('The apply method returns null if nothing matches', function () { $xml = 'test'; expect((new XPathQuery('//aitem'))->apply(new XmlDocument($xml)))->toBeNull(); }); it('trims whitespace', function () { $xml = << test XML; expect((new XPathQuery('//item'))->apply(new XmlDocument($xml)))->toBe('test'); }); it('contains inner tags when the html method is called', function () { $xml = 'test sub'; expect((new XPathQuery('//item'))->html()->apply(new XmlDocument($xml)))->toBe('test sub'); }); it('contains also 
the outer tag when the outerHtml method is called', function () { $xml = 'test sub'; expect((new XPathQuery('//item'))->outerHtml()->apply(new XmlDocument($xml)))->toBe('test sub'); }); it('gets the contents of an attribute using the attribute method', function () { $xml = 'test'; expect((new XPathQuery('//item'))->attribute('attr')->apply(new XmlDocument($xml)))->toBe('content'); }); test('getting an attribute value returns an empty string when the attribute does not exist', function () { $xml = 'test'; expect((new XPathQuery('//item'))->attribute('attr')->apply(new XmlDocument($xml)))->toBe(''); }); it('turns the value into an absolute url when toAbsoluteUrl() is called', function () { $xml = '/foo/bar'; $document = new XmlDocument($xml); $query = (new XPathQuery('//item')) ->setBaseUrl('https://www.example.com'); expect($query->apply($document))->toBe('/foo/bar'); $query->toAbsoluteUrl(); expect($query->apply($document))->toBe('https://www.example.com/foo/bar'); }); it('gets an absolute link from the href attribute of a link element, when the link() method is called', function () { $html = ''; $document = new HtmlDocument($html); $selector = (new XPathQuery('//*[@id=\'foo\']/a[@class=\'bar\']')) ->setBaseUrl('https://www.example.com/'); expect($selector->apply($document))->toBe('Foo'); $selector->link(); expect($selector->apply($document))->toBe('https://www.example.com/foo/bar'); }); it('gets only the first matching element when the first() method is called', function () { $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->first(); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('one'); }); it('gets only the last matching element when the last() method is called', function () { $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->last(); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('four'); }); it('gets only the nth matching element when the nth() 
method is called', function () { $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->nth(3); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('three'); }); it('returns null when no nth matching element exists', function () { $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->nth(5); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBeNull(); }); it('gets only even matching elements when the even() method is called', function () { $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->even(); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe(['two', 'four']); }); it('gets only odd matching elements when the odd() method is called', function () { $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->odd(); expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe(['one', 'three']); }); ================================================ FILE: tests/Steps/HtmlTest.php ================================================ extract('.title'), helper_getHtmlContent('bookstore.html'), ); expect($output)->toHaveCount(4) ->and($output[0]->get())->toBe('Everyday Italian') ->and($output[3]->get())->toBe('Learning XML'); }); it('extracts data from an HTML document with CSS selectors by default', function () { $output = helper_invokeStepWithInput( Html::each('#bookstore .book')->extract(['title' => '.title', 'author' => '.author', 'year' => '.year']), helper_getHtmlContent('bookstore.html'), ); expect($output)->toHaveCount(4) ->and($output[0]->get())->toBe( ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], ) ->and($output[1]->get())->toBe(['title' => 'Harry Potter', 'author' => 'J K. 
Rowling', 'year' => '2005']) ->and($output[2]->get())->toBe( [ 'title' => 'XQuery Kick Start', 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], 'year' => '2003', ], ) ->and($output[3]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']); }); it('can also extract data using XPath queries', function () { $output = helper_invokeStepWithInput( Html::each(Dom::xPath('//div[@id=\'bookstore\']/div[@class=\'book\']'))->extract([ 'title' => Dom::xPath('//h3[@class=\'title\']'), 'author' => Dom::xPath('//*[@class=\'author\']'), 'year' => Dom::xPath('//span[@class=\'year\']'), ]), helper_getHtmlContent('bookstore.html'), ); expect($output)->toHaveCount(4) ->and($output[2]->get())->toBe( [ 'title' => 'XQuery Kick Start', 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], 'year' => '2003', ], ); }); it('returns only one (compound) output when the root method is used', function () { $output = helper_invokeStepWithInput( Html::root()->extract(['title' => '.title', 'author' => '.author', 'year' => '.year',]), helper_getHtmlContent('bookstore.html'), ); expect($output)->toHaveCount(1) ->and($output[0]->get()['title'])->toBe(['Everyday Italian', 'Harry Potter', 'XQuery Kick Start', 'Learning XML']); }); it('extracts the data of the first matching element when the first method is used', function () { $output = helper_invokeStepWithInput( Html::first('#bookstore .book')->extract(['title' => '.title', 'author' => '.author', 'year' => '.year']), helper_getHtmlContent('bookstore.html'), ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe( ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], ); }); it('extracts the data of the last matching element when the last method is used', function () { $output = helper_invokeStepWithInput( Html::last('#bookstore .book')->extract(['title' => '.title', 'author' => 
'.author', 'year' => '.year']), helper_getHtmlContent('bookstore.html'), ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']); }); test( 'you can extract data in a second level to the output array using another Html step as an element in the mapping ' . 'array', function () { $response = new RespondedRequest( new Request('GET', 'https://www.example.com/meetups/some-meetup/'), new Response(body: helper_getHtmlContent('event.html')), ); $output = helper_invokeStepWithInput( Html::root()->extract([ 'title' => '#event h1', 'location' => '#event .location', 'date' => '#event .date', 'talks' => Html::each('#event .talks .talk')->extract([ 'title' => '.title', 'speaker' => '.speaker', 'slides' => Dom::cssSelector('.slidesLink')->attribute('href')->toAbsoluteUrl(), ]), ]), $response, ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe([ 'title' => 'Some Meetup', 'location' => 'Somewhere', 'date' => '2023-01-14 21:00', 'talks' => [ [ 'title' => 'Sophisticated talk title', 'speaker' => 'Super Mario', 'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk1.pdf', ], [ 'title' => 'Simple beginner talk', 'speaker' => 'Luigi', 'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk2.pdf', ], [ 'title' => 'Fun talk', 'speaker' => 'Princess Peach', 'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk3.pdf', ], ], ]); }, ); test( 'When a child step is nested in the extraction and does not use each(), the extracted value is an array with ' . 'the keys defined in extract(), rather than an array of such arrays as it would be with each().', function () { $xml = << something
    ABCDEFGmbH
    1984
    Germany, Frankfurt
    Saubär GmbH
    2014
    Austria, Klagenfurt
    HTML; $expectedCompany1 = [ 'name' => 'ABCDEFGmbH', 'founded' => '1984', 'location' => ['country' => 'Germany', 'city' => 'Frankfurt'], ]; $expectedCompany2 = [ 'name' => 'Saubär GmbH', 'founded' => '2014', 'location' => ['country' => 'Austria', 'city' => 'Klagenfurt'], ]; // With base root() $step = Html::each('.company')->extract([ 'name' => '.name', 'founded' => '.founded', 'location' => Html::root()->extract(['country' => '.location .country', 'city' => '.location .city']), ]); $outputs = helper_invokeStepWithInput($step, $xml); expect($outputs)->toHaveCount(2) ->and($outputs[0]->get())->toBe($expectedCompany1) ->and($outputs[1]->get())->toBe($expectedCompany2); // With base first() $step = Html::each('.company')->extract([ 'name' => '.name', 'founded' => '.founded', 'location' => Html::first('.location')->extract(['country' => '.country', 'city' => '.city']), ]); $outputs = helper_invokeStepWithInput($step, $xml); expect($outputs)->toHaveCount(2) ->and($outputs[0]->get())->toBe($expectedCompany1) ->and($outputs[1]->get())->toBe($expectedCompany2); // With base last() $step = Html::each('.company')->extract([ 'name' => '.name', 'founded' => '.founded', 'location' => Html::last('.location')->extract(['country' => '.country', 'city' => '.city']), ]); $outputs = helper_invokeStepWithInput($step, $xml); expect($outputs)->toHaveCount(2) ->and($outputs[0]->get())->toBe($expectedCompany1) ->and($outputs[1]->get())->toBe($expectedCompany2); }, ); test( 'when selecting elements with each(), you can reference the element already selected within the each() selector ' . 'itself, in sub selectors', function () { $html = << Bookstore Example in HTML :) HTML; $response = new RespondedRequest( new Request('GET', 'https://www.example.com/foo'), new Response(body: $html), ); $output = helper_invokeStepWithInput( Html::each('#list .element')->extract([ // This is what this test is about. 
The element already selected in each (.element) can be // referenced in these child selectors. 'link' => Dom::cssSelector('.element > a')->link(), 'attribute' => Dom::cssSelector('')->attribute('data-attr'), ]), $response, ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe([ 'link' => 'https://www.example.com/bar', 'attribute' => 'yo', ]); }, ); test('the static getLink method works without argument', function () { expect(Html::getLink())->toBeInstanceOf(GetLink::class); }); test('the static getLinks method works without argument', function () { expect(Html::getLinks())->toBeInstanceOf(GetLinks::class); }); ================================================ FILE: tests/Steps/JsonTest.php ================================================ 'data.foo']), $respondedRequest); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe(['foo' => 'bar']); }); it('accepts PSR-7 Response as input', function () { $json = '{ "data": { "foo": "bar" } }'; $response = new Response(body: Utils::streamFor($json)); $output = helper_invokeStepWithInput(Json::get(['foo' => 'data.foo']), $response); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe(['foo' => 'bar']); }); it('extracts data defined using dot notation', function () { $json = << 'data.target.foo', 'baz' => 'data.target.baz']), $json); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe(['foo' => 'bar', 'baz' => 'yo']); }); it('uses the array values in the mapping as output key when no string keys defined in the mapping array', function () { $jsonString = << 'data.target.baz']), $jsonString); expect($output[0]->get())->toBe(['data.target.foo' => 'bar', 'baz' => 'yo']); }); it('can get items from a json array using a numeric key', function () { $jsonString = << 'data.target.array.1.name']), $jsonString); expect($output[0]->get())->toBe(['name' => 'Eve']); }); test('Using the each method you can iterate over a json array and yield multiple results', function () { $json = << 'name', 'age' 
=> 'age.years']), $json); expect($output)->toHaveCount(3) ->and($output[0]->get())->toBe(['name' => 'Peter', 'age' => 19]) ->and($output[1]->get())->toBe(['name' => 'Paul', 'age' => 22]) ->and($output[2]->get())->toBe(['name' => 'Mary', 'age' => 20]); }); test('When the root element is an array you can use each with empty string as param', function () { $jsonString = <<toHaveCount(4) ->and($output[0]->get())->toBe(['nickname' => 'Axel']) ->and($output[1]->get())->toBe(['nickname' => 'Lilo']) ->and($output[2]->get())->toBe(['nickname' => 'Poppi']) ->and($output[3]->get())->toBe(['nickname' => 'Dominik']); }); it('yields no results and logs a warning when the target for "each" does not exist', function () { $jsonString = '{ "foo": { "bar": [{ "number": "one" }, { "number": "two" }] } }'; $step = Json::each('boo.bar', ['number']); $step->addLogger(new CliLogger()); $output = helper_invokeStepWithInput($step, $jsonString); expect($output)->toHaveCount(0); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput)->toContain('The target of "each" does not exist in the JSON data.'); }); it('also works with JS style JSON objects without quotes around keys', function () { $jsonString = <<toHaveCount(1) ->and($outputs[0]->get())->toBe(['foo' => 'one', 'bar' => 'two', 'baz' => 'three']); }); it('also correctly fixes keys without quotes, even when values contain colons', function () { $jsonString = <<toHaveCount(1) ->and($outputs[0]->get()) ->toBe([ 'foo' => 'https://www.example.com', 'bar' => 2, 'baz' => 'some: thing', ]); }); it('also correctly fixes keys without quotes, when the value is an empty string', function () { $jsonString = <<toHaveCount(1) ->and($outputs[0]->get()) ->toBe([ 'foo' => '', 'bar' => 'baz', ]); }); it('works with a string that is an HTML document and inside the body there\'s a JSON object', function () { $jsonString = << JSON { "foo": "Hello World!", "bar": "baz" } HTML; $outputs = helper_invokeStepWithInput(Json::get(['title' => 'foo']), 
$jsonString); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get()) ->toBe(['title' => 'Hello World!']); }); it('gets the whole JSON object as array, when using the all() method', function () { $jsonString = <<toHaveCount(1) ->and($outputs[0]->get()) ->toBe([ 'foo' => 'one', 'bar' => 'two', 'array' => ['one', 'two', 'three'], ]); }); it('can also map the whole decoded data array to a output property', function () { $jsonString = << '*']), $jsonString); expect($outputs) ->toHaveCount(1) ->and($outputs[0]->get()) ->toBe([ 'all' => [ 'foo' => 'one', 'bar' => 'two', 'array' => ['one', 'two', 'three'], ], ]); }); test('when there is a key * in the object, the * gets that key, not the whole decoded data', function () { $jsonString = << '*']), $jsonString); expect($outputs) ->toHaveCount(1) ->and($outputs[0]->get()) ->toBe(['shouldBeYes' => 'yes']); }); it('can also get the whole decoded data in the each() context', function () { $jsonString = << '*']), $jsonString); expect($outputs) ->toHaveCount(3) ->and($outputs[0]->get()) ->toBe(['full' => ['name' => 'foo', 'value' => 'one']]) ->and($outputs[1]->get()) ->toBe(['full' => ['name' => 'bar', 'value' => 'two']]) ->and($outputs[2]->get()) ->toBe(['full' => ['name' => 'baz', 'value' => 'three']]); }); test('in the each() context, when there is a key *, it gets that, not the whole decoded data', function () { $jsonString = << '*']), $jsonString); expect($outputs) ->toHaveCount(2) ->and($outputs[0]->get()) ->toBe(['full' => 'yo']) ->and($outputs[1]->get()) ->toBe(['full' => ['name' => 'bar', 'value' => 'two']]); }); ================================================ FILE: tests/Steps/Loading/GetSitemapsFromRobotsTxtTest.php ================================================ shouldReceive('sendRequest') ->once() ->withArgs(function (RequestInterface $request) { return $request->getUri()->__toString() === 'https://www.crwlr.software/robots.txt'; }) ->andReturn(new Response(200, body: Utils::streamFor($robotsTxt))); $loader = 
new HttpLoader(new UserAgent('SomeUserAgent'), $httpClient); $step = Sitemap::getSitemapsFromRobotsTxt()->setLoader($loader); $outputs = helper_invokeStepWithInput($step, new Input('https://www.crwlr.software/packages')); expect($outputs)->toHaveCount(3); }); ================================================ FILE: tests/Steps/Loading/Http/DocumentTest.php ================================================ foohello'; $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/foo'), new Response(200, body: $body), ); $document = new Document($respondedRequest); expect($document->dom())->toBeInstanceOf(HtmlDocument::class) ->and($document->dom()->outerHtml())->toBe( 'foohello', ); }); it('returns the effectiveUri as url()', function () { $body = 'foohello'; $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/foo'), new Response(301, ['Location' => 'https://www.example.com/bar'], $body), ); $respondedRequest->addRedirectUri('https://www.example.com/bar'); $document = new Document($respondedRequest); expect((string) $document->url())->toBe('https://www.example.com/bar'); }); it('returns the effectiveUri as baseUrl() if no base tag in HTML', function () { $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/foo'), new Response(301, ['Location' => 'https://www.example.com/bar']), ); $respondedRequest->addRedirectUri('https://www.example.com/bar'); $document = new Document($respondedRequest); expect((string) $document->baseUrl())->toBe('https://www.example.com/bar'); }); it('returns the URL referenced in base tag as baseUrl()', function () { $body = 'foohello'; $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/foo'), new Response(301, ['Location' => 'https://www.example.com/bar'], $body), ); $respondedRequest->addRedirectUri('https://www.example.com/bar'); $document = new Document($respondedRequest); expect((string) 
$document->baseUrl())->toBe('https://www.example.com/baz'); }); it('returns the effectiveUri as canonicalUrl() if no canonical link in HTML', function () { $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/foo'), new Response(301, ['Location' => 'https://www.example.com/bar']), ); $respondedRequest->addRedirectUri('https://www.example.com/bar'); $document = new Document($respondedRequest); expect($document->canonicalUrl())->toBe('https://www.example.com/bar'); }); it('returns the URL referenced in canonical link as canonicalUrl()', function () { $body = 'foohello'; $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/foo'), new Response(301, ['Location' => 'https://www.example.com/bar'], $body), ); $respondedRequest->addRedirectUri('https://www.example.com/bar'); $document = new Document($respondedRequest); expect($document->canonicalUrl())->toBe('https://www.example.com/quz'); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/AbstractPaginatorTest.php ================================================ processLoaded($respondedRequest1->request, $respondedRequest1); expect($paginator->getLoaded()) ->toBe(['f2be1fcc5667a8f4ee2fd7f48c69c909' => true]) ->and($paginator->getLoadedCount()) ->toBe(1) ->and($paginator->getLatestRequest()) ->toBe($respondedRequest1->request); $respondedRequest2 = helper_getRespondedRequest('GET', 'https://www.example.com/bar', [], 'Yo'); $paginator->processLoaded($respondedRequest2->request, $respondedRequest2); expect($paginator->getLoaded())->toBe([ 'f2be1fcc5667a8f4ee2fd7f48c69c909' => true, 'd9e0c3987944f190782f5af9506eb478' => true, ]) ->and($paginator->getLoadedCount()) ->toBe(2) ->and($paginator->getLatestRequest()) ->toBe($respondedRequest2->request); }); it('registers loaded requests from RespondedRequest objects', function () { $paginator = new AbstractTestPaginator(nextUrl: 'https://www.example.com/bar'); $requestOne = 
new Request('GET', Url::parsePsr7('https://www.example.com/foo'), [], 'Hi'); $requestTwo = new Request('GET', Url::parsePsr7('https://www.example.com/bar'), [], 'Yo'); $paginator->processLoaded($requestOne, new RespondedRequest($requestTwo, new Response())); expect($paginator->getLoaded()) ->toBe(['d9e0c3987944f190782f5af9506eb478' => true]) ->and($paginator->getLoadedCount()) ->toBe(1) ->and($paginator->getLatestRequest()) ->toBe($requestTwo); }); it('knows when the max pages to load limit is reached', function () { $paginator = new AbstractTestPaginator(3); $respondedRequest = helper_getRespondedRequest(url: 'https://www.example.com/foo'); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->limitReached())->toBeFalse(); $respondedRequest = helper_getRespondedRequest(url: 'https://www.example.com/bar'); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->limitReached())->toBeFalse(); $respondedRequest = helper_getRespondedRequest(url: 'https://www.example.com/baz'); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->limitReached())->toBeTrue(); expect($paginator->hasFinished())->toBeTrue(); }); test('the same request is not registered twice', function () { $paginator = new AbstractTestPaginator(); $respondedRequest = helper_getRespondedRequest(); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->getLoadedCount())->toBe(1); $respondedRequest = helper_getRespondedRequest(); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->getLoadedCount())->toBe(1); }); it('logs a message when the max pages limit was reached', function () { $paginator = new AbstractTestPaginator(2); $respondedRequest = helper_getRespondedRequest(url: 'https://www.example.com/foo'); $paginator->processLoaded($respondedRequest->request, $respondedRequest); $logger = new DummyLogger(); 
$paginator->logWhenFinished($logger); expect($logger->messages[0])->toBe([ 'level' => 'info', 'message' => 'Finished paginating.', ]); $respondedRequest = helper_getRespondedRequest(url: 'https://www.example.com/bar'); $paginator->processLoaded($respondedRequest->request, $respondedRequest); $paginator->logWhenFinished($logger); expect($logger->messages[1])->toBe([ 'level' => 'notice', 'message' => 'Max pages limit reached.', ]); }); it('logs a message when it finished paginating', function () { $paginator = new AbstractTestPaginator(); $paginator->stopWhen(PaginatorStopRules::isEmptyResponse()); $respondedRequest = helper_getRespondedRequest(); $paginator->processLoaded($respondedRequest->request, $respondedRequest); $logger = new DummyLogger(); $paginator->logWhenFinished($logger); expect($logger->messages[0])->toBe([ 'level' => 'info', 'message' => 'Finished paginating.', ]); }); it('stops paginating when a stop condition is met', function () { $paginator = new AbstractTestPaginator(); $paginator ->stopWhen(PaginatorStopRules::isEmptyResponse()) ->stopWhen(PaginatorStopRules::isEmptyInJson('items')); $respondedRequest = helper_getRespondedRequest( url: 'https://www.example.com/list?page=1', responseBody: '{ "items": ["foo"] }', ); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeFalse(); $respondedRequest = helper_getRespondedRequest(url: 'https://www.example.com/list?page=2', responseBody: '{}'); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeTrue(); $paginator = new AbstractTestPaginator(); $paginator ->stopWhen(PaginatorStopRules::isEmptyResponse()) ->stopWhen(PaginatorStopRules::isEmptyInJson('items')); $respondedRequest = helper_getRespondedRequest( url: 'https://www.example.com/list?page=1', responseBody: '{ "items": [] }', ); $paginator->processLoaded($respondedRequest->request, $respondedRequest); 
expect($paginator->hasFinished())->toBeTrue(); }); test('after calling the setFinished() method, the hasFinished() method returns true', function () { $paginator = new AbstractTestPaginator(); expect($paginator->hasFinished())->toBeFalse(); $paginator->setFinished(); expect($paginator->hasFinished())->toBeTrue(); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/QueryParams/AbstractQueryParamManipulatorTest.php ================================================ currentParamValue = $this->getCurrentValue($query); return $query; } }; $manipulator->execute(Query::fromString('foo=bar')); expect($manipulator->currentParamValue)->toBe('bar'); }); it('gets the current value of a query param as integer', function () { $manipulator = new class ('foo') extends AbstractQueryParamManipulator { public int $currentParamValue = 0; public function execute(Query $query): Query { $this->currentParamValue = $this->getCurrentValueAsInt($query); return $query; } }; $manipulator->execute(Query::fromString('foo=123')); expect($manipulator->currentParamValue)->toBe(123); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/QueryParams/DecrementorTest.php ================================================ get('foo'))->toBe('20'); $decrementor->execute($query); expect($query->get('foo'))->toBe('10'); $decrementor->execute($query); expect($query->get('foo'))->toBe('0'); $decrementor->execute($query); expect($query->get('foo'))->toBe('-10'); }); it('reduces a non first level query param value by a certain number', function () { $decrementor = new Decrementor('foo.bar.baz', 7, true); $query = Query::fromString('foo[bar][baz]=10'); expect($decrementor->execute($query)->toString())->toBe('foo%5Bbar%5D%5Bbaz%5D=3'); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/QueryParams/IncrementorTest.php ================================================ get('foo'))->toBe('-10'); 
$incrementor->execute($query); expect($query->get('foo'))->toBe('0'); $incrementor->execute($query); expect($query->get('foo'))->toBe('10'); $incrementor->execute($query); expect($query->get('foo'))->toBe('20'); }); it('increments a non first level query param value by a certain number', function () { $incrementor = new Incrementor('foo.bar.baz', 7, true); $query = Query::fromString('foo[bar][baz]=3'); expect($incrementor->execute($query)->toString())->toBe('foo%5Bbar%5D%5Bbaz%5D=10'); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/QueryParamsPaginatorTest.php ================================================ increase('page') ->increase('offset', 20) ->decrease('foo', 10) ->decrease('bar', 20); $request = new Request('GET', 'https://www.example.com/list?page=1&offset=20&foo=40&bar=10'); $respondedRequest = new RespondedRequest($request, new Response()); $paginator->processLoaded($request, $respondedRequest); $nextRequest = $paginator->getNextRequest(); expect($nextRequest?->getUri()->__toString())->toBe('https://www.example.com/list?page=2&offset=40&foo=30&bar=-10'); }); it('increases and decreases values in query params in the body', function () { $paginator = QueryParamsPaginator::paramsInBody() ->increase('page') ->increase('offset', 20) ->decrease('foo', 10) ->decrease('bar', 20); $request = new Request('POST', 'https://www.example.com/list', body: 'page=1&offset=20&foo=40&bar=10'); $respondedRequest = new RespondedRequest($request, new Response()); $paginator->processLoaded($request, $respondedRequest); $nextRequest = $paginator->getNextRequest(); expect($nextRequest?->getMethod()) ->toBe('POST') ->and($nextRequest?->getUri()->__toString()) ->toBe('https://www.example.com/list') ->and($nextRequest?->getBody()->getContents()) ->toBe('page=2&offset=40&foo=30&bar=-10'); }); it('increases and decreases non first level (of query array) parameters using dot notation', function () { $paginator = 
QueryParamsPaginator::paramsInBody() ->increaseUsingDotNotation('pagination.page') ->increase('pagination.size', 5, true) ->decreaseUsingDotNotation('pagination2.page') ->decrease('pagination2.size', 5, true); $request = new Request( 'POST', 'https://www.example.com/list', body: 'pagination[page]=1&pagination[size]=25&pagination2[page]=1&pagination2[size]=25&foo=bar', ); $respondedRequest = new RespondedRequest($request, new Response()); $paginator->processLoaded($request, $respondedRequest); $nextRequest = $paginator->getNextRequest(); expect($nextRequest?->getBody()->getContents()) ->toBe( 'pagination%5Bpage%5D=2&pagination%5Bsize%5D=30&pagination2%5Bpage%5D=0&pagination2%5Bsize%5D=20&foo=bar', ); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/SimpleWebsitePaginatorTest.php ================================================ $links */ function helper_createResponseBodyWithPaginationLinks(array $links): string { $body = ''; } /** @var TestCase $this */ it('says it has finished when no initial response was provided yet', function () { $paginator = new SimpleWebsitePaginator('.pagination'); expect($paginator->hasFinished())->toBeTrue(); }); it('says it has finished when a response is provided, but it has no pagination links', function () { $paginator = new SimpleWebsitePaginator('.pagination', 3); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing', '
    '); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeTrue(); }); it('says it has not finished when an initial response with pagination links is provided', function () { $paginator = new SimpleWebsitePaginator('.pagination', 3); $responseBody = helper_createResponseBodyWithPaginationLinks([ '/listing?page=1' => 'First page', '/listing?page=2' => 'Next page', '/listing?page12' => 'Last page', ]); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeFalse(); }); it('has finished when the loaded pages count exceeds the max pages limit', function () { $paginator = new SimpleWebsitePaginator('.pagination', 3); $responseBody = helper_createResponseBodyWithPaginationLinks([ '/listing?page=1' => 'First page', '/listing?page=2' => 'Next page', '/listing?page12' => 'Last page', ]); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeFalse(); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeFalse(); $responseBody = helper_createResponseBodyWithPaginationLinks([ '/listing?page=1' => 'First page', '/listing?page=3' => 'Next page', '/listing?page12' => 'Last page', ]); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=2', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeTrue(); }); it('says it has finished when there are no more found pagination links, that haven\'t been loaded yet', function () { $paginator = new SimpleWebsitePaginator('.pagination', 
3); $responseBody = helper_createResponseBodyWithPaginationLinks(['/listing?page=2' => 'Page Two']); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeFalse(); $paginator->getNextRequest(); $responseBody = helper_createResponseBodyWithPaginationLinks(['/listing?page=2' => 'Page Two']); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=2', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeTrue(); }); it('finds pagination links when the selector matches the link itself', function () { $paginator = new SimpleWebsitePaginator('.nextPageLink', 3); $responseBody = 'Next Page'; $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->getNextRequest()?->getUri()->__toString())->toBe('https://www.example.com/listing?page=2'); }); it('finds pagination links when the selected element is a wrapper for pagination links', function () { $paginator = new SimpleWebsitePaginator('.pagination', 3); $responseBody = ''; $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->getNextRequest()?->getUri()->__toString())->toBe('https://www.example.com/listing?page=2'); }); it('finds all pagination links, when multiple elements match the pagination links selector', function () { $paginator = new SimpleWebsitePaginator('.pagination', 3); $responseBody = <<Next Page HTML; $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); 
expect($paginator->getNextRequest()?->getUri()->__toString())->toBe('https://www.example.com/listing?page=2') ->and($paginator->getNextRequest()?->getUri()->__toString())->toBe('https://www.example.com/listing?page=12'); }); it('logs that max pages limit was reached when it was reached', function () { $paginator = new SimpleWebsitePaginator('.pagination', 3); $responseBody = << Page One Page Two Page Three Page Four HTML; $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=2', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=3', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeTrue(); $paginator->logWhenFinished(new CliLogger()); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('Max pages limit reached'); }); it('logs that all found pagination links have been loaded when max pages limit was not reached', function () { $paginator = new SimpleWebsitePaginator('.pagination', 3); $responseBody = << Page One Page Two Page Three HTML; $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); $paginator->getNextRequest(); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=2', $responseBody); $paginator->logWhenFinished(new CliLogger()); $paginator->processLoaded($respondedRequest->request, $respondedRequest); $paginator->logWhenFinished(new CliLogger()); $paginator->getNextRequest(); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=3', $responseBody); 
$paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeTrue(); $paginator->logWhenFinished(new CliLogger()); $output = $this->getActualOutputForAssertion(); expect($output) ->not()->toContain('Max pages limit reached') ->and($output) ->toContain('All found pagination links loaded'); }); it( 'always creates upcoming requests from the parent request, where a link was found (which does not have to be ' . 'the latest processed response)', function () { $paginator = new SimpleWebsitePaginator('.pagination', 3); $responseBody = << Page One Page Two Page Three HTML; $respondedRequest = helper_getRespondedRequest( 'GET', 'https://www.example.com/list?page=1', ['foo' => 'bar'], responseBody: $responseBody, ); $paginator->processLoaded($respondedRequest->request, $respondedRequest); $responseBody = << Page One Page Two Page Three HTML; $respondedRequest = helper_getRespondedRequest( 'GET', 'https://www.example.com/list?page=2', ['foo' => 'baz'], responseBody: $responseBody, ); $paginator->processLoaded($respondedRequest->request, $respondedRequest); $nextRequest = $paginator->getNextRequest(); expect($nextRequest?->getHeader('foo'))->toBe(['bar']); }, ); it('cleans up the stored parent requests always when getting the next request to load', function () { $paginator = new class ('.pagination') extends SimpleWebsitePaginator { /** * @return array */ public function parentRequests(): array { return $this->parentRequests; } }; $responseBody = << Page Two Page Three HTML; $respondedRequest = helper_getRespondedRequest( 'GET', 'https://www.example.com/list?page=1', ['foo' => 'bar'], responseBody: $responseBody, ); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect(count($paginator->parentRequests()))->toBe(1); $nextRequest = $paginator->getNextRequest(); if (!$nextRequest) { $this->fail('failed to get next request'); } $respondedRequest = new RespondedRequest($nextRequest, new Response()); 
$paginator->processLoaded($respondedRequest->request, $respondedRequest); expect(count($paginator->parentRequests()))->toBe(1); $nextRequest = $paginator->getNextRequest(); if (!$nextRequest) { $this->fail('failed to get next request'); } $respondedRequest = new RespondedRequest($nextRequest, new Response()); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect(count($paginator->parentRequests()))->toBe(0); }); it('does not stop, when a response does not meet the stop rule criterion', function () { $paginator = new SimpleWebsitePaginator('.pagination', 3); $paginator->stopWhen(PaginatorStopRules::contains('hello world')); $responseBody = helper_createResponseBodyWithPaginationLinks(['/listing?page=2' => 'Next page']); $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing', $responseBody); $paginator->processLoaded($respondedRequest->request, $respondedRequest); expect($paginator->hasFinished())->toBeFalse(); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/StopRules/ContainsTest.php ================================================ shouldStop(new Request('GET', 'https://www.example.com/foo'), null))->toBeTrue(); }); it('stops when the string is contained in the response body', function () { $rule = PaginatorStopRules::contains('foo'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: 'This string contains foo'), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('does not stop when the string is not contained in the response body', function () { $rule = PaginatorStopRules::contains('foo'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: 'This does not contain the string'), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeFalse(); }); ================================================ FILE: 
tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyInHtmlTest.php ================================================ shouldStop(new Request('GET', 'https://www.crwl.io/'), null))->toBeTrue(); }); it('should stop, when response is not HTML', function () { $rule = PaginatorStopRules::isEmptyInHtml('#list .item'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: '{ "foo": "bar" }'), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should stop, when the selector target does not exist in the HTML response', function () { $rule = PaginatorStopRules::isEmptyInHtml('#list'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: '
    '), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should stop, when the selector target is empty in the response', function () { $rule = PaginatorStopRules::isEmptyInHtml('#list'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: '
    '), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should not stop, when the selector target is not empty in the response', function () { $rule = PaginatorStopRules::isEmptyInHtml('#list'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: '
    a
    '), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeFalse(); // Also if the content is only child elements. $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: '
    '), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeFalse(); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyInJsonTest.php ================================================ shouldStop(new Request('GET', 'https://www.crwl.io/'), null))->toBeTrue(); }); it('throws an exception when response is not valid JSON', function () { $rule = PaginatorStopRules::isEmptyInJson('data.items'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: ''), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); })->throws(InvalidJsonException::class); it('should stop, when the dot notation key does not exist in the response', function () { $rule = PaginatorStopRules::isEmptyInJson('data.items'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: '{ "data": { "foo": "bar" } }'), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should stop, when the dot notation key is empty in the response', function () { $rule = PaginatorStopRules::isEmptyInJson('data.items'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: '{ "data": { "items": [] } }'), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should not stop, when the dot notation key is not empty in the response', function () { $rule = PaginatorStopRules::isEmptyInJson('data.items'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: '{ "data": { "items": ["foo", "bar"] } }'), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeFalse(); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyInXmlTest.php 
================================================ shouldStop(new Request('GET', 'https://www.crwl.io/'), null))->toBeTrue(); }); it('should stop, when response is not XML', function () { $rule = PaginatorStopRules::isEmptyInXml('channel item'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: '{}'), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should stop, when the selector target does not exist in the XML response', function () { $rule = PaginatorStopRules::isEmptyInXml('channel item'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: ''), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should stop, when the selector target is empty in the response', function () { $rule = PaginatorStopRules::isEmptyInXml('channel item'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response( body: ' ', ), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should not stop, when the selector target is not empty in the response', function () { $rule = PaginatorStopRules::isEmptyInXml('channel item'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response( body: 'a', ), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeFalse(); // Also if the content is only child elements. 
$respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response( body: '', ), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeFalse(); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyResponseTest.php ================================================ shouldStop(new Request('GET', 'https://www.crwl.io/'), null))->toBeTrue(); }); it('should stop, when the response body is empty', function () { $rule = PaginatorStopRules::isEmptyResponse(); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: ''), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should stop, when the response body is only spaces', function () { $rule = PaginatorStopRules::isEmptyResponse(); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/'), new Response(body: " \n\r\t "), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should stop, when the response body is an empty JSON array', function () { $rule = PaginatorStopRules::isEmptyResponse(); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwlr.software/packages'), new Response(body: " [] "), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('should stop, when the response body is an empty JSON object', function () { $rule = PaginatorStopRules::isEmptyResponse(); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/en/home'), new Response(body: "{}"), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); ================================================ FILE: tests/Steps/Loading/Http/Paginators/StopRules/NotContainsTest.php ================================================ shouldStop(new Request('GET', 
'https://www.example.com/foo'), null))->toBeTrue(); }); it('stops when the string is not contained in the response body', function () { $rule = PaginatorStopRules::notContains('foo'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: 'This does not contain the string'), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeTrue(); }); it('does not stop when the string is contained in the response body', function () { $rule = PaginatorStopRules::notContains('foo'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/'), new Response(body: 'This contains the string foo'), ); expect($rule->shouldStop($respondedRequest->request, $respondedRequest))->toBeFalse(); }); ================================================ FILE: tests/Steps/Loading/HttpTest.php ================================================ shouldReceive('load')->once(); $step = (new Http('GET'))->setLoader($loader); helper_traverseIterable($step->invokeStep(new Input('https://www.foo.bar/baz'))); }); it('can be invoked with a PSR-7 Uri object as input', function () { $loader = Mockery::mock(HttpLoader::class); $loader->shouldReceive('load')->once(); $step = (new Http('GET'))->setLoader($loader); helper_traverseIterable($step->invokeStep(new Input(Url::parsePsr7('https://www.linkedin.com/')))); }); it('logs an error message when invoked with something else as input', function () { $logger = new DummyLogger(); $loader = Mockery::mock(HttpLoader::class); $step = (new Http('GET'))->setLoader($loader)->addLogger($logger); helper_traverseIterable($step->invokeStep(new Input(new stdClass()))); expect($logger->messages)->not->toBeEmpty() ->and($logger->messages[0]['message'])->toStartWith( 'The Crwlr\Crawler\Steps\Loading\Http step was called with input that it can not work with:', ) ->and($logger->messages[0]['message'])->toEndWith('. 
The invalid input is of type object.');
});

// Runs the step with a real HttpLoader (no mock) and the relative reference "/foo/bar" as input,
// then checks the first message collected by the DummyLogger.
it('logs an error message when invoked with a relative reference URI', function () {
    $dummyLogger = new DummyLogger();

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), logger: $dummyLogger);

    $httpStep = (new Http('GET'))
        ->setLoader($httpLoader)
        ->addLogger($dummyLogger);

    helper_invokeStepWithInput($httpStep, '/foo/bar');

    $logged = $dummyLogger->messages;

    expect($logged)->not->toBeEmpty()
        ->and($logged[0]['message'])->toBe(
            'Invalid input URL: /foo/bar - The URI is a relative reference and therefore can\'t be loaded.',
        );
});

it('catches the exception and logs an error when feeded with an invalid URL', function () {
    $loader = Mockery::mock(HttpLoader::class);

    $logger = new DummyLogger();

    $step = (new Http('GET'))->setLoader($loader);

    $step->addLogger($logger);

    helper_traverseIterable($step->invokeStep(new Input('https://')));

    expect($logger->messages)->toHaveCount(1)
        ->and($logger->messages[0]['level'])->toBe('error')
        ->and($logger->messages[0]['message'])->toBe(
            'The Crwlr\\Crawler\\Steps\\Loading\\Http step was called with input that it can not work with: https:// ' . 
'is not a valid URL.',
        );
});

// Same relative-reference input as the logging test, but with stopOnErrorResponse() enabled the
// invocation is expected to throw instead.
it('throws an exception when invoked with a relative reference URI and stopOnErrorResponse() was called', function () {
    $dummyLogger = new DummyLogger();

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), logger: $dummyLogger);

    $httpStep = (new Http('GET'))
        ->setLoader($httpLoader)
        ->addLogger($dummyLogger);

    $httpStep->stopOnErrorResponse();

    helper_invokeStepWithInput($httpStep, '/foo/bar');
})->throws(InvalidArgumentException::class);

// The mocked loader only accepts a request whose method equals the one passed to the constructor.
test('You can set the request method via constructor', function (string $httpMethod) {
    $loaderMock = Mockery::mock(HttpLoader::class);

    $loaderMock
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getMethod() === $httpMethod)
        ->once();

    if ($httpMethod !== 'GET') {
        $loaderMock->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
    }

    $httpStep = (new Http($httpMethod))->setLoader($loaderMock);

    helper_traverseIterable($httpStep->invokeStep(new Input('https://www.foo.bar/baz')));
})->with(['GET', 'POST', 'PUT', 'PATCH', 'DELETE']);

// Every header handed to the constructor has to show up on the request with exactly the same values.
test('You can set request headers via constructor', function () {
    $expectedHeaders = [
        'Accept' => [
            'text/html',
            'application/xhtml+xml',
            'application/xml;q=0.9',
            'image/avif',
            'image/webp',
            'image/apng',
            '*/*;q=0.8',
            'application/signed-exchange;v=b3;q=0.9',
        ],
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
        'Accept-Language' => ['de-DE', 'de;q=0.9', 'en-US;q=0.8', 'en;q=0.7'],
    ];

    $loaderMock = Mockery::mock(HttpLoader::class);

    $loaderMock
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) use ($expectedHeaders) {
            foreach ($expectedHeaders as $name => $expectedValues) {
                $actualValues = $request->getHeader($name);

                if (!$actualValues || $actualValues !== $expectedValues) {
                    return false;
                }
            }

            return true;
        })
        ->once();

    $httpStep = (new Http('GET', $expectedHeaders))->setLoader($loaderMock);

    helper_traverseIterable($httpStep->invokeStep(new Input('https://www.crwlr.software/packages/url')));
});

test('You can set request body via constructor', function () {
    $loader = Mockery::mock(HttpLoader::class);

    $body = 'This is the request 
body';

    $loader->shouldReceive('load')->withArgs(function (RequestInterface $request) use ($body) {
        return $request->getBody()->getContents() === $body;
    })->once();

    $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();

    $step = (new Http('PATCH', [], $body))->setLoader($loader);

    helper_traverseIterable($step->invokeStep(new Input('https://github.com/')));
});

// The fourth constructor argument is expected to end up as the protocol version of the request.
test('You can set the http version for the request via constructor', function (string $httpVersion) {
    $loaderMock = Mockery::mock(HttpLoader::class);

    $loaderMock
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getProtocolVersion() === $httpVersion)
        ->once();

    $loaderMock->shouldReceive('usesHeadlessBrowser')->andReturnFalse();

    $httpStep = (new Http('PATCH', [], 'body', $httpVersion))->setLoader($loaderMock);

    helper_traverseIterable($httpStep->invokeStep(new Input('https://packagist.org/packages/crwlr/url')));
})->with(['1.0', '1.1', '2.0']);

// Calls the static factory named after the lower-cased HTTP method (Http::get(), Http::post(), ...).
it('has static methods to create instances with all the different http methods', function (string $httpMethod) {
    $loaderMock = Mockery::mock(HttpLoader::class);

    $loaderMock
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getMethod() === $httpMethod)
        ->once();

    if ($httpMethod !== 'GET') {
        $loaderMock->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
    }

    $factoryMethod = strtolower($httpMethod);

    $httpStep = Http::{$factoryMethod}()->setLoader($loaderMock);

    helper_traverseIterable($httpStep->invokeStep(new Input('https://dev.to/otsch')));
})->with(['GET', 'POST', 'PUT', 'PATCH', 'DELETE']);

it(
    'calls the loadOrFail() loader method when the stopOnErrorResponse() method was called',
    function (string $httpMethod) {
        $loader = Mockery::mock(HttpLoader::class);

        $loader->shouldReceive('loadOrFail')->withArgs(function (RequestInterface $request) use ($httpMethod) {
            return $request->getMethod() === $httpMethod;
        })->once()->andReturn(new RespondedRequest(new Request('GET', '/foo'), new Response(200)));

        if ($httpMethod !== 'GET') 
{
            $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
        }

        $step = (Http::{strtolower($httpMethod)}())
            ->setLoader($loader)
            ->stopOnErrorResponse();

        helper_traverseIterable($step->invokeStep(new Input('https://example.com/otsch')));
    },
)->with(['GET', 'POST', 'PUT', 'PATCH', 'DELETE']);

// keep() with the aliases url, status, headers and body should expose the matching parts of the
// RespondedRequest returned by the mocked loader.
test('you can keep response properties with their aliases', function () {
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/testresponse'),
        new Response(202, ['foo' => 'bar'], Utils::streamFor('testbody')),
    );

    $loaderMock = Mockery::mock(HttpLoader::class);

    $loaderMock->shouldReceive('load')->once()->andReturn($respondedRequest);

    $httpStep = Http::get()
        ->setLoader($loaderMock)
        ->keep(['url', 'status', 'headers', 'body']);

    $outputs = helper_invokeStepWithInput($httpStep);

    $expectedKeep = [
        'url' => 'https://www.example.com/testresponse',
        'status' => 202,
        'headers' => ['foo' => ['bar']],
        'body' => 'testbody',
    ];

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->keep)->toBe($expectedKeep);
});

// After a redirect URI was added to the RespondedRequest, both the url and the uri alias are
// expected to yield that redirect target instead of the originally requested URL.
test(
    'the value behind url and uri is the effectiveUri',
    function (string $outputKey) {
        $redirectedResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/testresponse'),
            new Response(202, ['foo' => 'bar'], Utils::streamFor('testbody')),
        );

        $redirectedResponse->addRedirectUri('https://www.example.com/testresponseredirect');

        $loaderMock = Mockery::mock(HttpLoader::class);

        $loaderMock->shouldReceive('load')->once()->andReturn($redirectedResponse);

        $httpStep = Http::get()
            ->setLoader($loaderMock)
            ->keep([$outputKey]);

        $outputs = helper_invokeStepWithInput($httpStep);

        expect($outputs)->toHaveCount(1)
            ->and($outputs[0]->keep)->toBe([$outputKey => 'https://www.example.com/testresponseredirect']);
    },
)->with(['url', 'uri']);

it('gets the URL for the request from an input array when useInputKeyAsUrl() was called', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
    ];

    $loader = Mockery::mock(HttpLoader::class);

    $loader->shouldReceive('load')->withArgs(function (RequestInterface $request) use ($inputArray) {
return $request->getUri()->__toString() === $inputArray['someUrl']; })->once()->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200))); $step = Http::get() ->setLoader($loader) ->useInputKeyAsUrl('someUrl'); helper_invokeStepWithInput($step, $inputArray); }); it( 'automatically gets the URL for the request from an input array when it contains an url or uri key', function ($key) { $inputArray = [ 'foo' => 'bar', $key => 'https://www.example.com/baz', ]; $loader = Mockery::mock(HttpLoader::class); $loader->shouldReceive('load')->withArgs(function (RequestInterface $request) use ($inputArray, $key) { return $request->getUri()->__toString() === $inputArray[$key]; })->once()->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200))); $step = Http::get() ->setLoader($loader); helper_invokeStepWithInput($step, $inputArray); }, )->with(['url', 'uri']); it('gets the body for the request from an input array when useInputKeyAsBody() was called', function () { $inputArray = [ 'foo' => 'bar', 'someUrl' => 'https://www.example.com/baz', 'someBodyThatIUsedToKnow' => 'foo=bar&baz=quz', ]; $loader = Mockery::mock(HttpLoader::class); $loader ->shouldReceive('load') ->withArgs(function (RequestInterface $request) use ($inputArray) { return $request->getBody()->getContents() === $inputArray['someBodyThatIUsedToKnow']; }) ->once() ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200))); $step = Http::get() ->setLoader($loader) ->useInputKeyAsUrl('someUrl') ->useInputKeyAsBody('someBodyThatIUsedToKnow'); helper_invokeStepWithInput($step, $inputArray); }); it('gets as single header for the request from an input array when useInputKeyAsHeader() was called', function () { $inputArray = [ 'foo' => 'bar', 'someUrl' => 'https://www.example.com/baz', 'someHeader' => 'someHeaderValue', ]; $loader = Mockery::mock(HttpLoader::class); $loader 
->shouldReceive('load') ->withArgs(function (RequestInterface $request) use ($inputArray) { return $request->getHeader('header-name-x') === [$inputArray['someHeader']]; }) ->once() ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200))); $step = Http::get() ->setLoader($loader) ->useInputKeyAsUrl('someUrl') ->useInputKeyAsHeader('someHeader', 'header-name-x'); helper_invokeStepWithInput($step, $inputArray); }); it('uses the input key as header name if no header name defined as argument', function () { $inputArray = [ 'foo' => 'bar', 'url' => 'https://www.example.com/baz', 'header-name' => 'someHeaderValue', ]; $loader = Mockery::mock(HttpLoader::class); $loader ->shouldReceive('load') ->withArgs(function (RequestInterface $request) use ($inputArray) { return $request->getHeader('header-name') === [$inputArray['header-name']]; }) ->once() ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200))); $step = Http::get() ->setLoader($loader) ->useInputKeyAsHeader('header-name'); helper_invokeStepWithInput($step, $inputArray); }); it('merges header values if you provide a static header value and use an input value as header', function () { $inputArray = [ 'foo' => 'bar', 'someUrl' => 'https://www.example.com/baz', 'someHeader' => 'someHeaderValue', ]; $loader = Mockery::mock(HttpLoader::class); $loader ->shouldReceive('load') ->withArgs(function (RequestInterface $request) use ($inputArray) { return $request->getHeader('header-name-x') === ['foo', $inputArray['someHeader']]; }) ->once() ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200))); $step = Http::get(['header-name-x' => 'foo']) ->setLoader($loader) ->useInputKeyAsUrl('someUrl') ->useInputKeyAsHeader('someHeader', 'header-name-x'); helper_invokeStepWithInput($step, $inputArray); }); test('you can use useInputKeyAsHeader() multiple times', function () { $inputArray = [ 'foo' 
=> 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'someHeader' => 'someHeaderValue',
        'anotherHeader' => 'anotherHeaderValue',
    ];

    $loader = Mockery::mock(HttpLoader::class);

    $loader
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) use ($inputArray) {
            return $request->getHeader('header-name-x') === [$inputArray['someHeader']] &&
                $request->getHeader('header-name-y') === [$inputArray['anotherHeader']];
        })
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

    $step = Http::get()
        ->setLoader($loader)
        ->useInputKeyAsUrl('someUrl')
        ->useInputKeyAsHeader('someHeader', 'header-name-x')
        ->useInputKeyAsHeader('anotherHeader', 'header-name-y');

    helper_invokeStepWithInput($step, $inputArray);
});

// useInputKeyAsHeaders() takes a whole array of headers from the input. For header-name-y the
// static value 'quz' is expected first, followed by the values coming from the input.
it('gets multiple headers from an input array using useInputKeyAsHeaders()', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'customHeaders' => [
            'header-name-x' => 'foo',
            'header-name-y' => ['bar', 'baz'],
        ],
    ];

    $loaderMock = Mockery::mock(HttpLoader::class);

    $loaderMock
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) use ($inputArray) {
            $fromInput = $inputArray['customHeaders'];

            $expectedX = [$fromInput['header-name-x']];

            $expectedY = ['quz', ...$fromInput['header-name-y']];

            if ($request->getHeader('header-name-x') !== $expectedX) {
                return false;
            }

            return $request->getHeader('header-name-y') === $expectedY;
        })
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

    $httpStep = Http::get(['header-name-y' => 'quz'])
        ->setLoader($loaderMock)
        ->useInputKeyAsUrl('someUrl')
        ->useInputKeyAsHeaders('customHeaders');

    helper_invokeStepWithInput($httpStep, $inputArray);
});

it('uses a static URL when defined', function () {
    $input = 'foo';

    $loader = Mockery::mock(HttpLoader::class);

    $loader
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) {
            return $request->getUri()->__toString() === 
'https://www.example.com/servus';
        })
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/servus'), new Response(200)));

    $step = Http::get()
        ->setLoader($loader)
        ->staticUrl('https://www.example.com/servus');

    helper_invokeStepWithInput($step, $input);
});

// The [crwl:...] placeholders in the static URL (quoted and unquoted key) are expected to be
// replaced with the values of the matching input array keys.
it('resolves variables in a static URL from input data', function () {
    $input = ['one' => 'foo', 'two' => 'bar'];

    $loaderMock = Mockery::mock(HttpLoader::class);

    $loaderMock->shouldReceive('usesHeadlessBrowser')->andReturn(false);

    $loaderMock
        ->shouldReceive('load')
        ->withArgs(
            fn (RequestInterface $request) => (string) $request->getUri() === 'https://www.example.com/foo/bar/baz',
        )
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/foo/bar/baz'), new Response(200)));

    $httpStep = Http::get()
        ->setLoader($loaderMock)
        ->staticUrl('https://www.example.com/[crwl:\'one\']/[crwl:two]/baz');

    helper_invokeStepWithInput($httpStep, $input);
});

// Same placeholder mechanism, this time inside the request body of a POST step.
it('resolves variables in the request body from input data', function () {
    $input = [
        'url' => 'https://www.example.com/foo',
        'hey' => 'ho',
        'yo' => 'lo',
    ];

    $loaderMock = Mockery::mock(HttpLoader::class);

    $loaderMock->shouldReceive('usesHeadlessBrowser')->andReturn(false);

    $loaderMock
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => Http::getBodyString($request) === 'Ho ho ho and lo asdf')
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/foo'), new Response(200)));

    $httpStep = Http::post(body: 'Ho ho [crwl:hey] and [crwl:yo] asdf')->setLoader($loaderMock);

    helper_invokeStepWithInput($httpStep, $input);
});

it('resolves variables in request headers from input data', function () {
    $input = [
        'url' => 'https://www.example.com/foo',
        'encoding' => 'deflate, br',
        'language' => 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
    ];

    $loader = Mockery::mock(HttpLoader::class);

    $loader
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) {
            return 
$request->getHeaderLine('Accept-Encoding') === 'gzip, deflate, br, zstd' &&
                $request->getHeaderLine('Accept-Language') === 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7';
        })
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/foo'), new Response(200)));

    $step = Http::get([
        'Accept-Encoding' => 'gzip, [crwl:"encoding"], zstd',
        'Accept-Language' => '[crwl:language]',
    ])
        ->setLoader($loader);

    helper_invokeStepWithInput($step, $input);
});

// Installs a temporary error handler that records every E_WARNING raised while the body string is
// read from a response that claims to be gzip but carries plain text.
test(
    'the getBodyString() method does not generate a warning, when the response contains a ' .
    'Content-Type: application/x-gzip header, but the content actually isn\'t compressed',
    function () {
        $warnings = [];

        set_error_handler(function ($errno, $errstr) use (&$warnings) {
            if ($errno === E_WARNING) {
                $warnings[] = $errstr;
            }

            // Returning false lets PHP's standard error handling continue as usual.
            return false;
        });

        // The handler must be removed even when the code under test throws, otherwise it stays
        // registered for every test that runs afterwards.
        try {
            $response = helper_getRespondedRequest(
                url: 'https://example.com/yolo',
                responseHeaders: ['Content-Type' => 'application/x-gzip'],
                responseBody: 'Servas!',
            );

            $string = Http::getBodyString($response);
        } finally {
            restore_error_handler();
        }

        expect($warnings)->toBeEmpty()
            ->and($string)->toBe('Servas!');
    },
);

it('rejects post browser navigate hooks, when the HTTP method is not GET', function (string $httpMethod) {
    $logger = new DummyLogger();

    $step = (new Http($httpMethod))->addLogger($logger)->postBrowserNavigateHook(BrowserAction::wait(1.0));

    expect($logger->messages)->toHaveCount(1)
        ->and($logger->messages[0]['message'])->toBe(
            'A ' . $httpMethod . ' request cannot be executed using the (headless) browser, so post browser ' . 
'navigate hooks can\'t be defined for this step either.',
        )
        ->and(invade($step)->postBrowserNavigateHooks)->toBe([]);
})->with(['POST', 'PUT', 'PATCH', 'DELETE']);

// skipCache() on the step should result in exactly one skipCacheForNextRequest() call on the loader.
// NOTE(review): the expectations below only assert call counts, not the relative order that the
// description mentions.
it(
    'calls the HttpLoader::skipCacheForNextRequest() method before calling load when the skipCache() method was called',
    function () {
        $loaderMock = Mockery::mock(HttpLoader::class);

        $blogResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/blog/posts'),
            new Response(200, body: Utils::streamFor('blog posts')),
        );

        $loaderMock->shouldReceive('skipCacheForNextRequest')->once();

        $loaderMock->shouldReceive('load')->once()->andReturn($blogResponse);

        $httpStep = Http::get()
            ->setLoader($loaderMock)
            ->skipCache();

        helper_invokeStepWithInput($httpStep);
    },
);

// Same as above, but with stopOnErrorResponse() the loader's loadOrFail() is the expected load call.
// NOTE(review): as above, only call counts are asserted, not the call order.
it(
    'calls the HttpLoader::skipCacheForNextRequest() method before calling loadOrFail() when the skipCache() method ' .
    'was called',
    function () {
        $loaderMock = Mockery::mock(HttpLoader::class);

        $blogResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/blog/posts'),
            new Response(200, body: Utils::streamFor('blog posts')),
        );

        $loaderMock->shouldReceive('skipCacheForNextRequest')->once();

        $loaderMock->shouldReceive('loadOrFail')->once()->andReturn($blogResponse);

        $httpStep = Http::get()
            ->setLoader($loaderMock)
            ->skipCache()
            ->stopOnErrorResponse();

        helper_invokeStepWithInput($httpStep);
    },
);

it(
    'switches the loader to use the browser, when useBrowser() was called and the loader is configured to use the ' . 
'HTTP client',
    function () {
        // The loader reports that it currently uses the HTTP client, so one switch to the browser
        // and one switch back to the HTTP client are expected.
        $loaderMock = Mockery::mock(HttpLoader::class);

        $loaderMock->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);

        $loaderMock->shouldReceive('useHeadlessBrowser')->once();

        $loaderMock->shouldReceive('useHttpClient')->once();

        $helloResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loaderMock->shouldReceive('load')->once()->andReturn($helloResponse);

        $httpStep = Http::get()
            ->setLoader($loaderMock)
            ->useBrowser();

        helper_invokeStepWithInput($httpStep);
    },
);

// Same switching expectations, with loadOrFail() as the load call because stopOnErrorResponse() is set.
it(
    'switches the loader to use the browser, when stopOnErrorResponse() and useBrowser() was called and the loader ' .
    'is configured to use the HTTP client',
    function () {
        $loaderMock = Mockery::mock(HttpLoader::class);

        $loaderMock->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);

        $loaderMock->shouldReceive('useHeadlessBrowser')->once();

        $loaderMock->shouldReceive('useHttpClient')->once();

        $helloResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loaderMock->shouldReceive('loadOrFail')->once()->andReturn($helloResponse);

        $httpStep = Http::get()
            ->setLoader($loaderMock)
            ->stopOnErrorResponse()
            ->useBrowser();

        helper_invokeStepWithInput($httpStep);
    },
);

it(
    'does not switch the loader to use the browser, when useBrowser() was called, the loader is configured to use ' . 
'the HTTP client, but the request method is not GET',
    function (string $httpMethod) {
        // For non-GET factories the loader must never be switched to the browser; a single log
        // message explaining the fallback to the HTTP client is expected instead.
        $dummyLogger = new DummyLogger();

        $loaderMock = Mockery::mock(HttpLoader::class);

        $loaderMock->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);

        $loaderMock->shouldNotReceive('useHeadlessBrowser');

        $somethingResponse = new RespondedRequest(
            new Request($httpMethod, 'https://www.example.com/something'),
            new Response(200, body: Utils::streamFor('Something!')),
        );

        $loaderMock->shouldReceive('load')->once()->andReturn($somethingResponse);

        $httpStep = Http::{$httpMethod}()
            ->setLoader($loaderMock)
            ->addLogger($dummyLogger)
            ->useBrowser();

        helper_invokeStepWithInput($httpStep);

        $logged = $dummyLogger->messages;

        expect($logged)->toHaveCount(1)
            ->and($logged[0]['message'])->toBe(
                'The (headless) browser can only be used for GET requests! Therefore this step will use the HTTP ' .
                'client for loading.',
            );
    },
)->with(['post', 'put', 'patch', 'delete']);

it(
    'automatically switches the loader to use the HTTP client, when the HTTP method is not GET and the loader is ' .
    'configured to use the browser',
    function (string $httpMethod) {
        $logger = new DummyLogger();

        $loader = Mockery::mock(HttpLoader::class);

        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(true);

        $loader->shouldReceive('useHttpClient')->once();

        $loader->shouldReceive('useHeadlessBrowser')->once();

        $respondedRequest = new RespondedRequest(
            new Request($httpMethod, 'https://www.example.com/something'),
            new Response(200, body: Utils::streamFor('Something!')),
        );

        $loader->shouldReceive('load')->once()->andReturn($respondedRequest);

        $step = Http::{$httpMethod}()->setLoader($loader)->addLogger($logger)->useBrowser();

        helper_invokeStepWithInput($step);

        expect($logger->messages)->toHaveCount(1)
            ->and($logger->messages[0]['message'])->toBe(
                'The (headless) browser can only be used for GET requests! Therefore this step will use the HTTP ' . 
'client for loading.',
            );
    },
)->with(['post', 'put', 'patch', 'delete']);

// loadOrFail() throws here; the mock still expects exactly one useHttpClient() call, i.e. the
// switch back has to happen although loading failed.
it(
    'switches back the loader to use the HTTP client, when stopOnErrorResponse() and useBrowser() was called and ' .
    'loading throws an exception',
    function () {
        $loaderMock = Mockery::mock(HttpLoader::class);

        $loaderMock->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);

        $loaderMock->shouldReceive('useHeadlessBrowser')->once();

        $loaderMock->shouldReceive('useHttpClient')->once();

        $loaderMock->shouldReceive('loadOrFail')->once()->andThrow(new LoadingException('error message'));

        $httpStep = Http::get()
            ->setLoader($loaderMock)
            ->stopOnErrorResponse()
            ->useBrowser();

        try {
            helper_invokeStepWithInput($httpStep);
        } catch (Throwable) {
            // Deliberately ignored: only the mock expectations above matter for this test.
        }
    },
);

// The loader already reports browser mode, so neither switch method may be called.
it(
    'does not call the useHeadlessBrowser() method of the loader, when useBrowser() was called and the loader is ' .
    'already configured to use the browser',
    function () {
        $loaderMock = Mockery::mock(HttpLoader::class);

        $loaderMock->shouldReceive('usesHeadlessBrowser')->once()->andReturn(true);

        $loaderMock->shouldNotReceive('useHeadlessBrowser');

        $loaderMock->shouldNotReceive('useHttpClient');

        $helloResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loaderMock->shouldReceive('load')->once()->andReturn($helloResponse);

        $httpStep = Http::get()
            ->setLoader($loaderMock)
            ->useBrowser();

        helper_invokeStepWithInput($httpStep);
    },
);

it(
    'does not call the useHeadlessBrowser() method of the loader, when stopOnErrorResponse() and useBrowser() was ' . 
'called and the loader is already configured to use the browser', function () { $loader = Mockery::mock(HttpLoader::class); $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(true); $loader->shouldNotReceive('useHeadlessBrowser'); $loader->shouldNotReceive('useHttpClient'); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/hello/world'), new Response(200, body: Utils::streamFor('Hello World!')), ); $loader->shouldReceive('loadOrFail')->once()->andReturn($respondedRequest); $step = Http::get()->setLoader($loader)->stopOnErrorResponse()->useBrowser(); helper_invokeStepWithInput($step); }, ); it( 'sets post browser navigate hooks, when useBrowser() was called and the loader is configured to use the HTTP ' . 'client', function () { $loader = Mockery::mock(HttpLoader::class)->makePartial(); $browserHelperMock = Mockery::mock(HeadlessBrowserLoaderHelper::class); $loader->shouldReceive('browser')->andReturn($browserHelperMock); $browserHelperMock ->shouldReceive('setTempPostNavigateHooks') ->once() ->withArgs(function (array $hooks) { return $hooks[0] instanceof Closure; }); $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.example.com/woop'), new Response(200, body: Utils::streamFor('Woop')), ); $loader->shouldReceive('load')->once()->andReturn($respondedRequest); $step = Http::get()->setLoader($loader)->useBrowser()->postBrowserNavigateHook(BrowserAction::wait(1.0)); helper_invokeStepWithInput($step); }, ); ================================================ FILE: tests/Steps/Loading/LoadingStepTest.php ================================================ */ use LoadingStep; protected function invoke(mixed $input): Generator { $this->getLoader()->load($input); yield []; } }; $loader = Mockery::mock(HttpLoader::class); $loader->shouldReceive('load')->once(); $step->setLoader($loader); helper_traverseIterable($step->invokeStep(new Input('https://www.digitalocean.com/blog'))); }); test( 'you can provide a 
custom loader to a step via the withLoader() method, and it will be preferred to the loader ' . 'provided via setLoader()', function () { $loaderOne = Mockery::mock(Loader::class); $loaderOne->shouldNotReceive('load'); $loaderTwo = Mockery::mock(Loader::class); $loaderTwo->shouldReceive('load')->once()->andReturn('Hi'); $step = new class extends Step { /** * @use LoadingStep */ use LoadingStep; protected function invoke(mixed $input): Generator { yield $this->getLoader()->load($input); } }; $step->withLoader($loaderTwo); // The crawler will call the setLoader() method of the step after the step was added to the crawler. // So, the call to withLoader() will happen before that. // Nevertheless, the loader passed to withLoader() should be preferred. $step->setLoader($loaderOne); helper_invokeStepWithInput($step); }, ); ================================================ FILE: tests/Steps/Refiners/AbstractRefinerTest.php ================================================ logger?->info('logging works'); return $value; } public function testLogTypeWarning(): void { $this->logTypeWarning('Some::staticMethodName()', 'foo'); } } /** @var TestCase $this */ it('takes a logger that can be used in the Refiner', function () { $refiner = new SomeRefiner(); $refiner->addLogger(new CliLogger()); $refiner->refine('foo'); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput)->toContain('logging works'); }); it('provides a method for children to log a warning if the type of the incoming value is wrong', function () { (new SomeRefiner())->addLogger(new CliLogger())->testLogTypeWarning(); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput)->toContain('Refiner Some::staticMethodName() can\'t be applied to value of type string'); }); ================================================ FILE: tests/Steps/Refiners/DateTime/DateTimeFormatTest.php ================================================ refine($from); expect($refinedValue)->toBe($to); })->with([ 
['2024-09-21T13:55:41Z', '2024-09-21 13:55:41'], ['2024-09-21T13:55:41.000Z', '2024-09-21 13:55:41'], ['2024-09-21', '2024-09-21 00:00:00'], ['2024-09-21, 13:55:41', '2024-09-21 13:55:41'], ['21 September 2024, 13:55:41', '2024-09-21 13:55:41'], ['21. September 2024, 13:55:41', '2024-09-21 13:55:41'], ['21 September 2024', '2024-09-21 00:00:00'], ['21. September 2024', '2024-09-21 00:00:00'], ['21.09.2024', '2024-09-21 00:00:00'], ['21.09.2024 13:55', '2024-09-21 13:55:00'], ['21.09.2024 13:55:41', '2024-09-21 13:55:41'], ['Sat, 21 September 2024 13:55:41 +0000', '2024-09-21 13:55:41'], ['Sat Sep 21 2024 16:55:41 GMT+0100', '2024-09-21 15:55:41'], ]); it('reformats a format that PHP\'s strtotime() does not know, when the origin format is provided', function () { $refinedValue = DateTimeRefiner::reformat('Y-m-d H:i:s', 'd. F Y \u\m H:i:s') ->refine('21. September 2024 um 13:55:41'); expect($refinedValue)->toBe('2024-09-21 13:55:41'); }); it('logs a warning message (and keeps original input) when it wasn\'t able to auto-convert a date time string', function () { $refiner = DateTimeRefiner::reformat('Y-m-d H:i:s'); $logger = new DummyLogger(); $refiner->addLogger($logger); $refinedValue = $refiner->refine('21. September 2024 um 13:55:41'); expect($logger->messages)->toHaveCount(1) ->and($logger->messages[0]['level'])->toBe('warning') ->and($logger->messages[0]['message'])->toStartWith('Failed to automatically (without known format) parse') ->and($refinedValue)->toBe('21. September 2024 um 13:55:41'); }); it( 'logs a warning message (and keeps original input) when it wasn\'t able to convert a date time string with the ' . 'given origin format', function () { $refiner = DateTimeRefiner::reformat('Y-m-d H:i:s', 'd. F Y um H:i:s'); $logger = new DummyLogger(); $refiner->addLogger($logger); $refinedValue = $refiner->refine('21. 
September 2024 um 13:55:41'); expect($logger->messages)->toHaveCount(1) ->and($logger->messages[0]['level'])->toBe('warning') ->and($logger->messages[0]['message'])->toStartWith('Failed parsing date/time ') ->and($refinedValue)->toBe('21. September 2024 um 13:55:41'); }, ); it('reformats an array of date time strings', function () { $refinedValue = DateTimeRefiner::reformat('Y-m-d H:i:s')->refine([ '2024-09-21T13:55:41Z', '2024-09-21T13:55:41.000Z', '2024-09-21', ]); expect($refinedValue)->toBe([ '2024-09-21 13:55:41', '2024-09-21 13:55:41', '2024-09-21 00:00:00', ]); }); ================================================ FILE: tests/Steps/Refiners/Html/RemoveFromHtmlTest.php ================================================

    Hi!

    remove this!
    HTML; $refinedValue = HtmlRefiner::remove('#foo')->refine($html); expect($refinedValue)->not()->toContain('remove this!') ->and($refinedValue)->toContain('

    Hi!

    '); }); it('removes a certain node from an HTML snippet by selector', function () { $html = <<

    Hi!

    remove this!

    HTML; $refinedValue = HtmlRefiner::remove('#foo')->refine($html); expect($refinedValue)->not()->toContain('remove this!') ->and($refinedValue)->toContain('

    Hi!

    ') ->and($refinedValue)->not()->toContain(''); }); it('removes multiple nodes from an HTML snippet by selector', function () { $html = <<
    • foo
    • bar
    • baz
    • quz
    HTML; $refinedValue = HtmlRefiner::remove('#list .remove')->refine($html); expect($refinedValue)->not()->toContain('bar') ->and($refinedValue)->not()->toContain('quz') ->and($refinedValue)->toContain('
  • foo
  • ') ->and($refinedValue)->toContain('
  • baz
  • ') ->and($refinedValue)->not()->toContain(''); }); it('removes multiple nodes from HTML by xpath query', function () { $html = <<
    • foo
    • bar
    • baz
    • quz
    HTML; $refinedValue = HtmlRefiner::remove(Dom::xPath('//li[contains(@class, \'remove\')]'))->refine($html); expect($refinedValue)->not()->toContain('bar') ->and($refinedValue)->not()->toContain('quz') ->and($refinedValue)->toContain('
  • foo
  • ') ->and($refinedValue)->toContain('
  • baz
  • ') ->and($refinedValue)->not()->toContain(''); }); it('removes node from an array of HTML snippets', function () { $html = [ <<
  • foo
  • bar
  • baz
  • quz
  • HTML, <<
  • lorem
  • ipsum
  • dolor
  • sit
  • HTML, ]; $refinedValue = HtmlRefiner::remove('.remove')->refine($html); expect($refinedValue[0])->not()->toContain('bar') ->and($refinedValue[0])->not()->toContain('quz') ->and($refinedValue[1])->not()->toContain('ipsum') ->and($refinedValue[1])->not()->toContain('sit'); }); ================================================ FILE: tests/Steps/Refiners/String/AfterFirstTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner StringRefiner::afterFirst() can\'t be applied to value of type ' . gettype($value)) ->and($refinedValue)->toBe($value); })->with([ [123], [12.3], [true], ]); it('works with an array of strings as value', function () { $refinedValue = StringRefiner::afterFirst('a') ->addLogger(new CliLogger()) ->refine(['foo a bar a baz', 'lorem a ipsum a dolor']); expect($refinedValue)->toBe(['bar a baz', 'ipsum a dolor']); }); it('returns the string after first occurrence of another string', function () { expect(StringRefiner::afterFirst('foo')->refine('yo lo foo boo choo foo gnu'))->toBe('boo choo foo gnu'); }); it('returns the full string if the string to look for is empty', function () { expect(StringRefiner::afterFirst('')->refine('yo lo foo boo choo'))->toBe('yo lo foo boo choo'); }); it('returns the full string when the string to look for is not contained', function () { expect(StringRefiner::afterFirst('moo')->refine('yo lo foo boo choo'))->toBe('yo lo foo boo choo'); }); ================================================ FILE: tests/Steps/Refiners/String/AfterLastTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner StringRefiner::afterLast() can\'t be applied to value of type ' . 
gettype($value)) ->and($refinedValue)->toBe($value); })->with([ [123], [12.3], [true], ]); it('works with an array of strings as value', function () { $refinedValue = StringRefiner::afterLast('a') ->addLogger(new CliLogger()) ->refine(['foo a bar a baz', 'lorem a ipsum a dolor']); expect($refinedValue)->toBe(['z', 'dolor']); }); it('returns the string after last occurrence of another string', function () { expect(StringRefiner::afterLast('foo')->refine('yo lo foo boo choo foo gnu'))->toBe('gnu'); }); it('returns an empty string if the string to look for is empty', function () { expect(StringRefiner::afterLast('')->refine('yo lo foo boo choo'))->toBe(''); }); it('returns the full string when the string to look for is not contained', function () { expect(StringRefiner::afterLast('moo')->refine('yo lo foo boo choo'))->toBe('yo lo foo boo choo'); }); ================================================ FILE: tests/Steps/Refiners/String/BeforeFirstTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner StringRefiner::beforeFirst() can\'t be applied to value of type ' . 
gettype($value)) ->and($refinedValue)->toBe($value); })->with([ [123], [12.3], [true], ]); it('works with an array of strings as value', function () { $refinedValue = StringRefiner::beforeFirst('a') ->addLogger(new CliLogger()) ->refine(['foo a bar a baz', 'lorem a ipsum a dolor']); expect($refinedValue)->toBe(['foo', 'lorem']); }); it('returns the string before the first occurrence of another string', function () { expect(StringRefiner::beforeFirst('foo')->refine('yo lo foo boo choo foo gnu'))->toBe('yo lo'); }); it('returns an empty string if the string to look for is empty', function () { expect(StringRefiner::beforeFirst('')->refine('yo lo foo boo choo'))->toBe(''); }); it('returns the full string when the string to look for is not contained', function () { expect(StringRefiner::beforeFirst('moo')->refine('yo lo foo boo choo'))->toBe('yo lo foo boo choo'); }); ================================================ FILE: tests/Steps/Refiners/String/BeforeLastTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner StringRefiner::beforeLast() can\'t be applied to value of type ' . 
gettype($value)) ->and($refinedValue)->toBe($value); })->with([ [123], [12.3], [true], ]); it('works with an array of strings as value', function () { $refinedValue = StringRefiner::beforeLast('a') ->addLogger(new CliLogger()) ->refine(['foo a bar a baz', 'lorem a ipsum a dolor']); expect($refinedValue)->toBe(['foo a bar a b', 'lorem a ipsum']); }); it('returns the string before the last occurrence of another string', function () { expect(StringRefiner::beforeLast('foo')->refine('yo lo foo boo choo foo gnu'))->toBe('yo lo foo boo choo'); }); it('returns the full string if the string to look for is empty', function () { expect(StringRefiner::beforeLast('')->refine('yo lo foo boo choo'))->toBe('yo lo foo boo choo'); }); it('returns the full string when the string to look for is not contained', function () { expect(StringRefiner::beforeLast('moo')->refine('yo lo foo boo choo'))->toBe('yo lo foo boo choo'); }); ================================================ FILE: tests/Steps/Refiners/String/BetweenFirstTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner StringRefiner::betweenFirst() can\'t be applied to value of type ' . 
gettype($value)) ->and($refinedValue)->toBe($value); })->with([ [123], [12.3], [true], ]); it('works with an array of strings as value', function () { $refinedValue = StringRefiner::betweenFirst('foo', 'bar') ->addLogger(new CliLogger()) ->refine(['one foo two bar three foo four bar five', 'six foo seven bar eight foo nine bar ten']); expect($refinedValue)->toBe(['two', 'seven']); }); it('gets the (trimmed) string between the first occurrence of start and the next occurrence of end', function () { $refiner = StringRefiner::betweenFirst('foo', 'bar'); $refinedValue = $refiner->refine('bla foo bli bar blu foo bar asdf foo bar'); expect($refinedValue)->toBe('bli'); }); test('if start is an empty string, start from the beginning', function () { $refiner = StringRefiner::betweenFirst('', 'bar'); $refinedValue = $refiner->refine('bla foo bli bar blu foo bar asdf foo bar'); expect($refinedValue)->toBe('bla foo bli'); }); test('if end is an empty string, it takes the rest of the string until the end', function () { $refiner = StringRefiner::betweenFirst('blu', ''); $refinedValue = $refiner->refine('bla foo bli bar blu foo bar asdf foo bar'); expect($refinedValue)->toBe('foo bar asdf foo bar'); }); it('returns an empty string if start is not contained in the string', function () { $refiner = StringRefiner::betweenFirst('not contained', ''); $refinedValue = $refiner->refine('bla foo bli bar blu foo bar asdf foo bar'); expect($refinedValue)->toBe(''); }); ================================================ FILE: tests/Steps/Refiners/String/BetweenLastTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner StringRefiner::betweenLast() can\'t be applied to value of type ' . 
gettype($value))
        ->and($refinedValue)->toBe($value);
})->with([
    [123],
    [12.3],
    [true],
]);

// Array input: every string element is refined on its own.
it('works with an array of strings as value', function () {
    $refinedValue = StringRefiner::betweenLast('foo', 'bar')
        ->addLogger(new CliLogger())
        ->refine(['one foo two bar three foo four bar five', 'six foo seven bar eight foo nine bar ten']);

    expect($refinedValue)->toBe(['four', 'nine']);
});

// The *last* "foo" is the start marker; the first "bar" after it is the end marker.
it('gets the (trimmed) string between the last occurrence of start and the next occurrence of end', function () {
    $refiner = StringRefiner::betweenLast('foo', 'bar');

    $refinedValue = $refiner->refine('bla foo bli bar blu foo ble foo blo bar blö bar blä');

    expect($refinedValue)->toBe('blo');
});

// An empty start marker means "from the beginning of the input".
test('if start is an empty string, start from the beginning', function () {
    $refiner = StringRefiner::betweenLast('', 'blu');

    $refinedValue = $refiner->refine('bla foo bli bar blu foo ble foo blo bar blö bar blä');

    expect($refinedValue)->toBe('bla foo bli bar');
});

// An empty end marker means "until the end of the input".
test('if end is an empty string, it takes the rest of the string until the end', function () {
    $refiner = StringRefiner::betweenLast('blo', '');

    $refinedValue = $refiner->refine('bla foo bli bar blu foo ble foo blo bar blö bar blä');

    expect($refinedValue)->toBe('bar blö bar blä');
});

// Must exercise betweenLast(), the subject of this file. The betweenFirst() variant of this
// scenario is already covered in BetweenFirstTest, so calling it here left the not-found path
// of betweenLast() untested.
it('returns an empty string if start is not contained in the string', function () {
    $refiner = StringRefiner::betweenLast('not contained', '');

    $refinedValue = $refiner->refine('bla foo bli bar blu foo bar asdf foo bar');

    expect($refinedValue)->toBe('');
});

================================================
FILE: tests/Steps/Refiners/String/ReplaceTest.php
================================================
addLogger(new CliLogger())
        ->refine($value);

    $logOutput = $this->getActualOutputForAssertion();

    expect($logOutput)
        ->toContain('Refiner StringRefiner::replace() can\'t be applied to value of type ' .
gettype($value)) ->and($refinedValue)->toBe($value); })->with([ [123], [12.3], [true], ]); it('works when the value is an array of strings', function () { $refinedValue = StringRefiner::replace('foo', 'bar') ->addLogger(new CliLogger()) ->refine(['foo boo', 'who foo', 'yo lo']); expect($refinedValue)->toBe(['bar boo', 'who bar', 'yo lo']); }); it('replaces occurrences of a string with another string', function () { expect(StringRefiner::replace('foo', 'bar')->refine('foo, test lorem foo yolo'))->toBe('bar, test lorem bar yolo'); }); it('replaces occurrences of an array of strings with another array of strings', function () { expect(StringRefiner::replace(['foo', 'bar'], ['yo', 'lo'])->refine('foo bar baz'))->toBe('yo lo baz'); }); it('replaces occurrences of an array of strings with some single string', function () { expect(StringRefiner::replace(['foo', 'bar'], '-')->refine('foo bar baz'))->toBe('- - baz'); }); ================================================ FILE: tests/Steps/Refiners/Url/WithFragmentTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner UrlRefiner::withFragment() can\'t be applied to value of type ' . 
gettype($value))
            ->and($refinedValue)->toBe($value);
    },
)->with([
    [123],
    [true],
    [new stdClass()],
]);

// Covers string URLs (with and without an existing fragment) as well as Url and PSR-7 URI objects.
// The title names the fragment, which is the URL component withFragment() actually changes.
it('replaces the fragment in a URL', function (mixed $value, string $expected) {
    expect(UrlRefiner::withFragment('#lorem')->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com/path#foo', 'https://www.example.com/path#lorem'],
    ['https://www.example.com/path', 'https://www.example.com/path#lorem'],
    [Url::parse('https://www.crwlr.software/some/path#abc'), 'https://www.crwlr.software/some/path#lorem'],
    [Url::parsePsr7('https://www.crwl.io/quz#'), 'https://www.crwl.io/quz#lorem'],
]);

// withoutFragment() drops the fragment, including a bare trailing "#".
it('resets any fragment', function (mixed $value, string $expected) {
    expect(UrlRefiner::withoutFragment()->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com/foo#bar', 'https://www.example.com/foo'],
    ['https://www.crwlr.software/#', 'https://www.crwlr.software/'],
]);

// Array input: every URL in the list gets the new fragment.
it('refines an array of URLs', function () {
    expect(
        UrlRefiner::withFragment('#lorem')
            ->refine([
                'https://www.example.com/path#foo',
                'https://www.example.com/path#bar',
            ]),
    )->toBe(['https://www.example.com/path#lorem', 'https://www.example.com/path#lorem']);
});

================================================
FILE: tests/Steps/Refiners/Url/WithHostTest.php
================================================
addLogger(new CliLogger())
            ->refine($value);

        $logOutput = $this->getActualOutputForAssertion();

        expect($logOutput)
            ->toContain('Refiner UrlRefiner::withHost() can\'t be applied to value of type ' .
gettype($value)) ->and($refinedValue)->toBe($value); }, )->with([ [123], [true], [new stdClass()], ]); it('replaces the host in a URL', function (mixed $value, string $expected) { expect(UrlRefiner::withHost('www.crwlr.software')->refine($value))->toBe($expected); })->with([ ['https://www.example.com/foo', 'https://www.crwlr.software/foo'], ['https://www.crwl.io/bar', 'https://www.crwlr.software/bar'], [Url::parse('https://www.crwlr.software/baz'), 'https://www.crwlr.software/baz'], [Url::parsePsr7('https://crwl.io/quz'), 'https://www.crwlr.software/quz'], ]); it('refines an array of URLs', function () { expect( UrlRefiner::withHost('crwl.io') ->refine([ 'https://www.example.com/foo', 'https://www.example.com/bar', ]), )->toBe(['https://crwl.io/foo', 'https://crwl.io/bar']); }); ================================================ FILE: tests/Steps/Refiners/Url/WithPathTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner UrlRefiner::withPath() can\'t be applied to value of type ' . 
gettype($value)) ->and($refinedValue)->toBe($value); }, )->with([ [123], [true], [new stdClass()], ]); it('replaces the path in a URL', function (mixed $value, string $expected) { expect(UrlRefiner::withPath('/some/path/123')->refine($value))->toBe($expected); })->with([ ['https://www.example.com/foo', 'https://www.example.com/some/path/123'], ['https://localhost/yo', 'https://localhost/some/path/123'], [Url::parse('https://www.crwlr.software/packages'), 'https://www.crwlr.software/some/path/123'], [Url::parsePsr7('https://www.crwl.io/'), 'https://www.crwl.io/some/path/123'], ]); it('refines an array of URLs', function () { expect( UrlRefiner::withPath('/hawedere') ->refine([ 'https://www.example.com/foo', 'https://www.example.com/bar', ]), )->toBe(['https://www.example.com/hawedere', 'https://www.example.com/hawedere']); }); ================================================ FILE: tests/Steps/Refiners/Url/WithPortTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner UrlRefiner::withPort() can\'t be applied to value of type ' . 
gettype($value)) ->and($refinedValue)->toBe($value); }, )->with([ [123], [true], [new stdClass()], ]); it('replaces the port in a URL', function (mixed $value, string $expected) { expect(UrlRefiner::withPort(1234)->refine($value))->toBe($expected); })->with([ ['https://www.example.com:8000/foo', 'https://www.example.com:1234/foo'], ['https://localhost:8080/yo', 'https://localhost:1234/yo'], [Url::parse('https://www.crwlr.software:5678/bar'), 'https://www.crwlr.software:1234/bar'], [Url::parsePsr7('https://crwl.io/quz'), 'https://crwl.io:1234/quz'], ]); it('refines an array of URLs', function () { expect( UrlRefiner::withPort(1234) ->refine([ 'https://www.example.com/foo', 'https://www.example.com/bar', ]), )->toBe(['https://www.example.com:1234/foo', 'https://www.example.com:1234/bar']); }); ================================================ FILE: tests/Steps/Refiners/Url/WithQueryTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner UrlRefiner::withQuery() can\'t be applied to value of type ' . 
gettype($value)) ->and($refinedValue)->toBe($value); }, )->with([ [123], [true], [new stdClass()], ]); it('replaces the query in a URL', function (mixed $value, string $expected) { expect(UrlRefiner::withQuery('a=b&c=d')->refine($value))->toBe($expected); })->with([ ['https://www.example.com/foo?one=two', 'https://www.example.com/foo?a=b&c=d'], ['https://www.example.com/bar', 'https://www.example.com/bar?a=b&c=d'], [Url::parse('https://www.crwlr.software/?'), 'https://www.crwlr.software/?a=b&c=d'], [Url::parsePsr7('https://www.crwl.io/quz?a=c&b=d'), 'https://www.crwl.io/quz?a=b&c=d'], ]); it('resets any query', function (mixed $value, string $expected) { expect(UrlRefiner::withoutQuery()->refine($value))->toBe($expected); })->with([ ['https://www.example.com/foo?one=two', 'https://www.example.com/foo'], ['https://www.crwlr.software/?', 'https://www.crwlr.software/'], ]); it('refines an array of URLs', function () { expect( UrlRefiner::withoutQuery() ->refine([ 'https://www.example.com/foo?one=two', 'https://www.example.com/bar?three=four', ]), )->toBe(['https://www.example.com/foo', 'https://www.example.com/bar']); }); ================================================ FILE: tests/Steps/Refiners/Url/WithSchemeTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner UrlRefiner::withScheme() can\'t be applied to value of type ' . 
gettype($value)) ->and($refinedValue)->toBe($value); }, )->with([ [123], [true], [new stdClass()], ]); it('replaces the scheme in a URL', function (mixed $value, string $expected) { expect(UrlRefiner::withScheme('https')->refine($value))->toBe($expected); })->with([ ['http://www.example.com/foo', 'https://www.example.com/foo'], ['https://www.example.com/foo', 'https://www.example.com/foo'], [Url::parse('ftp://www.example.com/bar'), 'https://www.example.com/bar'], [Url::parsePsr7('http://www.example.com/baz'), 'https://www.example.com/baz'], ]); it('refines an array of URLs', function () { expect( UrlRefiner::withScheme('https') ->refine([ 'http://www.example.com/foo', 'https://www.example.com/bar', ]), )->toBe(['https://www.example.com/foo', 'https://www.example.com/bar']); }); ================================================ FILE: tests/Steps/Refiners/Url/WithoutPortTest.php ================================================ addLogger(new CliLogger()) ->refine($value); $logOutput = $this->getActualOutputForAssertion(); expect($logOutput) ->toContain('Refiner UrlRefiner::withoutPort() can\'t be applied to value of type ' . 
gettype($value)) ->and($refinedValue)->toBe($value); }, )->with([ [123], [true], [new stdClass()], ]); it('resets the port to null in a URL', function (mixed $value, string $expected) { expect(UrlRefiner::withoutPort()->refine($value))->toBe($expected); })->with([ ['https://www.example.com:8000/foo', 'https://www.example.com/foo'], ['http://localhost:8080/yo', 'http://localhost/yo'], [Url::parse('https://www.crwlr.software:5678/bar'), 'https://www.crwlr.software/bar'], [Url::parsePsr7('https://crwl.io/quz'), 'https://crwl.io/quz'], ]); it('refines an array of URLs', function () { expect( UrlRefiner::withoutPort() ->refine([ 'https://www.example.com:8000/foo', 'https://www.example.com:8080/bar', ]), )->toBe(['https://www.example.com/foo', 'https://www.example.com/bar']); }); ================================================ FILE: tests/Steps/Sitemap/GetUrlsFromSitemapTest.php ================================================ https://www.crwlr.software/0.5 https://www.crwlr.software/packages0.7 https://www.crwlr.software/blog0.7 https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-512022-09-03 https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php12022-06-02 https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-412022-05-10 https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-2-and-v0-312022-04-30 https://www.crwlr.software/blog/release-of-crwlr-crawler-v-0-1-012022-04-18 https://www.crwlr.software/blog/prevent-homograph-attacks-in-user-input-urls12022-01-19 XML; $outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $xml); expect($outputs)->toHaveCount(9) ->and($outputs[0]->get())->toBe('https://www.crwlr.software/') ->and($outputs[8]->get())->toBe('https://www.crwlr.software/blog/prevent-homograph-attacks-in-user-input-urls'); }); it('gets all urls with additional data when the withData() method is used', function () { $xml = << https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-512022-09-03 
https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php12022-06-02 https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-40.72022-05-10 XML; $outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap()->withData(), $xml); expect($outputs)->toHaveCount(3) ->and($outputs[0]->get())->toBe([ 'url' => 'https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5', 'lastmod' => '2022-09-03', 'priority' => '1', ]) ->and($outputs[1]->get())->toBe([ 'url' => 'https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php', 'lastmod' => '2022-06-02', 'priority' => '1', ]) ->and($outputs[2]->get())->toBe([ 'url' => 'https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4', 'lastmod' => '2022-05-10', 'priority' => '0.7', ]); }); it('doesn\'t fail when sitemap is empty', function () { $xml = << XML; $outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap()->withData(), $xml); expect($outputs)->toHaveCount(0); }); it( 'doesn\'t fail when the urlset tag contains attributes, that would cause the symfony DomCrawler to not find the ' . 'elements', function () { $xml = << https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5 https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4 XML; $outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $xml); expect($outputs)->toHaveCount(3); }, ); it( 'doesn\'t fail when the urlset tag contains attributes, that would cause the symfony DomCrawler to not find the ' . 
'elements, when the XML content has no line breaks', function () { $xml = <<https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-phphttps://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4 XML; $outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $xml); expect($outputs)->toHaveCount(3); }, ); ================================================ FILE: tests/Steps/StepTest.php ================================================ */ protected function invoke(mixed $input): Generator { $this->logger?->info('logging works'); yield 'something'; } }; $step->addLogger(new CliLogger()); helper_traverseIterable($step->invokeStep(new Input('test'))); $output = $this->getActualOutputForAssertion(); expect($output)->toContain('logging works'); }); test('The invokeStep method wraps the values returned by invoke in Output objects', function () { $step = helper_getValueReturningStep('returnValue'); $output = helper_invokeStepWithInput($step); expect($output)->toHaveCount(1) ->and($output[0])->toBeInstanceOf(Output::class) ->and($output[0]->get())->toBe('returnValue'); }); /* ------------------------------- keep() ------------------------------- */ test('keep() can pick keys from nested (array) output using dot notation', function () { $step = helper_getValueReturningStep([ 'users' => [ ['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear'], ['user' => 'juerx', 'firstname' => 'Jürgen', 'surname' => 'Müller'], ['user' => 'sandy', 'firstname' => 'Sandra', 'surname' => 'Mayr'], ], 'foo' => 'bar', ]) ->keep(['nickname' => 'users.0.user', 'foo']); $output = helper_invokeStepWithInput($step); expect($output[0]->keep)->toBe(['nickname' => 'otsch', 'foo' => 'bar']); }); test('keep() picks keys from nested output including a RespondedRequest object', function () { $step = helper_getValueReturningStep([ 'response' => new RespondedRequest( new Request('GET', 
'https://www.example.com/something'), new Response(200, body: 'Hi :)'), ), 'foo' => 'bar', ]) ->keep(['content' => 'response.body']); $output = helper_invokeStepWithInput($step); expect($output[0]->keep)->toBe(['content' => 'Hi :)']); }); it('maps output keys to different keys when defined in the array passed to keep()', function () { $step = helper_getValueReturningStep(['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear']) ->keep(['foo' => 'firstname', 'bar' => 'surname']); $output = helper_invokeStepWithInput($step); expect($output[0]->keep)->toBe(['foo' => 'Christian', 'bar' => 'Olear']); }); /* ------------------------------- useInputKey() ------------------------------- */ it('uses a key from array input when defined', function () { $step = helper_getInputReturningStep()->useInputKey('bar'); $output = helper_invokeStepWithInput($step, new Input( ['foo' => 'fooValue', 'bar' => 'barValue', 'baz' => 'bazValue'], )); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe('barValue'); }); it('logs a warning message when the input key to use does not exist in input array', function () { $step = helper_getInputReturningStep()->useInputKey('baz'); $step->addLogger(new CliLogger()); $output = helper_invokeStepWithInput($step, new Input(['foo' => 'one', 'bar' => 'two'])); expect($output)->toHaveCount(0) ->and($this->getActualOutputForAssertion()) ->toContain('Can\'t get key from input, because it does not exist.'); }); it( 'logs a warning message when useInputKey() was called but the input value is not an array', function (mixed $inputValue) { $step = helper_getInputReturningStep()->useInputKey('baz'); $step->addLogger(new CliLogger()); $output = helper_invokeStepWithInput($step, new Input($inputValue)); expect($output)->toHaveCount(0) ->and($this->getActualOutputForAssertion()) ->toContain( 'Can\'t get key from input, because input is of type ' . gettype($inputValue) . 
' instead of array.', ); }, )->with([ ['string'], [0], [new stdClass()], ]); it('does not lose previously kept data, when it uses the useInputKey() method', function () { $step = helper_getValueReturningStep(['test' => 'test'])->useInputKey('foo'); $outputs = helper_invokeStepWithInput($step, new Input(['foo' => 'test'], ['some' => 'thing'])); expect($outputs[0]->keep)->toBe(['some' => 'thing']); }); it('keeps the original input data when useInputKey() is used', function () { $step = helper_getValueReturningStep(['baz' => 'three']) ->keepFromInput() ->useInputKey('bar'); $outputs = helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => 'two']); expect($outputs[0]->get())->toBe(['baz' => 'three']) ->and($outputs[0]->keep)->toBe(['foo' => 'one', 'bar' => 'two']); }); test('useInputKey() can be used to get data that was kept from a previous step with keep() or keepAs()', function () { $step = helper_getInputReturningStep(); $step->useInputKey('bar'); $outputs = helper_invokeStepWithInput($step, new Input('value', keep: ['bar' => 'baz'])); expect($outputs[0]->get())->toBe('baz'); }); it( 'also passes on kept data through further steps when they don\'t define any further data to keep', function () { $step = helper_getValueReturningStep('returnValue'); $output = helper_invokeStepWithInput($step, new Input('inputValue', ['prevProperty' => 'foobar'])); expect($output)->toHaveCount(1) ->and($output[0]->keep)->toBe(['prevProperty' => 'foobar']); }, ); /* ------------------------------- uniqueInputs() ------------------------------- */ it('doesn\'t invoke twice with duplicate inputs when uniqueInput was called', function () { $step = helper_getInputReturningStep(); $outputs = helper_invokeStepWithInput($step, 'foo'); expect($outputs)->toHaveCount(1); $outputs = helper_invokeStepWithInput($step, 'foo'); expect($outputs)->toHaveCount(1); $step->uniqueInputs(); $outputs = helper_invokeStepWithInput($step, 'foo'); expect($outputs)->toHaveCount(1); $outputs = 
helper_invokeStepWithInput($step, 'foo'); expect($outputs)->toHaveCount(0); }); it( 'doesn\'t invoke twice with inputs with the same value in an array key when uniqueInput was called with that key', function () { $step = helper_getInputReturningStep(); $step->uniqueInputs(); $outputs = helper_invokeStepWithInput($step, ['foo' => 'bar', 'number' => 1]); expect($outputs)->toHaveCount(1); $outputs = helper_invokeStepWithInput($step, ['foo' => 'bar', 'number' => 2]); expect($outputs)->toHaveCount(1); $step->resetAfterRun(); $step->uniqueInputs('foo'); $outputs = helper_invokeStepWithInput($step, ['foo' => 'bar', 'number' => 1]); expect($outputs)->toHaveCount(1); $outputs = helper_invokeStepWithInput($step, ['foo' => 'bar', 'number' => 2]); expect($outputs)->toHaveCount(0); }, ); it( 'doesn\'t invoke twice with inputs with the same value in an object key when uniqueInput was called with that key', function () { $step = helper_getInputReturningStep(); $step->uniqueInputs(); $outputs = helper_invokeStepWithInput($step, helper_getStdClassWithData(['foo' => 'bar', 'number' => 1])); expect($outputs)->toHaveCount(1); $outputs = helper_invokeStepWithInput($step, helper_getStdClassWithData(['foo' => 'bar', 'number' => 2])); expect($outputs)->toHaveCount(1); $step->resetAfterRun(); $step->uniqueInputs('foo'); $outputs = helper_invokeStepWithInput($step, helper_getStdClassWithData(['foo' => 'bar', 'number' => 1])); expect($outputs)->toHaveCount(1); $outputs = helper_invokeStepWithInput($step, helper_getStdClassWithData(['foo' => 'bar', 'number' => 2])); expect($outputs)->toHaveCount(0); }, ); /* ------------------------------- uniqueOutputs() ------------------------------- */ it('makes outputs unique when uniqueOutput was called', function () { $step = helper_getStepYieldingMultipleNumbers(); $step->uniqueOutputs(); $output = helper_invokeStepWithInput($step, new Input('anything')); expect($output)->toHaveCount(5) ->and($output[0]->get())->toBe('one') 
->and($output[1]->get())->toBe('two')
        ->and($output[2]->get())->toBe('three')
        ->and($output[3]->get())->toBe('four')
        ->and($output[4]->get())->toBe('five');
});

// Uniqueness is decided by the value under the given key of each array output.
it('makes outputs unique when providing a key name to uniqueOutput to use from array output', function () {
    $step = helper_getStepYieldingMultipleArraysWithNumber();

    $step->uniqueOutputs('number');

    $output = helper_invokeStepWithInput($step, new Input('anything'));

    expect($output)->toHaveCount(5);
});

// Same as above, but the outputs are objects and the key is read as a property.
it('makes outputs unique when providing a key name to uniqueOutput to use from object output', function () {
    $step = helper_getStepYieldingMultipleObjectsWithNumber();

    $step->uniqueOutputs('number');

    $output = helper_invokeStepWithInput($step, new Input('anything'));

    expect($output)->toHaveCount(5);
});

// Without a key the whole array output is compared. The same step instance is invoked twice,
// first with input false and then with input true, and yields 5 and 8 unique outputs.
it('makes array outputs unique when providing no key name to uniqueOutput', function () {
    $step = helper_getStepYieldingMultipleArraysWithNumber();

    $step->uniqueOutputs();

    $output = helper_invokeStepWithInput($step, new Input(false));

    expect($output)->toHaveCount(5);

    $output = helper_invokeStepWithInput($step, new Input(true));

    expect($output)->toHaveCount(8);
});

// Object counterpart of the previous test. It has to use the object-yielding helper; with the
// array helper it was an exact duplicate of the array test and object outputs were never checked.
// NOTE(review): assumes helper_getStepYieldingMultipleObjectsWithNumber() reacts to the boolean
// input the same way the array helper does (5 vs. 8 distinct outputs) — confirm against the
// helper's definition.
it('makes object outputs unique when providing no key name to uniqueOutput', function () {
    $step = helper_getStepYieldingMultipleObjectsWithNumber();

    $step->uniqueOutputs();

    $output = helper_invokeStepWithInput($step, new Input(false));

    expect($output)->toHaveCount(5);

    $output = helper_invokeStepWithInput($step, new Input(true));

    expect($output)->toHaveCount(8);
});

/* ----------------------------- oneOutputPerInput() ----------------------------- */

test(
    'when a step yields multiple outputs per input and the oneOutputPerInput() method was called, the step yields it ' .
'as a single output with an array of all the single output values', function () { $step = helper_getStepYieldingInputArrayAsSeparateOutputs(); $step->oneOutputPerInput(); $outputs = helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe(['foo', 'bar', 'baz']); }, ); test('when using oneOutputPerInput(), the combined output counts as one output for the max outputs limit', function () { $step = helper_getStepYieldingInputArrayAsSeparateOutputs(); $step->oneOutputPerInput()->maxOutputs(2); $outputs = helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe(['foo', 'bar', 'baz']); $outputs = helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe(['foo', 'bar', 'baz']); $outputs = helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']); expect($outputs)->toHaveCount(0); }); test('when using oneOutputPerInput(), refiners are applied to the single elements of the combined output', function () { $step = helper_getStepYieldingInputArrayAsSeparateOutputs(); $step->oneOutputPerInput()->refineOutput('title', fn(mixed $outputValue) => $outputValue . 
'-hey'); $outputs = helper_invokeStepWithInput($step, [ ['title' => 'foo'], ['title' => 'bar'], ['title' => 'baz'], ]); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe([ ['title' => 'foo-hey'], ['title' => 'bar-hey'], ['title' => 'baz-hey'], ]); }); test('when using oneOutputPerInput(), filters are applied to the single elements of the combined output', function () { $step = helper_getStepYieldingInputArrayAsSeparateOutputs(); $step->where('id', Filter::greaterThan(109))->oneOutputPerInput(); $outputs = helper_invokeStepWithInput($step, [ ['title' => 'foo', 'id' => 109], ['title' => 'bar', 'id' => 110], ['title' => 'baz', 'id' => 111], ]); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe([ ['title' => 'bar', 'id' => 110], ['title' => 'baz', 'id' => 111], ]); }); test( 'when using oneOutputPerInput() in combination with outputKey(), the whole combined output is returned in an ' . 'array with the defined key', function () { $step = helper_getStepYieldingInputArrayAsSeparateOutputs(); $step->outputKey('test')->oneOutputPerInput(); $outputs = helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe(['test' => ['foo', 'bar', 'baz']]); }, ); test( 'when using oneOutputPerInput() in combination with uniqueOutputs(), the whole combined output is compared', function () { $step = helper_getStepYieldingInputArrayAsSeparateOutputs(); $step->oneOutputPerInput()->uniqueOutputs(); $outputs = helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe(['foo', 'bar', 'baz']); $outputs = helper_invokeStepWithInput($step, ['foo', 'bar', 'quz']); expect($outputs)->toHaveCount(1) ->and($outputs[0]->get())->toBe(['foo', 'bar', 'quz']); $outputs = helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']); expect($outputs)->toHaveCount(0); }, ); /* -------------------------- validateAndSanitizeInput() -------------------------- */ 
it('calls the validateAndSanitizeInput method', function () { $step = new class extends Step { protected function validateAndSanitizeInput(mixed $input): string { return $input . ' validated and sanitized'; } protected function invoke(mixed $input): Generator { yield $input; } }; $output = helper_invokeStepWithInput($step, 'inputValue'); expect($output[0]->get())->toBe('inputValue validated and sanitized'); }); test( 'when calling validateAndSanitizeStringOrStringable() and the input is array with a single element it tries to ' . 'use that element as input value', function () { $step = new class extends Step { protected function validateAndSanitizeInput(mixed $input): string { return $this->validateAndSanitizeStringOrStringable($input); } protected function invoke(mixed $input): Generator { yield $input; } }; $output = helper_invokeStepWithInput($step, ['inputValue']); expect($output[0]->get())->toBe('inputValue'); }, ); test( 'when calling validateAndSanitizeStringOrStringable() and the input is array with multiple elements it logs ' . 'an error message', function () { $logger = new DummyLogger(); $step = new class extends Step { protected function validateAndSanitizeInput(mixed $input): string { return $this->validateAndSanitizeStringOrStringable($input); } protected function invoke(mixed $input): Generator { yield $input; } }; $step->addLogger($logger); helper_invokeStepWithInput($step, ['inputValue', 'foo' => 'bar']); expect($logger->messages)->not->toBeEmpty() ->and($logger->messages[0]['message'])->toStartWith( 'A step was called with input that it can not work with:', ) ->and($logger->messages[0]['message'])->toEndWith('. 
The invalid input is of type array.'); }, ); test( 'when throwing an InvalidArgumentException from the validateAndSanitizeInput() it is caught and logged as an error', function () { $logger = new DummyLogger(); $step = new class extends Step { protected function validateAndSanitizeInput(mixed $input): string { throw new InvalidArgumentException('hey :)'); } protected function invoke(mixed $input): Generator { yield $input; } }; $step->addLogger($logger); $outputs = helper_invokeStepWithInput($step, 'anything'); expect($outputs)->toBeEmpty() ->and($logger->messages)->not->toBeEmpty() ->and($logger->messages[0]['message'])->toBe( 'A step was called with input that it can not work with: hey :)', ); }, ); test( 'when throwing an Exception that is not an InvalidArgumentException, from the validateAndSanitizeInput() it is ' . 'not caught', function () { $logger = new DummyLogger(); $step = new class extends Step { protected function validateAndSanitizeInput(mixed $input): string { throw new Exception('hey :)'); } protected function invoke(mixed $input): Generator { yield $input; } }; $step->addLogger($logger); helper_invokeStepWithInput($step, 'anything'); }, )->throws(Exception::class); it('is possible that a step does not produce any output at all', function () { $step = new class extends Step { protected function invoke(mixed $input): Generator { if ($input === 'foo') { yield 'bar'; } } }; $output = helper_invokeStepWithInput($step, 'lol'); expect($output)->toHaveCount(0); $output = helper_invokeStepWithInput($step, 'foo'); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe('bar'); }); /* --------------------------- updateInputUsingOutput() --------------------------- */ test('You can add and call an updateInputUsingOutput callback', function () { $step = helper_getValueReturningStep('something'); $step->updateInputUsingOutput(function (mixed $input, mixed $output) { return $input . ' ' . 
$output; }); $updatedInput = $step->callUpdateInputUsingOutput(new Input('Boo'), new Output('Yah!')); expect($updatedInput)->toBeInstanceOf(Input::class) ->and($updatedInput->get())->toBe('Boo Yah!'); }); it('does not lose previously kept data, when updateInputUsingOutput() is called', function () { $step = helper_getValueReturningStep('something'); $step->updateInputUsingOutput(function (mixed $input, mixed $output) { return $input . ' ' . $output; }); $updatedInput = $step->callUpdateInputUsingOutput( new Input('Some', ['foo' => 'bar']), new Output('thing'), ); expect($updatedInput->keep)->toBe(['foo' => 'bar']); }); /* -------------------------------- maxOutputs() -------------------------------- */ it('does not yield more outputs than defined via maxOutputs() method', function () { $step = helper_getValueReturningStep('yolo')->maxOutputs(3); for ($i = 1; $i <= 5; $i++) { $outputs = helper_invokeStepWithInput($step, new Input('asdf')); if ($i <= 3) { expect($outputs)->toHaveCount(1); } else { expect($outputs)->toHaveCount(0); } } }); it( 'does not yield more outputs than defined via maxOutputs() when step yields multiple outputs per input and the ' . 
'limit is reached in the middle of the outputs resulting from one input', function () { $step = new class extends Step { protected function invoke(mixed $input): Generator { yield 'one'; yield 'two'; yield 'three'; } }; $step->maxOutputs(7); $outputs = helper_invokeStepWithInput($step, new Input('a')); expect($outputs)->toHaveCount(3); $outputs = helper_invokeStepWithInput($step, new Input('b')); expect($outputs)->toHaveCount(3); $outputs = helper_invokeStepWithInput($step, new Input('c')); expect($outputs)->toHaveCount(1); }, ); test('When a step has max outputs defined, it won\'t call the invoke method after the limit was reached', function () { $step = new class extends Step { public int $_invokeCallCount = 0; protected function invoke(mixed $input): Generator { $this->_invokeCallCount += 1; yield 'something'; } }; $step->maxOutputs(2); helper_invokeStepWithInput($step, new Input('one')); helper_invokeStepWithInput($step, new Input('two')); helper_invokeStepWithInput($step, new Input('three')); helper_invokeStepWithInput($step, new Input('four')); expect($step->_invokeCallCount)->toBe(2); }); it('resets outputs count for maxOutputs rule when resetAfterRun() is called', function () { $step = helper_getValueReturningStep('gogogo')->maxOutputs(2); helper_invokeStepWithInput($step, new Input('one')); helper_invokeStepWithInput($step, new Input('two')); $step->resetAfterRun(); expect(helper_invokeStepWithInput($step, new Input('three')))->toHaveCount(1); }); /* -------------------------------- outputKey() -------------------------------- */ it('converts non array output to array with a certain key using the outputKey() method', function () { $step = helper_getValueReturningStep('bar')->outputKey('foo'); $outputs = helper_invokeStepWithInput($step); expect($outputs[0]->get())->toBe(['foo' => 'bar']); }); test('keeping a scalar output value with keep() also works when outputKey() was used', function () { $step = new class extends Step { protected function invoke(mixed 
$input): Generator { yield 'hey'; } public function outputType(): StepOutputType { return StepOutputType::Scalar; } }; $step ->outputKey('greeting') ->keep(); $step->validateBeforeRun(Http::get()); $outputs = helper_invokeStepWithInput($step, 'guten tag'); /* Besides the output value itself, also check the kept data, which is what this test is named after. NOTE(review): assumes keep() without arguments keeps the whole keyed output array - confirm. */ expect($outputs[0]->get())->toBe(['greeting' => 'hey']) ->and($outputs[0]->keep)->toBe(['greeting' => 'hey']); }); /* -------------------------------- refineOutput() -------------------------------- */ it('applies a Closure refiner to the steps output', function () { $step = helper_getValueReturningStep('output'); $step->refineOutput(function (mixed $outputValue) { return $outputValue . ' refined'; }); $outputs = helper_invokeStepWithInput($step); expect($outputs[0]->get())->toBe('output refined'); }); it('applies an instance of the RefinerInterface to the steps output', function () { $step = helper_getInputReturningStep(); $step->refineOutput(StringRefiner::betweenFirst('foo', 'baz')); $outputs = helper_invokeStepWithInput($step, 'foo bar baz'); expect($outputs[0]->get())->toBe('bar'); }); it('applies multiple refiners to the steps output in the order they\'re added', function () { $step = helper_getInputReturningStep(); $step ->refineOutput(StringRefiner::betweenFirst('foo', 'baz')) ->refineOutput(function (mixed $outputValue) { return $outputValue . ' refined'; }) ->refineOutput(function (mixed $outputValue) { return $outputValue . ', and refined further'; }); $outputs = helper_invokeStepWithInput($step, 'foo bar baz'); expect($outputs[0]->get())->toBe('bar refined, and refined further'); }); it('applies refiners to certain keys from array output when the key is provided', function () { $step = helper_getInputReturningStep(); $step ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor')) ->refineOutput('baz', function (mixed $outputValue) { return 'refined ' . 
$outputValue; }); $outputs = helper_invokeStepWithInput( $step, ['foo' => 'lorem ipsum dolor', 'bar' => 'bla', 'baz' => 'quz'], ); expect($outputs[0]->get())->toBe([ 'foo' => 'ipsum', 'bar' => 'bla', 'baz' => 'refined quz', ]); }); test('you can apply multiple refiners to the same output array key', function () { $step = helper_getInputReturningStep(); $step ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor')) ->refineOutput('foo', function (mixed $outputValue) { return $outputValue . ' yolo'; }); $outputs = helper_invokeStepWithInput( $step, ['foo' => 'lorem ipsum dolor', 'bar' => 'bla'], ); expect($outputs[0]->get())->toBe([ 'foo' => 'ipsum yolo', 'bar' => 'bla', ]); }); it( 'uses the original input value when applying a refiner, not only the value of an input array key chosen via ' . 'useInputKey()', function () { $step = helper_getInputReturningStep(); $step ->useInputKey('bar') ->refineOutput(function (mixed $outputValue, mixed $originalInputValue) { return $originalInputValue; }); $outputs = helper_invokeStepWithInput( $step, ['foo' => 'one', 'bar' => 'two'], ); expect($outputs[0]->get())->toBe(['foo' => 'one', 'bar' => 'two']); }, ); /* ------------------------------- outputKeyAliases() ------------------------------- */ test('you can define aliases for output keys and they are considered when using keep()', function () { $step = new class extends Step { protected function invoke(mixed $input): Generator { yield [ 'foo' => 'one', 'bar' => 'two', 'baz' => 'three', ]; } protected function outputKeyAliases(): array { return [ 'woo' => 'foo', 'war' => 'bar', 'waz' => 'baz', ]; } }; $step->keep(['woo', 'far' => 'war', 'waz']); $outputs = helper_invokeStepWithInput($step); expect($outputs[0]->keep)->toBe([ 'woo' => 'one', 'far' => 'two', 'waz' => 'three', ]); }); test('you can filter outputs using an output key alias', function () { $step = new class extends Step { protected function invoke(mixed $input): Generator { yield [ 'foo' => 'one', 'bar' 
=> 'two', ]; } protected function outputKeyAliases(): array { return [ 'baz' => 'bar', ]; } }; $step->where('baz', Filter::equal('two')); $outputs = helper_invokeStepWithInput($step); expect($outputs[0])->toBeInstanceOf(Output::class); }); it('can filter by a key that only exists in the serialized version of an output object', function () { $step = new class extends Step { protected function invoke(mixed $input): Generator { yield new class { public string $foo = 'one'; public string $bar = 'two'; /** * @return string[] */ public function __serialize(): array { return [ 'foo' => $this->foo, 'bar' => $this->bar, 'baz' => $this->bar, ]; } }; } protected function outputKeyAliases(): array { return [ 'quz' => 'baz', ]; } }; $step->where('quz', Filter::equal('two')); $outputs = helper_invokeStepWithInput($step); expect($outputs[0])->toBeInstanceOf(Output::class); }); ================================================ FILE: tests/Steps/XmlTest.php ================================================ extract('title'), helper_getStepFilesContent('Xml/bookstore.xml'), ); expect($output)->toHaveCount(4) ->and($output[0]->get())->toBe('Everyday Italian') ->and($output[3]->get())->toBe('Learning XML'); }); /* Plain string selectors are passed here without Dom::xPath(), so the description names CSS selectors as the default. */ it('extracts data from an XML document with CSS selectors per default', function () { $output = helper_invokeStepWithInput( Xml::each('bookstore book')->extract([ 'title' => 'title', 'author' => 'author', 'year' => 'year', ]), helper_getStepFilesContent('Xml/bookstore.xml'), ); expect($output)->toHaveCount(4) ->and($output[0]->get())->toBe( ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], ) ->and($output[1]->get())->toBe(['title' => 'Harry Potter', 'author' => 'J K. 
Rowling', 'year' => '2005']) ->and($output[2]->get())->toBe( [ 'title' => 'XQuery Kick Start', 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], 'year' => '2003', ], ) ->and($output[3]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']); }); it('can also extract data using XPath queries', function () { $output = helper_invokeStepWithInput( Xml::each(Dom::xPath('//bookstore/book'))->extract([ 'title' => Dom::xPath('//title'), 'author' => Dom::xPath('//author'), 'year' => Dom::xPath('//year'), ]), helper_getStepFilesContent('Xml/bookstore.xml'), ); expect($output)->toHaveCount(4) ->and($output[2]->get())->toBe( [ 'title' => 'XQuery Kick Start', 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], 'year' => '2003', ], ); }); it('returns only one (compound) output when the root method is used', function () { $output = helper_invokeStepWithInput( Xml::root()->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']), helper_getStepFilesContent('Xml/bookstore.xml'), ); expect($output)->toHaveCount(1) ->and($output[0]->get()['title'])->toBe(['Everyday Italian', 'Harry Potter', 'XQuery Kick Start', 'Learning XML']); }); it('extracts the data of the first matching element when the first method is used', function () { $output = helper_invokeStepWithInput( Xml::first('bookstore book')->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']), helper_getStepFilesContent('Xml/bookstore.xml'), ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe( ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], ); }); it('extracts the data of the last matching element when the last method is used', function () { $output = helper_invokeStepWithInput( Xml::last('bookstore book')->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']), 
helper_getStepFilesContent('Xml/bookstore.xml'), ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']); }); test( 'you can extract data in a second level to the output array using another Xml step as an element in the mapping ' . 'array', function () { $response = new RespondedRequest( new Request('GET', 'https://www.example.com/events.xml'), new Response(body: helper_getStepFilesContent('Xml/events.xml')), ); $outputs = helper_invokeStepWithInput( Xml::each('events event')->extract([ 'title' => 'name', 'location' => 'location', 'date' => 'date', 'talks' => Xml::each('talks talk')->extract([ 'title' => 'title', 'speaker' => 'speaker', ]), ]), $response, ); expect($outputs)->toHaveCount(2) ->and($outputs[0]->get())->toBe([ 'title' => 'Some Meetup', 'location' => 'Somewhere', 'date' => '2023-01-14 20:00', 'talks' => [ [ 'title' => 'Sophisticated talk title', 'speaker' => 'Super Mario', ], [ 'title' => 'Fun talk', 'speaker' => 'Princess Peach', ], ], ]) ->and($outputs[1]->get())->toBe([ 'title' => 'Another Meetup', 'location' => 'Somewhere else', 'date' => '2023-01-21 19:00', 'talks' => [ [ 'title' => 'Join the dark side', 'speaker' => 'Wario', ], [ 'title' => 'Let\'s go', 'speaker' => 'Yoshi', ], ], ]); }, ); test( 'When a child step is nested in the extraction and does not use each(), the extracted value is an array with ' . 
'the keys defined in extract(), rather than an array of such arrays as it would be with each().', function () { $xml = << ABCDEFGmbH foo Germany Frankfurt Saubär GmbH bar Austria Klagenfurt XML; $expectedCompany1 = [ 'name' => 'ABCDEFGmbH', 'founded' => '1984', 'location' => ['country' => 'Germany', 'city' => 'Frankfurt'], ]; $expectedCompany2 = [ 'name' => 'Saubär GmbH', 'founded' => '2014', 'location' => ['country' => 'Austria', 'city' => 'Klagenfurt'], ]; // With base root() $step = Xml::each(Dom::xPath('//companies/company'))->extract([ 'name' => Dom::cssSelector('name')->text(), 'founded' => Dom::xPath('//founded')->attribute('year'), 'location' => Xml::root()->extract([ 'country' => Dom::xPath('//location/country')->text(), 'city' => Dom::cssSelector('location city')->text(), ]), ]); $outputs = helper_invokeStepWithInput($step, $xml); expect($outputs)->toHaveCount(2) ->and($outputs[0]->get())->toBe($expectedCompany1) ->and($outputs[1]->get())->toBe($expectedCompany2); // With base first() $step = Xml::each(Dom::xPath('//companies/company'))->extract([ 'name' => Dom::cssSelector('name')->text(), 'founded' => Dom::xPath('//founded')->attribute('year'), 'location' => Xml::first(Dom::cssSelector('location'))->extract([ 'country' => Dom::xPath('//country')->text(), 'city' => Dom::cssSelector('city')->text(), ]), ]); $outputs = helper_invokeStepWithInput($step, $xml); expect($outputs)->toHaveCount(2) ->and($outputs[0]->get())->toBe($expectedCompany1) ->and($outputs[1]->get())->toBe($expectedCompany2); // With base last() $step = Xml::each(Dom::xPath('//companies/company'))->extract([ 'name' => Dom::cssSelector('name')->text(), 'founded' => Dom::xPath('//founded')->attribute('year'), 'location' => Xml::last(Dom::cssSelector('location'))->extract([ 'country' => Dom::xPath('//country')->text(), 'city' => Dom::cssSelector('city')->text(), ]), ]); $outputs = helper_invokeStepWithInput($step, $xml); expect($outputs)->toHaveCount(2) 
->and($outputs[0]->get())->toBe($expectedCompany1) ->and($outputs[1]->get())->toBe($expectedCompany2); }, ); it('works when the response string starts with an UTF-8 byte order mark character', function () { $response = new RespondedRequest( new Request('GET', 'https://www.example.com/rss'), new Response(body: helper_getStepFilesContent('Xml/rss-with-bom.xml')), ); $outputs = helper_invokeStepWithInput( Xml::each('channel item')->extract([ 'url' => 'link', 'title' => 'title', ]), $response, ); expect($outputs[0]->get())->toBe([ 'url' => 'https://www.example.com/story/1234567/foo-bar-baz?ref=rss', 'title' => 'Some title', ]); }); test( 'when selecting elements with each(), you can reference the element already selected within the each() selector ' . 'itself, in sub selectors', function () { $xml = << 123 456 XML; $response = new RespondedRequest( new Request('GET', 'https://www.example.com/foo'), new Response(body: $xml), ); $output = helper_invokeStepWithInput( Xml::each('data items item')->extract([ // This is what this test is about. The element already selected in each (item) can be // referenced in these child selectors. 'id' => Dom::cssSelector('item > id'), 'attribute' => Dom::cssSelector('')->attribute('attr'), ]), $response, ); expect($output)->toHaveCount(1) ->and($output[0]->get())->toBe(['id' => '123', 'attribute' => 'abc']); }, ); it('works with tags with camelCase names', function () { $xml = << foo foo abc-123 2024-11-07T11:00:31Z Foo bar baz! 
https://www.example.com/item-1?utm_source=foo&utm_medium=feed-xml test XML; $response = new RespondedRequest( new Request('GET', 'https://www.example.com/xml-feed'), new Response(body: $xml), ); $outputs = helper_invokeStepWithInput( Xml::each(Dom::cssSelector('feed items item'))->extract([ 'title' => 'title', 'some-url' => 'someUrl', 'foo-bar-baz' => 'foo baRbaz', ]), $response, ); expect($outputs[0]->get())->toBe([ 'title' => 'Foo bar baz!', 'some-url' => 'https://www.example.com/item-1?utm_source=foo&utm_medium=feed-xml', 'foo-bar-baz' => 'test', ]); })->group('php84'); ================================================ FILE: tests/Steps/_Files/Csv/basic.csv ================================================ 123,"Otsch","https://www.otsch.codes" 234,"John Doe","https://www.john.doe" 345,"Jane Doe","https://www.jane.doe" ================================================ FILE: tests/Steps/_Files/Csv/enclosure.csv ================================================ 123,?Kräftige Rindsuppe?,4.5 234,?Crispy Chicken Burger?,12 345,?Duett von Saibling und Forelle?,21 ================================================ FILE: tests/Steps/_Files/Csv/escape.csv ================================================ 123,"test %"escape%" test",test 123,"foo %"escape%" bar %"baz%" lorem",test ================================================ FILE: tests/Steps/_Files/Csv/separator.csv ================================================ 123*"CoDerOtsch"*Christian*Olear*35 234*"g3n1u5"*Albert*Einstein*143 345*"sWiFtY"*Taylor*Swift*32 ================================================ FILE: tests/Steps/_Files/Csv/with-column-headlines.csv ================================================ Stunde,Montag,Dienstag,Mittwoch,Donnerstag,Freitag 1,Mathematik,Deutsch,Englisch,Erdkunde,Politik 2,Sport,Deutsch,Englisch,Sport,Geschichte 3,Sport,"Religion (ev., kath.)",Kunst,,Kunst ================================================ FILE: tests/Steps/_Files/Html/basic.html 
================================================

    match 1

    match 2

    match 3

    ================================================ FILE: tests/Steps/_Files/Html/bookstore.html ================================================ Bookstore Example in HTML :)

    Everyday Italian

    Giada De Laurentiis
    2005 - 30.00

    Harry Potter

    J K. Rowling
    2005 - 29.99

    XQuery Kick Start

    James McGovern, Per Bothner, Kurt Cagle, James Linn, Vaidyanathan Nagarajan 2003 - 49.99

    Learning XML

    Erik T. Ray
    2003 - 39.95
    ================================================ FILE: tests/Steps/_Files/Html/event.html ================================================ Bookstore Example in HTML :)

    Some Meetup

    Somewhere
    2023-01-14 21:00

    Sophisticated talk title

    Super Mario
    Slides

    Simple beginner talk

    Luigi
    Slides

    Fun talk

    Princess Peach
    Slides
    ================================================ FILE: tests/Steps/_Files/Xml/bookstore.xml ================================================ Everyday Italian Giada De Laurentiis 2005 30.00 Harry Potter J K. Rowling 2005 29.99 XQuery Kick Start James McGovern Per Bothner Kurt Cagle James Linn Vaidyanathan Nagarajan 2003 49.99 Learning XML Erik T. Ray 2003 39.95 ================================================ FILE: tests/Steps/_Files/Xml/events.xml ================================================ Some Meetup Somewhere 2023-01-14 20:00 Sophisticated talk title Super Mario Fun talk Princess Peach Another Meetup Somewhere else 2023-01-21 19:00 Join the dark side Wario Let's go Yoshi ================================================ FILE: tests/Steps/_Files/Xml/rss-with-bom.xml ================================================ Foo - Barhttps://www.example.com/somethinglorem ipsum dolor sitde-dehttps://www.example.com/story/1234567/foo-bar-baz?ref=rsshttps://www.example.com/story/1234567/foo-bar-baz?ref=rssFooBarSome titlelorem ipsum dolor sit ametMon, 08 May 2023 14:08:21 ZFoto: Foo/BarFoto: Foo/Bar ================================================ FILE: tests/Stores/JsonFileStoreTest.php ================================================ $value) { $result->set($key, $value); } return $result; } it('saves Results to a JSON file', function () { $result1 = helper_getResultWithJsonData(['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear']); $store = new JsonFileStore(__DIR__ . '/_files', 'test'); $store->store($result1); expect(file_get_contents($store->filePath()))->toBe('[{"user":"otsch","firstname":"Christian","surname":"Olear"}]'); $result2 = helper_getResultWithJsonData(['user' => 'hader', 'firstname' => 'Josef', 'surname' => 'Hader']); $store->store($result2); expect(file_get_contents($store->filePath()))->toBe( '[{"user":"otsch","firstname":"Christian","surname":"Olear"},' . 
'{"user":"hader","firstname":"Josef","surname":"Hader"}]', ); $result3 = helper_getResultWithJsonData(['user' => 'evamm', 'firstname' => 'Eva Maria', 'surname' => 'Maier']); $store->store($result3); expect(file_get_contents($store->filePath()))->toBe( '[{"user":"otsch","firstname":"Christian","surname":"Olear"},' . '{"user":"hader","firstname":"Josef","surname":"Hader"},' . '{"user":"evamm","firstname":"Eva Maria","surname":"Maier"}]', ); }); afterAll(function () { $dir = __DIR__ . '/_files'; if (file_exists($dir)) { $files = scandir($dir); if (is_array($files)) { foreach ($files as $file) { if ($file === '.' || $file === '..' || !str_ends_with($file, '.json')) { continue; } @unlink($dir . '/' . $file); } } } }); ================================================ FILE: tests/Stores/SimpleCsvFileStoreTest.php ================================================ $value) { $result->set($key, $value); } return $result; } it('saves Results to a csv file', function () { $result1 = helper_getResultWithData(['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear']); $store = new SimpleCsvFileStore(__DIR__ . 
'/_files', 'test'); $store->store($result1); expect(file_get_contents($store->filePath()))->toBe("user,firstname,surname\notsch,Christian,Olear\n"); $result2 = helper_getResultWithData(['user' => 'hader', 'firstname' => 'Josef', 'surname' => 'Hader']); $store->store($result2); expect(file_get_contents($store->filePath()))->toBe( "user,firstname,surname\notsch,Christian,Olear\nhader,Josef,Hader\n", ); $result3 = helper_getResultWithData(['user' => 'evamm', 'firstname' => 'Eva Maria', 'surname' => 'Maier']); $store->store($result3); expect(file_get_contents($store->filePath()))->toBe( "user,firstname,surname\notsch,Christian,Olear\nhader,Josef,Hader\nevamm,\"Eva Maria\",Maier\n", ); }); test('if the value of a result property is an array, it concatenates the values separated with a pipe', function () { $result1 = helper_getResultWithData(['col1' => 'foo', 'col2' => ['bar', 'baz', 'quz']]); $store = new SimpleCsvFileStore(__DIR__ . '/_files', 'test2'); $store->store($result1); expect(file_get_contents($store->filePath()))->toBe("col1,col2\nfoo,\"bar | baz | quz\"\n"); $result2 = helper_getResultWithData(['col1' => 'Donald', 'col2' => ['Tick', 'Trick', 'Track']]); $store->store($result2); expect(file_get_contents($store->filePath()))->toBe( "col1,col2\nfoo,\"bar | baz | quz\"\nDonald,\"Tick | Trick | Track\"\n", ); }); afterAll(function () { $dir = __DIR__ . '/_files'; if (file_exists($dir)) { $files = scandir($dir); if (is_array($files)) { foreach ($files as $file) { if ($file === '.' || $file === '..' || !str_ends_with($file, '.csv')) { continue; } unlink($dir . '/' . 
$file); } } } }); ================================================ FILE: tests/Stores/_files/.gitkeep ================================================ ================================================ FILE: tests/UserAgents/BotUserAgentTest.php ================================================ assertStringContainsString('SomeBot', $userAgent); }); test('Create UserAgent instance via static make method', function () { $userAgent = BotUserAgent::make('CrwlrBot'); $this->assertStringContainsString('CrwlrBot', $userAgent); }); test('Create instance with info uri', function () { $userAgent = new BotUserAgent('SomeBot', 'https://www.example.com/somebot'); $this->assertStringContainsString('SomeBot; +https://www.example.com/somebot', $userAgent); }); test('Create instance with info uri and version', function () { $userAgent = new BotUserAgent('SomeBot', 'https://www.example.com/somebot', '1.3'); $this->assertStringContainsString('SomeBot/1.3; +https://www.example.com/somebot', $userAgent); }); test('Create instance with version but without info uri', function () { $userAgent = new BotUserAgent('SomeBot', version: '1.3'); $this->assertStringContainsString('SomeBot/1.3)', $userAgent); }); test('User agent string starts with Mozilla/5.0', function () { $userAgent = new BotUserAgent('ExampleBot', 'https://www.example.com/bot', '2.0'); expect($userAgent->__toString())->toStartWith('Mozilla/5.0'); }); ================================================ FILE: tests/UserAgents/UserAgentTest.php ================================================ __toString())->toBe($string); }, )->with([ '', 'Foo', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 ' . 
'Safari/537.36', '%$§$!")(=aäöüäö?ßß``2304980=)(§$/&!"=)=', ]); ================================================ FILE: tests/Utils/GzipTest.php ================================================ not->toBe($string) ->and(strlen($compressed))->toBeLessThan(strlen($string)); }); it('decodes a string', function () { $encoded = Gzip::encode('Hello World!'); expect($encoded)->not->toBe('Hello World!') ->and(Gzip::decode($encoded))->toBe('Hello World!'); }); it('does not generate a warning, when string to decode actually isn\'t encoded', function () { $warnings = []; set_error_handler(function ($errno, $errstr) use (&$warnings) { if ($errno === E_WARNING) { $warnings[] = $errstr; } return false; }); $decoded = Gzip::decode('Hello World!'); restore_error_handler(); expect($decoded)->toBe('Hello World!') ->and($warnings)->toBeEmpty(); }); ================================================ FILE: tests/Utils/HttpHeadersTest.php ================================================ 'de', 'Accept-Encoding' => ['gzip', 'deflate', 'br'], ]))->toBe([ 'Accept-Language' => ['de'], 'Accept-Encoding' => ['gzip', 'deflate', 'br'], ]); }); it('merges two header arrays', function () { $headers = [ 'Accept-Language' => ['de'], 'Accept-Encoding' => ['gzip', 'deflate', 'br'], ]; $merge = [ 'Accept' => ['text/html', 'application/xhtml+xml', 'application/xml'], 'Accept-Language' => ['de', 'en'], ]; expect(HttpHeaders::merge($headers, $merge))->toBe([ 'Accept-Language' => ['de', 'en'], 'Accept-Encoding' => ['gzip', 'deflate', 'br'], 'Accept' => ['text/html', 'application/xhtml+xml', 'application/xml'], ]); }); it('adds a single value to a certain header in a headers array', function () { $headers = ['Accept-Language' => ['de']]; expect(HttpHeaders::addTo($headers, 'Accept-Language', 'en'))->toBe(['Accept-Language' => ['de', 'en']]); }); it('adds an array of values to a certain header in a headers array', function () { $headers = ['Accept-Language' => ['de']]; expect( HttpHeaders::addTo($headers, 
'Accept-Language', ['en-US', 'en']), )->toBe(['Accept-Language' => ['de', 'en-US', 'en']]); }); it('adds the header when calling addTo() with a header name that the array does not contain yet', function () { $headers = ['Accept-Encoding' => ['gzip', 'deflate', 'br']]; expect( HttpHeaders::addTo($headers, 'Accept-Language', ['de', 'en']), )->toBe([ 'Accept-Encoding' => ['gzip', 'deflate', 'br'], 'Accept-Language' => ['de', 'en'], ]); }); ================================================ FILE: tests/Utils/OutputTypeHelperTest.php ================================================ 'bar', 'baz']; } }; expect(OutputTypeHelper::objectToArray($object))->toBe(['foo' => 'bar', 'baz']); }); it('converts an object with a toArray() method to an array', function () { $object = new class { /** * @return string[] */ public function toArray(): array { return ['foo' => 'bar']; } }; expect(OutputTypeHelper::objectToArray($object))->toBe(['foo' => 'bar']); }); it('converts an object with a __serialize() method to an array', function () { $object = new class { public function __serialize(): array { return ['winnie' => 'the pooh']; } }; expect(OutputTypeHelper::objectToArray($object))->toBe(['winnie' => 'the pooh']); }); it('converts an object to an array by just casting it', function () { $object = new class { public string $foo = 'one'; public string $bar = 'two'; }; expect(OutputTypeHelper::objectToArray($object))->toBe(['foo' => 'one', 'bar' => 'two']); }); it('checks if a value is a scalar value', function (mixed $value, bool $expectedResult) { expect(OutputTypeHelper::isScalar($value))->toBe($expectedResult); })->with([ ['foo', true], [123, true], [true, true], [false, true], [1.23, true], [['foo', 'bar'], true], // only associative array counts as non scalar for the output types [['foo' => 'bar'], false], [new stdClass(), false], ]); it('checks if a value is an associative array', function (mixed $value, bool $expectedResult) { 
expect(OutputTypeHelper::isAssociativeArray($value))->toBe($expectedResult); })->with([ ['foo', false], [['foo', 'bar'], false], [['foo' => 'bar'], true], [new stdClass(), false], ]); it( 'checks if a value is an associative array or object (a.k.a. non-scalar)', function (mixed $value, bool $expectedResult) { expect(OutputTypeHelper::isAssociativeArrayOrObject($value))->toBe($expectedResult); }, )->with([ ['foo', false], [['foo', 'bar'], false], [['foo' => 'bar'], true], [new stdClass(), true], ]); ================================================ FILE: tests/Utils/RequestKeyTest.php ================================================ 'gzip, deflate, br']); expect(RequestKey::from($request))->toBe('fc2a9e78c97e68674201853cea4a3d74'); $request = $request->withAddedHeader('accept-language', 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7'); expect(RequestKey::from($request))->not()->toBe('fc2a9e78c97e68674201853cea4a3d74'); }); it('makes a cache key from a RespondedRequest object', function () { $respondedRequest = new RespondedRequest( new Request('GET', 'https://www.crwl.io/en/home', ['accept-encoding' => 'gzip, deflate, br']), new Response(), ); expect(RequestKey::from($respondedRequest))->toBe('08bcc643c9fb21af5e4f3361243e2220'); }); test('when creating the key it ignores cookies in the sent headers by default', function () { $request = new Request('GET', 'https://www.crwlr.software/packages', ['accept-encoding' => 'gzip, deflate, br']); $keyWithoutCookie = RequestKey::from($request); $request = new Request('GET', 'https://www.crwlr.software/packages', [ 'accept-encoding' => 'gzip, deflate, br', 'Cookie' => 'cookieName=v4lu3', ]); expect(RequestKey::from($request))->toBe($keyWithoutCookie); }); it('also ignores other headers when provided in second parameter', function () { $request = new Request('GET', 'https://www.example.com', ['accept-encoding' => 'gzip, deflate, br']); $keyWithAcceptEncodingHeader = RequestKey::from($request); $keyWithoutAcceptEncodingHeader = 
RequestKey::from($request, ['accept-encoding']); expect($keyWithAcceptEncodingHeader)->not()->toBe($keyWithoutAcceptEncodingHeader); $request = new Request('GET', 'https://www.example.com', ['Accept-Encoding' => 'gzip']); $anotherKeyWithoutAcceptEncodingHeader = RequestKey::from($request, ['accept-encoding']); expect($keyWithoutAcceptEncodingHeader)->toBe($anotherKeyWithoutAcceptEncodingHeader); }); ================================================ FILE: tests/Utils/TemplateStringTest.php ================================================ 'foo', 'asdf' => 'asdf', 'var' => 'yolo', 'asdf\'asdf' => 'replace', 'qu"z' => 'double', ]); expect($replaced)->toBe( << 'bonjour', 'two' => 'ciao'], ), )->toBe('hi bonjour/ciao bye'); }); ================================================ FILE: tests/_Integration/GroupTest.php ================================================ input('http://localhost:8000/blog-post-with-json-ld'); $crawler ->addStep(Http::get()) ->addStep( Crawler::group() ->addStep( Html::first('#content article.blog-post') ->extract(['title' => 'h1', 'date' => '.date']), ) ->addStep( Html::schemaOrg() ->onlyType('BlogPosting') ->extract([ 'author' => 'author.name', 'keywords', ]), ) ->keep(), ); $result = helper_generatorToArray($crawler->run()); expect($result[0]->toArray())->toBe([ 'title' => 'Prevent Homograph Attacks using the crwlr/url Package', 'date' => '2022-01-19', 'author' => 'Christian Olear', 'keywords' => 'homograph, attack, security, idn, internationalized domain names, prevention, url, uri', ]); }, ); ================================================ FILE: tests/_Integration/Http/CharsetTest.php ================================================ input('http://localhost:8000/non-utf-8-charset') ->addStep(Http::get()) ->addStep(Html::root()->extract(['foo' => '.element'])); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1) ->and($results[0]->toArray())->toBe(['foo' => '0 l/m²']); }); 
================================================
FILE: tests/_Integration/Http/CrawlingTest.php
================================================
robotsTxtHandler = new class ($this, $this->logger) extends RobotsTxtHandler {
            /**
             * Denies the path /not-allowed and defers to the parent RobotsTxtHandler for every other URL.
             */
            public function isAllowed(UriInterface|Url|string $url): bool
            {
                // Strings and UriInterface instances are both converted to a Url, so the path can be checked below.
                if (!$url instanceof Url) {
                    $url = Url::parse($url);
                }

                if ($url->path() === '/not-allowed') {
                    return false;
                }

                return parent::isAllowed($url);
            }
        };
    }

    /**
     * Remembers the URI of every request in $loadedUrls before handing the subject over to the parent loader.
     */
    public function load(mixed $subject): ?RespondedRequest
    {
        $request = $this->validateSubjectType($subject);

        $this->loadedUrls[] = $request->getUri()->__toString();

        return parent::load($subject);
    }
}

/**
 * To check if the Crawler stays on the same host or same domain when crawling, the PSR-18 HTTP ClientInterface
 * of this Crawler's Loader replaces the host in the request URI just before sending the Request. The Loader thinks
 * it actually loaded the page from the incoming URI and the returned RespondedRequest object also has that original URI
 * as effectiveUri (except if the requested page redirects).
*/ class Crawler extends HttpCrawler { public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): TestLoader { $client = new class implements ClientInterface { private Client $guzzleClient; public function __construct() { $this->guzzleClient = new Client(); } public function sendRequest(RequestInterface $request): ResponseInterface { $request = $request->withUri($request->getUri()->withHost('localhost')->withPort(8000)); return $this->guzzleClient->sendRequest($request); } }; $loader = new TestLoader($userAgent, $client, $logger); // To not slow down tests unnecessarily $loader->throttle() ->waitBetween(new MultipleOf(0.0001), new MultipleOf(0.0002)) ->waitAtLeast(Microseconds::fromSeconds(0.0001)); return $loader; } protected function userAgent(): UserAgentInterface { return new UserAgent('SomeUserAgent'); } /** * This method is here for the return type, so phpstan doesn't complain. */ public function getLoader(): TestLoader { return parent::getLoader(); // @phpstan-ignore-line } } /** @var TestCase $this */ it('stays on the same host by default', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/main') ->addStep(Http::crawl()); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->not()->toContain('http://foo.example.com/crawling/main-on-subdomain'); }); it('stays on the same domain when method sameDomain() is called', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/main') ->addStep(Http::crawl()->sameDomain()); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toContain('http://foo.example.com/crawling/main-on-subdomain') ->and($crawler->getLoader()->loadedUrls)->not()->toContain('https://www.crwlr.software/packages/crawler'); }); it('stays on the same host when method sameHost() is called', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/main') ->addStep( Http::crawl() ->sameDomain() ->sameHost(), ); 
$crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->not()->toContain('http://foo.example.com/crawling/main-on-subdomain'); }); it('crawls every page of a website that is linked somewhere', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/main') ->addStep(Http::crawl()); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toHaveCount(6) ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/main') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1/sub1'); }); it('crawls only to a certain depth when the crawl depth is defined', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/main') ->addStep(Http::crawl()->depth(1)); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toHaveCount(3); $crawler = (new Crawler()) ->input('http://www.example.com/crawling/main') ->addStep(Http::crawl()->depth(2)); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toHaveCount(5); }); it('extracts URLs from a sitemap if you call method inputIsSitemap()', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/sitemap.xml') ->addStep(Http::crawl()->inputIsSitemap()); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toHaveCount(7); }); it('fails to extract URLs if you provide a sitemap as input and don\'t call inputIsSitemap()', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/sitemap.xml') ->addStep(Http::crawl()); 
$crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toHaveCount(1); }); it( 'extracts URLs from a sitemap where the tag contains attributes that cause symfony DomCrawler to fail', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/sitemap2.xml') ->addStep(Http::crawl()->inputIsSitemap()); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toHaveCount(7); }, ); it('loads only pages where the path starts with a certain string when method pathStartsWith() is called', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/sitemap.xml') ->addStep( Http::crawl() ->inputIsSitemap() ->pathStartsWith('/crawling/sub1'), ); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toHaveCount(3) ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sitemap.xml') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1'); }); it('loads only URLs where the path matches a regex when method pathMatches() is used', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/sitemap.xml') ->addStep( Http::crawl() ->inputIsSitemap() ->pathMatches('/^\/crawling\/sub[12]$/'), ); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toHaveCount(3); }); it('loads only URLs where the Closure passed to method customFilter() returns true', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/sitemap.xml') ->addStep( Http::crawl() ->inputIsSitemap() ->customFilter(function (Url $url) { return in_array($url->path(), [ '/crawling/main', '/crawling/sub1/sub1', '/crawling/sub2/sub1/sub1', ], true); }), ); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toHaveCount(4) 
->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/main') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1/sub1'); }); it( 'receives the link element where the URL was found, as second param in the Closure passed to method ' . 'customFilter() when it was found in an HTML document', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/main') ->addStep( Http::crawl() ->customFilter(function (Url $url, ?HtmlElement $linkElement) { return $linkElement && str_contains($linkElement->text(), 'Subpage 2'); }), ); $crawler->runAndTraverse(); expect($crawler->getLoader()->loadedUrls)->toHaveCount(4) ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/main') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1') ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1/sub1'); }, ); it( 'loads all pages, but yields only responses where the URL path starts with a certain string, when methods ' . 'pathStartsWith() and loadAllButYieldOnlyMatching() are called', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/sitemap.xml') ->addStep( Http::crawl() ->inputIsSitemap() ->pathStartsWith('/crawling/sub2') ->loadAllButYieldOnlyMatching(), ); $results = helper_generatorToArray($crawler->run()); expect($crawler->getLoader()->loadedUrls)->toHaveCount(7) ->and($results)->toHaveCount(3); }, ); it( 'loads all URLs, but yields only responses where the URL path matches a regex, when methods pathMatches() and ' . 
'loadAllButYieldOnlyMatching() are called', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/sitemap.xml') ->addStep( Http::crawl() ->inputIsSitemap() ->pathMatches('/^\/crawling\/sub[12]$/') ->loadAllButYieldOnlyMatching(), ); $results = helper_generatorToArray($crawler->run()); expect($crawler->getLoader()->loadedUrls)->toHaveCount(7) ->and($results)->toHaveCount(2); }, ); it( 'loads all URLs but yields only responses where the Closure passed to method customFilter() returns true, when ' . 'methods customFilter() and loadAllButYieldOnlyMatching() are called', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/sitemap.xml') ->addStep( Http::crawl() ->inputIsSitemap() ->customFilter(function (Url $url) { return in_array($url->path(), [ '/crawling/main', '/crawling/sub1/sub1', '/crawling/sub2/sub1/sub1', ], true); }) ->loadAllButYieldOnlyMatching(), ); $results = helper_generatorToArray($crawler->run()); expect($crawler->getLoader()->loadedUrls)->toHaveCount(7) ->and($results)->toHaveCount(3); }, ); it( 'keeps the fragment parts in URLs and treats the same URL with a different fragment part as separate URLs when ' . 'keepUrlFragment() was called', function () { // Explanation: in almost all cases URLs with a fragment part at the end (#something) will respond with the // same content. So, to avoid loading the same page multiple times, the step throws away the fragment part of // discovered URLs by default. 
$crawler = (new Crawler()) ->input('http://www.example.com/crawling/main') ->addStep(Http::crawl()->keepUrlFragment()->keep(['url'])); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(8); $urls = []; foreach ($results as $result) { $urls[] = $result->get('url'); } expect($urls)->toContain('http://www.example.com/crawling/sub2') ->and($urls)->toContain('http://www.example.com/crawling/sub2#fragment1') ->and($urls)->toContain('http://www.example.com/crawling/sub2#fragment2'); }, ); it('stops crawling when maxOutputs is reached', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/main') ->addStep( Http::crawl() ->keepUrlFragment() ->maxOutputs(4), ); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(4) ->and($crawler->getLoader()->loadedUrls)->toHaveCount(4); }); it('uses canonical links when useCanonicalLinks() is called', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/main') ->addStep( Http::crawl() ->useCanonicalLinks() ->keep(['url']), ); $results = helper_generatorToArray($crawler->run()); $resultUrls = array_map(function (Result $result) { return $result->get('url'); }, $results); expect($resultUrls) ->toBe([ 'http://www.example.com/crawling/main', 'http://www.example.com/crawling/sub1/sub1', // actual loaded url was sub1, but canonical is sub1/sub1 'http://www.example.com/crawling/sub2', 'http://www.example.com/crawling/sub2/sub1/sub1', ]) ->and($crawler->getLoader()->loadedUrls) ->toBe([ 'http://www.example.com/crawling/main', 'http://www.example.com/crawling/sub1', // => /crawling/sub1/sub1 => this URL wasn't loaded yet, 'http://www.example.com/crawling/sub2', // so when the link is discovered it won't load it. 'http://www.example.com/crawling/sub2/sub1', // => /crawling/sub1/sub1 => this URL was already loaded, 'http://www.example.com/crawling/sub2/sub1/sub1', // so the response is not yielded as a separate result. 
]); }); it('does not yield the same page twice when a URL was redirected to an already loaded page', function () { $crawler = (new Crawler()) ->input('http://www.example.com/crawling/redirect') ->addStep(Http::crawl()->keep(['url'])); $results = helper_generatorToArray($crawler->run()); $resultUrls = array_map(function (Result $result) { return $result->get('url'); }, $results); expect($resultUrls) ->toContain('http://www.example.com/crawling/main') ->and($resultUrls) ->not() ->toContain('http://www.example.com/crawling/redirect') ->and($this->getActualOutputForAssertion()) ->toContain('Was already loaded before. Do not process this page again.'); }); it('does not produce a fatal error when the initial request fails', function () { $crawler = (new Crawler()) ->input('http://www.example.com/not-allowed') ->addStep(Http::crawl()->keep(['url'])); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(0); }); ================================================ FILE: tests/_Integration/Http/ErrorResponsesTest.php ================================================ inputs(['http://localhost:8000/client-error-response']) ->addStep(Http::{$method}()->keepAs('response')); $results = helper_generatorToArray($crawler->run()); expect($results)->toBeEmpty(); })->with(['get', 'post', 'put', 'patch', 'delete']); it('does not yield server error responses by default', function (string $method) { $crawler = new ErrorCrawler(); $crawler->inputs(['http://localhost:8000/server-error-response']) ->addStep(Http::{$method}()->keepAs('response')); $results = helper_generatorToArray($crawler->run()); expect($results)->toBeEmpty(); })->with(['get', 'post', 'put', 'patch', 'delete']); it('yields client error responses when yieldErrorResponses() was called', function (string $method) { $crawler = new ErrorCrawler(); $crawler->inputs(['http://localhost:8000/client-error-response']) ->addStep(Http::{$method}()->yieldErrorResponses()->keepAs('response')); $results = 
helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1); })->with(['get', 'post', 'put', 'patch', 'delete']); it('yields server error responses when yieldErrorResponses() was called', function (string $method) { $crawler = new ErrorCrawler(); $crawler->inputs(['http://localhost:8000/server-error-response']) ->addStep(Http::{$method}()->yieldErrorResponses()->keepAs('response')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1); })->with(['get', 'post', 'put', 'patch', 'delete']); it( 'goes on crawling after a client error response when stopOnErrorResponse() wasn\'t called', function (string $method) { $crawler = new ErrorCrawler(); $crawler->inputs(['http://localhost:8000/client-error-response', 'http://localhost:8000/simple-listing']) ->addStep(Http::{$method}()->keepAs('response')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1); }, )->with(['get', 'post', 'put', 'patch', 'delete']); it( 'goes on crawling after a server error response when stopOnErrorResponse() wasn\'t called', function (string $method) { $crawler = new ErrorCrawler(); $crawler->inputs(['http://localhost:8000/server-error-response', 'http://localhost:8000/simple-listing']) ->addStep(Http::{$method}()->keepAs('response')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1); }, )->with(['get', 'post', 'put', 'patch', 'delete']); it( 'stops crawling (throws exception) after a client error response when the stopOnErrorResponse() method was called', function (string $method) { $crawler = new ErrorCrawler(); $crawler->inputs(['http://localhost:8000/client-error-response', 'http://localhost:8000/simple-listing']) ->addStep(Http::{$method}()->stopOnErrorResponse()); $crawler->runAndTraverse(); }, )->with(['get', 'post', 'put', 'patch', 'delete'])->throws(LoadingException::class); it( 'stops crawling (throws exception) after a server error response when the 
stopOnErrorResponse() method was called',
    function (string $method) {
        $crawler = new ErrorCrawler();

        // The first input has to be the server error route. With the client error route this test would only
        // repeat the client error case and never exercise a server error response.
        $crawler->inputs(['http://localhost:8000/server-error-response', 'http://localhost:8000/simple-listing'])
            ->addStep(
                Http::{$method}()
                    ->stopOnErrorResponse(),
            );

        $crawler->runAndTraverse();
    },
)->with(['get', 'post', 'put', 'patch', 'delete'])->throws(LoadingException::class);

it('does not log warnings about multiple loader hook calls when stopOnErrorResponse() is used', function () {
    $crawler = new ErrorCrawler();

    $crawler->inputs(['http://localhost:8000/hello-world', 'http://localhost:8000/simple-listing'])
        ->addStep(Http::get()->stopOnErrorResponse());

    $crawler->runAndTraverse();

    // None of the collected log messages may report a duplicate loader hook call.
    foreach ($crawler->getLogger()->messages as $message) {
        expect($message['message'])->not->toContain(' was already called in this load call.');
    }
});

================================================
FILE: tests/_Integration/Http/GzipTest.php
================================================
input('http://localhost:8000/gzip')
        ->addStep(Http::get()->keepAs('response'));

    $results = helper_generatorToArray($crawler->run());

    expect($results[0])->toBeInstanceOf(Result::class)
        ->and($results[0]->get('response'))->toBeInstanceOf(RespondedRequest::class)
        ->and(Http::getBodyString($results[0]->get('response')))->toBe('This is a gzip compressed string');
});

================================================
FILE: tests/_Integration/Http/HeadlessBrowserTest.php
================================================
useHeadlessBrowser();

        return $loader;
    }
}

class GetJsonFromResponseHtmlBody extends Step
{
    protected function invoke(mixed $input): Generator
    {
        $html = Http::getBodyString($input->response);

        $jsonString = (new HtmlDocument($html))->querySelector('body pre')?->text() ??
''; yield json_decode($jsonString, true);
    }
}

/**
 * Step that yields the text of the <body> element of the response's HTML, or an empty string
 * when no such element is found.
 */
class GetStringFromResponseHtmlBody extends Step
{
    protected function invoke(mixed $input): Generator
    {
        $document = new HtmlDocument(Http::getBodyString($input->response));

        $bodyText = $document->querySelector('body')?->text();

        yield $bodyText ?? '';
    }
}

/**
 * Reads the loader's cookieJar property via invade() and returns what the jar's allByDomain() gives for $domain.
 *
 * @return Cookie[]
 */
function helper_getCookiesByDomainFromLoader(HttpLoader $loader, string $domain): array
{
    /** @var CookieJar $jar */
    $jar = invade($loader)->cookieJar;

    return $jar->allByDomain($domain);
}

it('automatically uses the Loader\'s user agent', function () {
    $crawler = new HeadlessBrowserCrawler();

    $crawler->input('http://localhost:8000/print-headers')
        ->addStep(Http::get())
        ->addStep((new GetJsonFromResponseHtmlBody())->keepAs('responseBody'));

    $results = helper_generatorToArray($crawler->run());

    // Check the count first, so a missing result fails on the count and not on an undefined offset.
    expect($results)->toHaveCount(1);

    $responseBody = $results[0]->get('responseBody');

    expect($responseBody)->toBeArray()
        ->toHaveKey('User-Agent')
        ->and($responseBody['User-Agent'])->toBe('HeadlessBrowserBot');
});

it(
    'does not use the user-agent defined in the crawler, when useNativeUserAgent() was called on the browser loader ' .
'helper', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->getLoader() ->browser() ->useNativeUserAgent(); $crawler->input('http://localhost:8000/print-headers') ->addStep(Http::get()) ->addStep((new GetJsonFromResponseHtmlBody())->keepAs('responseBody')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1) ->and($results[0]->get('responseBody'))->toBeArray() ->and($results[0]->get('responseBody'))->toHaveKey('User-Agent') ->and($results[0]->get('responseBody')['User-Agent'])->toStartWith('Mozilla/5.0 ('); }, ); it('uses cookies', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->input('http://localhost:8000/set-cookie') ->addStep(Http::get()) ->addStep(new class extends Step { protected function invoke(mixed $input): Generator { yield 'http://localhost:8000/print-cookie'; } }) ->addStep(Http::get()) ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1) ->and($results[0]->get('printed-cookie'))->toBeString() ->and($results[0]->get('printed-cookie'))->toBe('foo123'); }); it('does not use cookies when HttpLoader::dontUseCookies() was called', function () { $crawler = new HeadlessBrowserCrawler(); $crawler->getLoader()->dontUseCookies(); $crawler ->input('http://localhost:8000/set-cookie') ->addStep(Http::get()) ->addStep(new class extends Step { protected function invoke(mixed $input): Generator { yield 'http://localhost:8000/print-cookie'; } }) ->addStep(Http::get()) ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1) ->and($results[0]->get('printed-cookie'))->toBeEmpty(); }); it('renders javascript', function () { $crawler = new HeadlessBrowserCrawler(); $crawler->input('http://localhost:8000/js-rendering') ->addStep(Http::get()) ->addStep( Html::root() ->extract(['content' => 
'#content p']), ); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1) ->and($results[0]->toArray())->toBe([ 'content' => 'This was added through javascript', ]); }); it('gets cookies that are set via javascript', function () { $crawler = new HeadlessBrowserCrawler(); $cache = new FileCache(helper_cachedir()); $cache->clear(); $crawler->getLoader()->setCache($cache); $crawler ->input('http://localhost:8000/set-js-cookie') ->addStep(Http::get()); helper_generatorToArray($crawler->run()); $cookiesInJar = helper_getCookiesByDomainFromLoader($crawler->getLoader(), 'localhost'); $testCookie = $cookiesInJar['testcookie'] ?? null; expect($cookiesInJar)->toHaveCount(1) ->and($testCookie?->name())->toBe('testcookie') ->and($testCookie?->value())->toBe('javascriptcookie'); // Check that cookie is not added to the cookiejar when the response was served from cache. $crawler = new HeadlessBrowserCrawler(); $crawler->getLoader()->setCache($cache); $crawler ->input('http://localhost:8000/set-js-cookie') ->addStep(Http::get()); helper_generatorToArray($crawler->run()); $cookiesInJar = helper_getCookiesByDomainFromLoader($crawler->getLoader(), 'localhost'); expect($cookiesInJar)->toHaveCount(0); }); it('gets a cookie that is set via a click, executed via post browser navigate hook', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->input('http://localhost:8000/set-delayed-js-cookie') ->addStep( Http::get() ->postBrowserNavigateHook(BrowserAction::clickElement('#consent_btn')), ) ->addStep(new class extends Step { protected function invoke(mixed $input): Generator { yield 'http://localhost:8000/print-cookie'; } }) ->addStep(Http::get()) ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1) ->and($results[0]->get('printed-cookie'))->toBeString() ->and($results[0]->get('printed-cookie'))->toBe('javascriptcookie'); $cookiesInJar = 
helper_getCookiesByDomainFromLoader($crawler->getLoader(), 'localhost'); $testCookie = $cookiesInJar['testcookie'] ?? null; expect($cookiesInJar)->toHaveCount(1) ->and($testCookie?->name())->toBe('testcookie') ->and($testCookie?->value())->toBe('javascriptcookie'); }); it( 'sending cookies works correctly when the loader is not configured to use the browser but two steps use the ' . 'browser by calling the useBrowser() method of Http steps', function () { $crawler = HttpCrawler::make()->withMozilla5CompatibleUserAgent(); $crawler ->input('http://localhost:8000/set-multiple-js-cookies') ->addStep(Http::get()->useBrowser()) ->addStep(new class extends Step { protected function invoke(mixed $input): Generator { yield 'http://localhost:8000/print-cookies'; } }) ->addStep(Http::get()->useBrowser()) ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookies')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1) ->and($results[0]->get('printed-cookies'))->toBeString() ->and($results[0]->get('printed-cookies')) ->toBe('cookie3=cookie3value;cookie2=cookie2value;cookie1=cookie1value'); $cookiesInJar = helper_getCookiesByDomainFromLoader($crawler->getLoader(), 'localhost'); expect($cookiesInJar)->toHaveCount(3) ->and($cookiesInJar['cookie1']->value())->toBe('cookie1value') ->and($cookiesInJar['cookie2']->value())->toBe('cookie2value') ->and($cookiesInJar['cookie3']->value())->toBe('cookie3value'); }, ); test( 'BrowserAction::clickElement(), clickInsideShadowDom(), evaluate(), moveMouseToElement(), ' . 
'moveMouseToPosition(), scrollDown(), scrollUp() and typeText() work as expected', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->getLoader() ->browser() ->includeShadowElementsInHtml(); $crawler ->input('http://localhost:8000/browser-actions') ->addStep( Http::get() // Inserting the #click_element is delayed in the page, so this also tests, that the // BrowserAction::clickElement() action automatically waits for an element matching the selector // to be present. ->postBrowserNavigateHook(BrowserAction::clickElement('#click_element')) ->postBrowserNavigateHook(BrowserAction::screenshot(ScreenshotConfig::make(helper_storagedir()))) ->postBrowserNavigateHook(BrowserAction::clickInsideShadowDom('#shadow_host', '#shadow_click_div')) ->postBrowserNavigateHook( BrowserAction::evaluate( 'document.getElementById(\'evaluation_container\').innerHTML = \'evaluated\'', ), ) ->postBrowserNavigateHook(BrowserAction::moveMouseToElement('#mouseover_check_1')) ->postBrowserNavigateHook(BrowserAction::moveMouseToPosition(305, 405)) ->postBrowserNavigateHook(BrowserAction::scrollDown(4000)) ->postBrowserNavigateHook( BrowserAction::screenshot( ScreenshotConfig::make(helper_storagedir()) ->setImageFileType('jpeg') ->setQuality(20) ->setFullPage(), ), ) ->postBrowserNavigateHook(BrowserAction::scrollUp(2000)) ->postBrowserNavigateHook(BrowserAction::scrollUp(2000)) ->postBrowserNavigateHook(BrowserAction::clickElement('#input')) ->postBrowserNavigateHook(BrowserAction::typeText('typing text works')) ->keep(['body', 'screenshots']), ); $results = helper_generatorToArray($crawler->run()); $body = $results[0]->get('body'); $screenshots = $results[0]->get('screenshots'); expect($body)->toContain('
    yes
    ') // This also tests the `HeadlessBrowserLoaderHelper::includeShadowElementsInHtml()` method, // because even if the click worked, with the normal way of getting HTML this wouldn't be // included in the returned HTML. ->and($body)->toContain('
    clicked
    ') ->and($body)->toContain('
    evaluated
    ') ->and($body)->toContain('
    mouse was here
    ') ->and($body)->toContain('
    mouse was here
    ') ->and($body)->toContain('
    scrolled down
    ') ->and($body)->toContain('
    scrolled up
    ') ->and($body)->toContain('
    typing text works
    ') ->and($screenshots)->toHaveCount(2) ->and($screenshots[0])->toEndWith('.png') ->and($screenshots[1])->toEndWith('.jpeg'); if (function_exists('getimagesize')) { $screenshot1Size = getimagesize($screenshots[0]); $screenshot2Size = getimagesize($screenshots[1]); if (is_array($screenshot1Size) && is_array($screenshot2Size)) { expect($screenshot1Size[1])->toBeLessThan(2100) ->and($screenshot2Size[1])->toBeGreaterThan(4000); } } helper_resetStorageDir(); }, ); test('BrowserAction::waitUntilDocumentContainsElement() works as expected', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->input('http://localhost:8000/browser-actions/wait') ->addStep( Http::get() ->postBrowserNavigateHook( BrowserAction::waitUntilDocumentContainsElement('#delayed_container'), ) ->keep('body'), ); $results = helper_generatorToArray($crawler->run()); $body = $results[0]->get('body'); expect($body)->toContain('
    hooray
    '); }); test('BrowserAction::clickElementAndWaitForReload() works as expected', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->input('http://localhost:8000/browser-actions/click-and-wait-for-reload') ->addStep( Http::get() ->postBrowserNavigateHook(BrowserAction::clickElementAndWaitForReload('#click')) ->keep('body'), ); $results = helper_generatorToArray($crawler->run()); $body = $results[0]->get('body'); expect($body)->toContain('
    yes
    '); }); test( 'when on the click and wait for reload page, and the element is only clicked but we don\'t wait for reload, ' . 'we don\'t get the reloaded page content', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->input('http://localhost:8000/browser-actions/click-and-wait-for-reload') ->addStep( Http::get() ->postBrowserNavigateHook(BrowserAction::clickElement('#click')) ->keep('body'), ); $results = helper_generatorToArray($crawler->run()); $body = $results[0]->get('body'); expect($body)->not()->toContain('
    yes
    '); }, ); test( 'when on the click and wait for reload page, and the element is clicked and we also wait for reload, we get the ' . 'reloaded page content', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->input('http://localhost:8000/browser-actions/click-and-wait-for-reload') ->addStep( Http::get() ->postBrowserNavigateHook(BrowserAction::clickElement('#click')) ->postBrowserNavigateHook(BrowserAction::waitForReload()) ->keep('body'), ); $results = helper_generatorToArray($crawler->run()); $body = $results[0]->get('body'); expect($body)->toContain('
    yes
    '); }, ); test('BrowserAction::evaluateAndWaitForReload() works as expected', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->input('http://localhost:8000/browser-actions/evaluate-and-wait-for-reload') ->addStep( Http::get() ->postBrowserNavigateHook( BrowserAction::evaluateAndWaitForReload( 'window.location.href = \'http://localhost:8000/browser-actions/' . 'evaluate-and-wait-for-reload-reloaded\'', ), ) ->keep('body'), ); $results = helper_generatorToArray($crawler->run()); $body = $results[0]->get('body'); expect($body)->toContain('
    yay
    '); }); test('BrowserAction::wait() works as expected', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->input('http://localhost:8000/browser-actions/wait') ->addStep( Http::get() ->postBrowserNavigateHook(BrowserAction::wait(0.3)) ->keep('body'), ); $results = helper_generatorToArray($crawler->run()); $body = $results[0]->get('body'); expect($body)->toContain('
    hooray
    '); }); it('executes the javascript code provided via HeadlessBrowserLoaderHelper::setPageInitScript()', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->getLoader() ->browser() ->setPageInitScript('window._secret_content = \'secret content\''); $crawler ->input('http://localhost:8000/page-init-script') ->addStep(Http::get()) ->addStep(Html::root()->extract(['content' => '#content'])); $results = helper_generatorToArray($crawler->run()); expect($results[0]->get('content'))->toBe('secret content'); }); it('gets the source of an XML response without being wrapped in an HTML document', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->input('http://localhost:8000/rss-feed') ->addStep(Http::get()->keep(['body'])); $results = helper_generatorToArray($crawler->run()); expect($results[0]->get('body'))->toStartWith('' . PHP_EOL . 'input('http://localhost:8000/broken-mime-type-rss') ->addStep(Http::get()->keep(['body'])); $results = helper_generatorToArray($crawler->run()); expect($results[0]->get('body'))->toStartWith(''); }, ); ================================================ FILE: tests/_Integration/Http/Html/PaginatedListingTest.php ================================================ input('http://localhost:8000/paginated-listing'); $crawler ->addStep(Http::get()->paginate('#nextPage')) ->addStep(Html::getLinks('#listing .item a')->keepAs('url')) ->addStep(Http::get()) ->addStep( Html::first('article') ->extract(['title' => 'h1', 'number' => '.someNumber']) ->keep(), ); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(10) ->and($results[0]->toArray())->toBe([ 'url' => 'http://localhost:8000/paginated-listing/items/1', 'title' => 'Some Item 1', 'number' => '10', ]) ->and($results[9]->toArray())->toBe([ 'url' => 'http://localhost:8000/paginated-listing/items/10', 'title' => 'Some Item 10', 'number' => '100', ]); }); ================================================ FILE: 
tests/_Integration/Http/Html/SimpleListingTest.php ================================================ input('http://localhost:8000/simple-listing'); $crawler->addStep(Http::get()) ->addStep(Html::getLinks('.listingItem a')) ->addStep(Http::get()) ->addStep( Html::first('article') ->extract([ 'title' => 'h1', 'date' => '.date', 'author' => '.articleAuthor', ]) ->keep(), ); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(3) ->and($results[0]->toArray())->toBe([ 'title' => 'Some Article 1', 'date' => '2022-04-13', 'author' => 'Christian Olear', ]) ->and($results[1]->toArray())->toBe([ 'title' => 'Some Article 2', 'date' => '2022-04-14', 'author' => 'Christian Olear', ]) ->and($results[2]->toArray())->toBe([ 'title' => 'Some Article 3', 'date' => '2022-04-15', 'author' => 'Christian Olear', ]); }); ================================================ FILE: tests/_Integration/Http/PaginationTest.php ================================================ input('http://localhost:8000/paginated-listing') ->addStep(Http::get()->paginate('#pagination')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(5); }); it('only iterates pagination until max pages limit is reached', function () { $crawler = new PaginationCrawler(); $crawler->input('http://localhost:8000/paginated-listing') ->addStep(Http::get()->paginate('#pagination', 2)); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(2) ->and($this->getActualOutputForAssertion())->toContain('Max pages limit reached'); }); it('resets the finished paginating state after each processed (/paginated) input', function () { $crawler = new PaginationCrawler(); $crawler ->inputs(['http://localhost:8000/paginated-listing', 'http://localhost:8000/paginated-listing?foo=bar']) ->addStep(Http::get()->paginate('#pagination', 2)->outputKey('response')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(4); }); 
================================================ FILE: tests/_Integration/Http/ProxyingTest.php ================================================ */ public static array $processes = [8001 => null, 8002 => null, 8003 => null]; } beforeEach(function () { $startedProcesses = false; foreach (ProxyServerProcesses::PORTS as $port) { if (!ProxyServerProcesses::$processes[$port]) { ProxyServerProcesses::$processes[$port] = Process::fromShellCommandline( 'php -S localhost:' . $port . ' ' . __DIR__ . '/../ProxyServer.php', ); ProxyServerProcesses::$processes[$port]->start(); $startedProcesses = true; } } if ($startedProcesses) { usleep(100_000); } }); afterAll(function () { foreach (ProxyServerProcesses::PORTS as $port) { ProxyServerProcesses::$processes[$port]?->stop(3, SIGINT); ProxyServerProcesses::$processes[$port] = null; } }); it('uses a proxy when the useProxy() method of the loader was called', function () { $crawler = helper_getFastCrawler(); $crawler->getLoader()->useProxy('http://localhost:8001'); $crawler ->input('http://www.crwlr.software/packages') ->addStep(Http::get()->keep(['body'])); $results = iterator_to_array($crawler->run()); expect($results[0]) ->toBeInstanceOf(Result::class) ->and($results[0]->get('body')) ->toContain('Proxy Server Response for http://www.crwlr.software/packages'); }); it('uses correct method, headers and HTTP version in the proxied request', function () { $crawler = helper_getFastCrawler(); $crawler->getLoader()->useProxy('http://localhost:8001'); $crawler ->input('http://www.crwlr.software/packages') ->addStep( Http::put(['Accept-Encoding' => 'gzip, deflate, br'], 'Hello World', '1.0') ->keep(['body']), ); $results = iterator_to_array($crawler->run()); expect($results[0]) ->toBeInstanceOf(Result::class) ->and($results[0]->get('body')) ->toContain('Protocol Version: HTTP/1.0') ->toContain('Request Method: PUT') ->toContain('Request Body: Hello World') ->toContain('["Accept-Encoding"]=>' . PHP_EOL . 
' string(17) "gzip, deflate, br"'); }); it('uses rotating proxies when the useRotatingProxies() method of the loader was called', function () { $crawler = helper_getFastCrawler(); $crawler->getLoader()->useRotatingProxies([ 'http://localhost:8001', 'http://localhost:8002', 'http://localhost:8003', ]); $crawler ->input([ 'http://www.crwlr.software/packages/crawler/v1.1/getting-started', 'http://www.crwlr.software/packages/url/v2.0/getting-started', 'http://www.crwlr.software/packages/query-string/v1.0/getting-started', 'http://www.crwlr.software/packages/robots-txt/v1.1/getting-started', ]) ->addStep(Http::get()->keep(['body'])); $results = iterator_to_array($crawler->run()); expect($results)->toHaveCount(4) ->and($results[0]) ->toBeInstanceOf(Result::class) ->and($results[0]->get('body')) ->toContain('Port: 8001') // First request with first proxy ->and($results[1]) ->toBeInstanceOf(Result::class) ->and($results[1]->get('body')) ->toContain('Port: 8002') // Second request with second proxy ->and($results[2]) ->toBeInstanceOf(Result::class) ->and($results[2]->get('body')) ->toContain('Port: 8003') // Third request with third proxy ->and($results[3]) ->toBeInstanceOf(Result::class) ->and($results[3]->get('body')) ->toContain('Port: 8001'); // And finally the fourth request with the first proxy again. }); it('can also use a proxy when using the headless browser', function () { $crawler = helper_getFastCrawler(); $crawler ->getLoader() ->useHeadlessBrowser() ->useProxy('http://localhost:8001'); $crawler ->input('http://www.crwlr.software/blog') ->addStep( Http::get(['Accept-Language' => 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7']) ->keep(['body']), ); $results = iterator_to_array($crawler->run()); expect($results[0]) ->toBeInstanceOf(Result::class) ->and($results[0]->get('body')) ->toContain('["Accept-Language"]=>' . PHP_EOL . 
' string(35) "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"'); }); it('can also use rotating proxies when using the headless browser', function () { $crawler = helper_getFastCrawler(); $crawler ->getLoader() ->useHeadlessBrowser() ->useRotatingProxies([ 'http://localhost:8001', 'http://localhost:8002', ]); $crawler ->input([ 'http://www.crwlr.software/packages/crawler/v1.1', 'http://www.crwlr.software/packages/url/v2.0', 'http://www.crwlr.software/packages/query-string/v1.0', ]) ->addStep(Http::get()->keep(['body'])); $results = iterator_to_array($crawler->run()); expect($results)->toHaveCount(3) ->and($results[0]) ->toBeInstanceOf(Result::class) ->and($results[0]->get('body')) ->toContain('Port: 8001') // First request with first proxy ->and($results[1]) ->toBeInstanceOf(Result::class) ->and($results[1]->get('body')) ->toContain('Port: 8002') // Second request with second proxy ->and($results[2]) ->toBeInstanceOf(Result::class) ->and($results[2]->get('body')) ->toContain('Port: 8001'); // And finally the third request with the first proxy again. 
}); ================================================ FILE: tests/_Integration/Http/PublisherExampleTest.php ================================================ input('http://localhost:8000/publisher/authors') ->addStep(Http::get()) ->addStep(Html::getLinks('#authors a')) ->addStep(Http::get()) ->addStep( Html::root() ->extract([ 'author' => 'h1', 'bookUrls' => Dom::cssSelector('#author-data .books a.book')->attribute('href')->toAbsoluteUrl(), ]) ->keep(['author']), ) ->addStep(Http::get()->useInputKey('bookUrls')) ->addStep( Html::root() ->extract(['book' => 'h1']) ->keep(), ); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(5) ->and($results[0]->toArray())->toBe([ 'author' => 'John Example', 'book' => 'Some novel', ]) ->and($results[1]->toArray())->toBe([ 'author' => 'John Example', 'book' => 'Another novel', ]) ->and($results[2]->toArray())->toBe([ 'author' => 'Susan Example', 'book' => 'Poems #1', ]) ->and($results[3]->toArray())->toBe([ 'author' => 'Susan Example', 'book' => 'Poems #2', ]) ->and($results[4]->toArray())->toBe([ 'author' => 'Susan Example', 'book' => 'Poems #3', ]); }); it('turns an array of URLs to nested extracted data from those child pages using sub crawlers', function () { $crawlerBuilder = new class { public function build(): \Crwlr\Crawler\Crawler { $crawler = new PublisherExampleCrawler(); return $crawler ->input('http://localhost:8000/publisher/authors') ->addStep(Http::get()) ->addStep(Html::getLinks('#authors a')) ->addStep(Http::get()) ->addStep($this->extractAuthorData()); } private function extractAuthorData(): Html { return Html::root() ->extract([ 'name' => 'h1', 'age' => '#author-data .age', 'bornIn' => '#author-data .born-in', 'books' => Dom::cssSelector('#author-data .books a.book')->link(), ]) ->subCrawlerFor('books', function (\Crwlr\Crawler\Crawler $crawler) { return $crawler ->addStep(Http::get()) ->addStep( $this->extractBookData(), ); }); } private function extractBookData(): Html { return 
Html::root() ->extract(['title' => 'h1', 'editions' => Dom::cssSelector('#editions a')->link()]) ->subCrawlerFor('editions', function (\Crwlr\Crawler\Crawler $crawler) { return $crawler ->addStep(Http::get()) ->addStep($this->extractEditionData()); }); } private function extractEditionData(): Html { return Html::root() ->extract(['year' => '.year', 'publisher' => '.publishingCompany']); } }; $results = helper_generatorToArray($crawlerBuilder->build()->run()); expect($results)->toHaveCount(2) ->and($results[0]->toArray())->toBe([ 'name' => 'John Example', 'age' => '51', 'bornIn' => 'Lisbon', 'books' => [ [ 'title' => 'Some novel', 'editions' => [ ['year' => '1996', 'publisher' => 'Foo'], ['year' => '2005', 'publisher' => 'Foo'], ], ], [ 'title' => 'Another novel', 'editions' => [ ['year' => '2001', 'publisher' => 'Foo'], ['year' => '2009', 'publisher' => 'Bar'], ['year' => '2017', 'publisher' => 'Bar'], ], ], ], ]) ->and($results[1]->toArray())->toBe([ 'name' => 'Susan Example', 'age' => '49', 'bornIn' => 'Athens', 'books' => [ [ 'title' => 'Poems #1', 'editions' => [ ['year' => '2008', 'publisher' => 'Poems'], ['year' => '2009', 'publisher' => 'Poems'], ], ], [ 'title' => 'Poems #2', 'editions' => [ ['year' => '2011', 'publisher' => 'Poems'], ['year' => '2014', 'publisher' => 'New Poems'], ], ], [ 'title' => 'Poems #3', 'editions' => [ ['year' => '2013', 'publisher' => 'Poems'], ['year' => '2017', 'publisher' => 'New Poems'], ], ], ], ]); }); test('it can also keep the URLs, provided to the sub crawler', function () { $crawlerBuilder = new class { public function build(): \Crwlr\Crawler\Crawler { $crawler = new PublisherExampleCrawler(); return $crawler ->input('http://localhost:8000/publisher/authors') ->addStep(Http::get()) ->addStep(Html::getLinks('#authors a')) ->addStep(Http::get()) ->addStep($this->extractAuthorData()); } private function extractAuthorData(): Html { return Html::root() ->extract([ 'name' => 'h1', 'age' => '#author-data .age', 'bornIn' => 
'#author-data .born-in', 'books' => Dom::cssSelector('#author-data .books a.book')->link(), ]) ->subCrawlerFor('books', function (\Crwlr\Crawler\Crawler $crawler) { return $crawler ->addStep(Http::get()->keepInputAs('url')) ->addStep($this->extractBookData()); }); } private function extractBookData(): Html { return Html::root() ->extract(['title' => 'h1', 'editions' => Dom::cssSelector('#editions a')->link()]) ->subCrawlerFor('editions', function (\Crwlr\Crawler\Crawler $crawler) { return $crawler ->addStep(Http::get()->keepInputAs('url')) ->addStep($this->extractEditionData()); }); } private function extractEditionData(): Html { return Html::root() ->extract(['year' => '.year', 'publisher' => '.publishingCompany']); } }; $results = helper_generatorToArray($crawlerBuilder->build()->run()); expect($results)->toHaveCount(2) ->and($results[0]->toArray())->toBe([ 'name' => 'John Example', 'age' => '51', 'bornIn' => 'Lisbon', 'books' => [ [ 'url' => 'http://localhost:8000/publisher/books/1', 'title' => 'Some novel', 'editions' => [ [ 'url' => 'http://localhost:8000/publisher/books/1/edition/1', 'year' => '1996', 'publisher' => 'Foo', ], [ 'url' => 'http://localhost:8000/publisher/books/1/edition/2', 'year' => '2005', 'publisher' => 'Foo', ], ], ], [ 'url' => 'http://localhost:8000/publisher/books/2', 'title' => 'Another novel', 'editions' => [ [ 'url' => 'http://localhost:8000/publisher/books/2/edition/1', 'year' => '2001', 'publisher' => 'Foo', ], [ 'url' => 'http://localhost:8000/publisher/books/2/edition/2', 'year' => '2009', 'publisher' => 'Bar', ], [ 'url' => 'http://localhost:8000/publisher/books/2/edition/3', 'year' => '2017', 'publisher' => 'Bar', ], ], ], ], ]) ->and($results[1]->toArray())->toBe([ 'name' => 'Susan Example', 'age' => '49', 'bornIn' => 'Athens', 'books' => [ [ 'url' => 'http://localhost:8000/publisher/books/3', 'title' => 'Poems #1', 'editions' => [ [ 'url' => 'http://localhost:8000/publisher/books/3/edition/1', 'year' => '2008', 'publisher' => 
'Poems', ], [ 'url' => 'http://localhost:8000/publisher/books/3/edition/2', 'year' => '2009', 'publisher' => 'Poems', ], ], ], [ 'url' => 'http://localhost:8000/publisher/books/4', 'title' => 'Poems #2', 'editions' => [ [ 'url' => 'http://localhost:8000/publisher/books/4/edition/1', 'year' => '2011', 'publisher' => 'Poems', ], [ 'url' => 'http://localhost:8000/publisher/books/4/edition/2', 'year' => '2014', 'publisher' => 'New Poems', ], ], ], [ 'url' => 'http://localhost:8000/publisher/books/5', 'title' => 'Poems #3', 'editions' => [ [ 'url' => 'http://localhost:8000/publisher/books/5/edition/1', 'year' => '2013', 'publisher' => 'Poems', ], [ 'url' => 'http://localhost:8000/publisher/books/5/edition/2', 'year' => '2017', 'publisher' => 'New Poems', ], ], ], ], ]); }); ================================================ FILE: tests/_Integration/Http/QueryParamPaginationTest.php ================================================ input('http://localhost:8000/query-param-pagination') ->addStep( Http::post(body: 'page=1') ->paginate( Paginator::queryParams(5) ->inBody() ->increase('page') ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items')), )->keep(['body']), ); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(4); }); it('also paginates using query params sent in the request body, when used in combination with static URL', function () { $crawler = new QueryParamPaginationCrawler(); $crawler ->input('foo') ->addStep( Http::post(body: 'page=1') ->staticUrl('http://localhost:8000/query-param-pagination') ->paginate( Paginator::queryParams(3) ->inBody() ->increase('page'), )->keep(['body']), ); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(3); }); it('paginates using URL query params', function () { $crawler = new QueryParamPaginationCrawler(); $crawler ->input('http://localhost:8000/query-param-pagination?page=1') ->addStep( Http::get() ->paginate( Paginator::queryParams(5) ->inUrl() 
->increase('page') ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items')), )->keep(['body']), ); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(4); }); it('paginates only until the max pages limit', function () { $crawler = new QueryParamPaginationCrawler(); $crawler ->input('http://localhost:8000/query-param-pagination?page=1') ->addStep( Http::get() ->paginate( QueryParamsPaginator::paramsInUrl(2) ->increase('page') ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items')), )->keep(['body']), ); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(2); }); it('resets the finished paginating state after each processed (/paginated) input', function () { $crawler = new QueryParamPaginationCrawler(); $crawler ->inputs([ 'http://localhost:8000/query-param-pagination?page=1', 'http://localhost:8000/query-param-pagination?page=1&foo=bar', ]) ->addStep( Http::get() ->paginate( QueryParamsPaginator::paramsInUrl(2) ->increase('page') ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items')), )->keep(['body']), ); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(4); }); ================================================ FILE: tests/_Integration/Http/RedirectTest.php ================================================ input('http://localhost:8000/redirect?stopAt=5') ->addStep(Http::get()) ->addStep((new GetResponseBodyAsString())->keepAs('body')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1) ->and($results[0]->get('body'))->toBe('success after 5 redirects'); }); it('stops at 10 redirects by default', function () { $crawler = new RedirectTestCrawler(); $crawler ->input('http://localhost:8000/redirect?stopAt=11') ->addStep(Http::get()) ->addStep((new GetResponseBodyAsString())->keepAs('body')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(0); $logOutput = $this->getActualOutputForAssertion(); 
expect($logOutput)->toContain('Failed to load http://localhost:8000/redirect?stopAt=11: Too many redirects.'); }); test('you can set your own max redirects limit', function () { $crawler = new class extends HttpCrawler { protected function userAgent(): UserAgentInterface { return new UserAgent('RedirectBot'); } protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface { $loader = parent::loader($userAgent, $logger); if ($loader instanceof HttpLoader) { $loader->setMaxRedirects(15); } return $loader; } }; $crawler ->input('http://localhost:8000/redirect?stopAt=11') ->addStep(Http::get()) ->addStep((new GetResponseBodyAsString())->keepAs('body')); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1) ->and($results[0]->get('body'))->toBe('success after 11 redirects'); }); ================================================ FILE: tests/_Integration/Http/RequestParamsFromInputTest.php ================================================ 'http://localhost:8000/print-headers', 'body' => 'test', 'headers' => [ 'header-x' => 'foo', 'header-y' => ['bar'], ], 'header-y' => 'baz', 'header-z' => ['quz'], ]; } }; $crawler = helper_getFastCrawler(); $crawler ->input('anything') ->addStep($paramsStep) ->addStep( Http::get() ->useInputKeyAsBody('body') ->useInputKeyAsHeaders('headers') ->useInputKeyAsHeader('header-y', 'header-y') ->useInputKeyAsHeader('header-z', 'header-z'), ) ->addStep(Json::all()); $results = helper_generatorToArray($crawler->run()); expect($results)->toHaveCount(1); $result = $results[0]->toArray(); expect($result['Content-Length'])->toBe('4'); expect($result['header-x'])->toBe('foo'); expect($result['header-y'])->toBe('bar, baz'); expect($result['header-z'])->toBe('quz'); }); ================================================ FILE: tests/_Integration/Http/RetryErrorResponsesTest.php ================================================ input('http://localhost:8000' . 
$path) ->addStep(Http::get()); $start = microtime(true); helper_generatorToArray($crawler->run()); $end = microtime(true); $diff = $end - $start; expect($diff)->toBeGreaterThan(3.0); expect($diff)->toBeLessThan(3.5); })->with(['/too-many-requests', '/service-unavailable']); it( 'starts the first retry after the number of seconds returned in the Retry-After HTTP header', function (string $path) { $crawler = new RetryErrorResponsesCrawler(); $crawler ->input('http://localhost:8000' . $path . '/retry-after') ->addStep(Http::get()); $start = microtime(true); helper_generatorToArray($crawler->run()); $end = microtime(true); $diff = $end - $start; expect($diff)->toBeGreaterThan(4.0); expect($diff)->toBeLessThan(4.5); }, )->with(['/too-many-requests', '/service-unavailable']); it('goes on crawling when a retry receives a successful response', function (string $path) { $crawler = new RetryErrorResponsesCrawler(); $crawler->input('http://localhost:8000' . $path . '/succeed-on-second-attempt') ->addStep(Http::get()); $start = microtime(true); $results = helper_generatorToArray($crawler->run()); $end = microtime(true); $diff = $end - $start; expect($results)->toHaveCount(1); expect($diff)->toBeGreaterThan(1.0); expect($diff)->toBeLessThan(1.5); })->with(['/too-many-requests', '/service-unavailable']); ================================================ FILE: tests/_Integration/Http/RobotsTxtTest.php ================================================ input('http://localhost:8000/hello-world') ->addStep(Http::get()) ->addStep(Html::root()->extract('body')->keepAs('body')); $results = helper_generatorToArray($crawler->run()); expect($results[0]->get('body'))->toBe('Hello World!'); $logger = $crawler->getLogger(); foreach ($logger->messages as $message) { expect($message['message'])->not->toContain(' was already called in this load call.'); } }); it('also does not warn about loader hooks being called multiple times when loadOrFail() is used', function () { // See comment in the test 
above. $crawler = new RobotsTxtCrawler(); $crawler ->input('http://localhost:8000/hello-world') ->addStep(Http::get()->stopOnErrorResponse()) ->addStep(Html::root()->extract('body')->keepAs('body')); $results = helper_generatorToArray($crawler->run()); expect($results[0]->get('body'))->toBe('Hello World!'); $logger = $crawler->getLogger(); foreach ($logger->messages as $message) { expect($message['message'])->not->toContain(' was already called in this load call.'); } }); ================================================ FILE: tests/_Integration/Http/TimeoutTest.php ================================================ 1, 'timeout' => 1, ]); } }; $crawler->input('http://localhost:8000/sleep') ->addStep(Http::get()); $crawler->runAndTraverse(); expect($this->getActualOutputForAssertion())->toContain('Operation timed out'); }); ================================================ FILE: tests/_Integration/ProxyServer.php ================================================ = $stopAt) { echo 'success after ' . $redirectNo . ' redirects'; return; } else { $stopAt = '&stopAt=' . $stopAt; } } else { $stopAt = ''; } header('Location: http://localhost:8000/redirect?no=' . ($redirectNo + 1) . $stopAt); } if (str_starts_with($route, '/non-utf-8-charset')) { return include(__DIR__ . '/_Server/NonUtf8.php'); } if (str_starts_with($route, '/page-init-script')) { return include(__DIR__ . '/_Server/PageInitScript.php'); } if ($route === '/rss-feed') { header('Content-Type: text/xml; charset=utf-8'); return include(__DIR__ . '/_Server/RssFeed.php'); } if ($route === '/broken-mime-type-rss') { header('Content-Type: application/rss+xml; charset=UTF-8'); return include(__DIR__ . '/_Server/BrokenMimeTypeRss.php'); } if ($route === '/robots.txt') { return << Prevent Homograph Attacks using the crwlr/url Package - crwlr.software

    Prevent Homograph Attacks using the crwlr/url Package

    2022-01-19

    This post is not about crawling or scraping, but about another valuable use case for the url package: preventing so-called homograph attacks.

    About the attack

    Homograph attacks use internationalized domain names (IDNs) in malicious links, with domains that look like those of trusted organizations. You might know attacks that try to trick you with typos like faecbook, or with zeros instead of Os (g00gle). With internationalized domain names, this kind of attack is even harder to spot, because it uses characters that look almost exactly like other characters (also depending on the font they're displayed with).

    Can you see the difference between those two As?

    a а

    No? But in fact they aren't the same. The second one is a Cyrillic character.
    You can check it e.g. by using PHP's ord function.

    var_dump(ord('a')); // int(97)
    var_dump(ord('а')); // int(208)

    Browsers already implemented mechanisms to warn users that a page they're visiting might not be as legitimate as they thought.

    But still: if your website links to URLs originating from user input, it's a good idea to keep an eye on URLs containing internationalized domain names.

    How to identify IDN urls using the Url class

    The Url class has the handy hasIdn method:

    $legitUrl = Url::parse('https://www.apple.com');
    $seemsLegitUrl = Url::parse('https://www.аpple.com');
    
    var_dump($legitUrl->hasIdn());              // bool(false)
    var_dump($seemsLegitUrl->hasIdn());         // bool(true)
    
    var_dump($legitUrl->__toString());          // string(21) "https://www.apple.com"
    var_dump($seemsLegitUrl->__toString());     // string(28) "https://www.xn--pple-43d.com"

    So you see, it's very easy to identify IDN URLs with it. Of course there are many legitimate IDNs, so you might not want to automatically block all of them. I'd suggest putting some kind of monitoring in place that notifies you about users posting links to IDNs.

    Maybe you're operating in a country where IDNs are very common. Maybe in that case you can find a way to automatically sort out legitimate uses from your area.

    ================================================ FILE: tests/_Integration/_Server/BrokenMimeTypeRss.php ================================================ Lorem ipsum https://www.example.com/ Lorem ipsum dolor sit amet Fri, 10 Jan 2025 10:48:01 +0000 en hourly 1 Foo https://www.example.com/some-article https://www.example.com/some-article#comments Fri, 10 Jan 2025 10:48:01 +0000 https://www.example.com/?a=123 Lorem ipsum dolor

    sit amet

    ]]>
    ================================================ FILE: tests/_Integration/_Server/BrowserActions/ClickAndWaitForReload.php ================================================ Hello World
    Click here
    yes
    ================================================ FILE: tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReload.php ================================================ Hello World ================================================ FILE: tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReloadReloaded.php ================================================ Hello World
    yay
    ================================================ FILE: tests/_Integration/_Server/BrowserActions/Main.php ================================================ Hello World
    mouse wasn't here yet
    mouse wasn't here yet
    not scrolled up yet
    not scrolled down yet
    ================================================ FILE: tests/_Integration/_Server/BrowserActions/Wait.php ================================================ Hello World
    ================================================ FILE: tests/_Integration/_Server/Crawling.php ================================================ /crawling/sub1 * => /crawling/sub1/sub1 * => /crawling/sub2 * => /crawling/sub2/sub1 * => /crawling/sub2/sub1/sub1 */ if ($route === '/crawling/sitemap.xml') { echo << http://www.example.com/crawling/main http://www.example.com/crawling/sub1 http://www.example.com/crawling/sub1/sub1 http://www.example.com/crawling/sub2 http://www.example.com/crawling/sub2/sub1 http://www.example.com/crawling/sub2/sub1/sub1 XML; } if ($route === '/crawling/sitemap2.xml') { echo << http://www.example.com/crawling/main http://www.example.com/crawling/sub1 http://www.example.com/crawling/sub1/sub1 http://www.example.com/crawling/sub2 http://www.example.com/crawling/sub2/sub1 http://www.example.com/crawling/sub2/sub1/sub1 XML; } if ($route === '/crawling' || $route === '/crawling/redirect') { header('Location: http://www.example.com/crawling/main?redirect=1', true, 301); return ''; } if ($route === '/crawling/main' || $route === '/crawling/main?redirect=1') { $showRedirectLinkHtml = ''; if (!empty($_GET['redirect'] ?? null)) { $showRedirectLinkHtml = PHP_EOL . 'link'; } echo << {$showRedirectLinkHtml} Subpage 1
    Subpage 2
    Subpage 2 - Fragment 1
    Subpage 2 - Fragment 2
    External link mailto link javascript link phone link broken link HTML; } if ($route === '/crawling/sub1') { echo << foo Subpage 1 of Subpage 1
    External link Link to subdomain HTML; } if ($route === '/crawling/sub1/sub1') { echo <<

    Final level of sub1

    Subpage 1 of Subpage 1

    Back to main HTML; } if ($route === '/crawling/sub2') { echo << Subpage 1 of Subpage 2 HTML; } if ($route === '/crawling/sub2/sub1') { echo << foo Subpage 1 of Subpage 1 of Subpage 2 HTML; } if ($route === '/crawling/sub2/sub1/sub1') { echo <<

    Final level of sub2

    Subpage 1 of Subpage 1 of Subpage 2

    Back to Subpage 2 HTML; } if ($route === '/crawling/main-on-subdomain') { echo <<

    Main page on subdomain

    HTML; } ================================================ FILE: tests/_Integration/_Server/HelloWorld.php ================================================ Hello World! Hello World! ================================================ FILE: tests/_Integration/_Server/JsGeneratedContent.php ================================================ JS Generated Content
    ================================================ FILE: tests/_Integration/_Server/NonUtf8.php ================================================ Non UTF-8 charset page
    ================================================ FILE: tests/_Integration/_Server/PageInitScript.php ================================================
    ================================================ FILE: tests/_Integration/_Server/PaginatedListing/Detail.php ================================================ Paginated listing item detail

    Some Item

    ================================================ FILE: tests/_Integration/_Server/PaginatedListing.php ================================================ Paginated Listing
    Item

    asdlfkj asdlfka jsdlfk ajsdflk

    Item

    asdflk jasdlfk asdlfk asldfk

    ================================================ FILE: tests/_Integration/_Server/PrintCookie.php ================================================ $value) { echo $key . '=' . $value . ($key !== $lastKey ? ';' : ''); } } ================================================ FILE: tests/_Integration/_Server/PrintHeaders.php ================================================ <?=$author?>

    ================================================ FILE: tests/_Integration/_Server/Publisher/AuthorsListPage.php ================================================ Example Publishing Authors

    Our authors

    ================================================ FILE: tests/_Integration/_Server/Publisher/BookDetailPage.php ================================================ Book

    First Edition ' . 'Second Edition'; } elseif ($bookNo === 2) { // Another Novel echo 'First Edition ' . 'Second Edition ' . 'Third Edition'; } ?>
    ================================================ FILE: tests/_Integration/_Server/Publisher/EditionDetailPage.php ================================================ Book Edition 1996 Foo'; } elseif ($edition === 2) { echo '2005 Foo'; } } elseif ($bookNo === 2) { // Another Novel if ($edition === 1) { echo '2001 Foo'; } elseif ($edition === 2) { echo '2009 Bar'; } elseif ($edition === 3) { echo '2017 Bar'; } } elseif ($bookNo === 3) { // Poems #1 if ($edition === 1) { echo '2008 Poems'; } elseif ($edition === 2) { echo '2009 Poems'; } } elseif ($bookNo === 4) { // Poems #2 if ($edition === 1) { echo '2011 Poems'; } elseif ($edition === 2) { echo '2014 New Poems'; } } elseif ($bookNo === 5) { // Poems #3 if ($edition === 1) { echo '2013 Poems'; } elseif ($edition === 2) { echo '2017 New Poems'; } } ?> ================================================ FILE: tests/_Integration/_Server/QueryParamPagination.php ================================================ Example https://www.example.com Public RSS feed en <![CDATA[Foo, bar, baz]]> Wed, 08 Jan 2025 12:14:47 GMT Christian Olear ================================================ FILE: tests/_Integration/_Server/ServiceUnavailable.php ================================================ yo
    {$cookies}
    ================================================ FILE: tests/_Integration/_Server/SetDelayedCookieJs.php ================================================ Hey
    ================================================ FILE: tests/_Integration/_Server/SetMultipleCookiesJs.php ================================================ yo ================================================ FILE: tests/_Integration/_Server/SimpleListing/Detail.php ================================================ Simple listing article detail

    Some Article

    2022-04-
    ================================================ FILE: tests/_Integration/_Server/SimpleListing.php ================================================ Simple article listing
    Article 1

    asdfa sdlfka sdflkja sdflkj

    Article 2

    asldfkj aldfk jaslfk asdjflkajsdlf

    Article 3

    asldfk aslfdkjasd flkajsdfl kajsdflakjsdlf

    ================================================ FILE: tests/_Integration/_Server/TooManyRequests.php ================================================ nextUrl); } /** * @return array */ public function getLoaded(): array { return $this->loaded; } public function getLoadedCount(): int { return $this->loadedCount; } public function getLatestRequest(): ?RequestInterface { return $this->latestRequest; } public function limitReached(): bool { return $this->maxPagesReached(); } public function setFinished(): AbstractPaginator { return parent::setFinished(); } } ================================================ FILE: tests/_Stubs/Crawlers/DummyOne.php ================================================ userAgentCalled += 1; return new DummyTwoUserAgent('FooBot'); } /** * @return DummyTwoLogger */ protected function logger(): LoggerInterface { $this->loggerCalled += 1; return new DummyTwoLogger(); } /** * @return DummyTwoLoader */ protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface { $this->loaderCalled += 1; return new DummyTwoLoader($userAgent, null, $logger); } } ================================================ FILE: tests/_Stubs/DummyLogger.php ================================================ > */ public array $messages = []; public function emergency(string|Stringable $message, array $context = []): void { $this->log('emergency', $message, $context); } public function alert(string|Stringable $message, array $context = []): void { $this->log('alert', $message, $context); } public function critical(string|Stringable $message, array $context = []): void { $this->log('critical', $message, $context); } public function error(string|Stringable $message, array $context = []): void { $this->log('error', $message, $context); } public function warning(string|Stringable $message, array $context = []): void { $this->log('warning', $message, $context); } public function notice(string|Stringable $message, array $context = []): void { 
$this->log('notice', $message, $context); } public function info(string|Stringable $message, array $context = []): void { $this->log('info', $message, $context); } public function debug(string|Stringable $message, array $context = []): void { $this->log('debug', $message, $context); } /** * @param mixed $level * @param mixed[] $context */ public function log($level, string|Stringable $message, array $context = []): void { if (!is_string($level)) { throw new InvalidArgumentException('Level must be string.'); } if (!in_array($level, ['emergency', 'alert', 'critical', 'error', 'warning', 'notice', 'info', 'debug'], true)) { throw new UnexpectedValueException('Unknown log level.'); } $this->messages[] = ['level' => $level, 'message' => $message]; } } ================================================ FILE: tests/_Stubs/PhantasyLoader.php ================================================ request, $respondedRequest->response); } public static function fromArray(array $data): RespondedRequestChild { $respondedRequest = parent::fromArray($data); return self::fromRespondedRequest($respondedRequest); } public function itseme(): string { return 'mario'; } } ================================================ FILE: tests/_Temp/_cachedir/.gitkeep ================================================ ================================================ FILE: tests/_Temp/_storagedir/.gitkeep ================================================