Repository: apify/crawlee-python
Branch: master
Commit: 9becf12908f8
Files: 635
Total size: 2.6 MB

Directory structure:
gitextract_o1cy5s8w/

├── .editorconfig
├── .github/
│   ├── CODEOWNERS
│   ├── pull_request_template.md
│   └── workflows/
│       ├── _check_code.yaml
│       ├── _check_docs.yaml
│       ├── _release_docs.yaml
│       ├── _tests.yaml
│       ├── manual_release_stable.yaml
│       ├── on_issue.yaml
│       ├── on_master.yaml
│       ├── on_pull_request.yaml
│       └── on_schedule_tests.yaml
├── .gitignore
├── .markdownlint.yaml
├── .pre-commit-config.yaml
├── .rules.md
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── codecov.yaml
├── docs/
│   ├── deployment/
│   │   ├── apify_platform.mdx
│   │   ├── aws_lambda.mdx
│   │   ├── code_examples/
│   │   │   ├── apify/
│   │   │   │   ├── crawler_as_actor_example.py
│   │   │   │   ├── get_public_url.py
│   │   │   │   ├── log_with_config_example.py
│   │   │   │   ├── proxy_advanced_example.py
│   │   │   │   └── proxy_example.py
│   │   │   ├── aws/
│   │   │   │   ├── beautifulsoup_crawler_lambda.py
│   │   │   │   ├── playwright_crawler_lambda.py
│   │   │   │   └── playwright_dockerfile
│   │   │   └── google/
│   │   │       ├── cloud_run_example.py
│   │   │       └── google_example.py
│   │   ├── google_cloud.mdx
│   │   └── google_cloud_run.mdx
│   ├── examples/
│   │   ├── add_data_to_dataset.mdx
│   │   ├── beautifulsoup_crawler.mdx
│   │   ├── capture_screenshot_using_playwright.mdx
│   │   ├── capturing_page_snapshots_with_error_snapshotter.mdx
│   │   ├── code_examples/
│   │   │   ├── adaptive_playwright_crawler.py
│   │   │   ├── add_data_to_dataset_bs.py
│   │   │   ├── add_data_to_dataset_dataset.py
│   │   │   ├── add_data_to_dataset_pw.py
│   │   │   ├── beautifulsoup_crawler.py
│   │   │   ├── beautifulsoup_crawler_keep_alive.py
│   │   │   ├── beautifulsoup_crawler_stop.py
│   │   │   ├── capture_screenshot_using_playwright.py
│   │   │   ├── configure_json_logging.py
│   │   │   ├── crawl_all_links_on_website_bs.py
│   │   │   ├── crawl_all_links_on_website_pw.py
│   │   │   ├── crawl_multiple_urls_bs.py
│   │   │   ├── crawl_multiple_urls_pw.py
│   │   │   ├── crawl_specific_links_on_website_bs.py
│   │   │   ├── crawl_specific_links_on_website_pw.py
│   │   │   ├── crawl_website_with_relative_links_all_links.py
│   │   │   ├── crawl_website_with_relative_links_same_domain.py
│   │   │   ├── crawl_website_with_relative_links_same_hostname.py
│   │   │   ├── crawl_website_with_relative_links_same_origin.py
│   │   │   ├── export_entire_dataset_to_file_csv.py
│   │   │   ├── export_entire_dataset_to_file_json.py
│   │   │   ├── extract_and_add_specific_links_on_website_bs.py
│   │   │   ├── extract_and_add_specific_links_on_website_pw.py
│   │   │   ├── fill_and_submit_web_form_crawler.py
│   │   │   ├── fill_and_submit_web_form_request.py
│   │   │   ├── parsel_crawler.py
│   │   │   ├── parsel_crawler_with_error_snapshotter.py
│   │   │   ├── playwright_block_requests.py
│   │   │   ├── playwright_crawler.py
│   │   │   ├── playwright_crawler_with_camoufox.py
│   │   │   ├── playwright_crawler_with_error_snapshotter.py
│   │   │   ├── playwright_crawler_with_fingerprint_generator.py
│   │   │   ├── respect_robots_on_skipped_request.py
│   │   │   ├── respect_robots_txt_file.py
│   │   │   ├── resuming_paused_crawl.py
│   │   │   ├── run_parallel_crawlers.py
│   │   │   ├── using_browser_profiles_chrome.py
│   │   │   ├── using_browser_profiles_firefox.py
│   │   │   └── using_sitemap_request_loader.py
│   │   ├── crawl_all_links_on_website.mdx
│   │   ├── crawl_multiple_urls.mdx
│   │   ├── crawl_specific_links_on_website.mdx
│   │   ├── crawl_website_with_relative_links.mdx
│   │   ├── crawler_keep_alive.mdx
│   │   ├── crawler_stop.mdx
│   │   ├── export_entire_dataset_to_file.mdx
│   │   ├── fill_and_submit_web_form.mdx
│   │   ├── json_logging.mdx
│   │   ├── parsel_crawler.mdx
│   │   ├── playwright_crawler.mdx
│   │   ├── playwright_crawler_adaptive.mdx
│   │   ├── playwright_crawler_with_block_requests.mdx
│   │   ├── playwright_crawler_with_camoufox.mdx
│   │   ├── playwright_crawler_with_fingerprint_generator.mdx
│   │   ├── respect_robots_txt_file.mdx
│   │   ├── resuming_paused_crawl.mdx
│   │   ├── run_parallel_crawlers.mdx
│   │   ├── using_browser_profile.mdx
│   │   └── using_sitemap_request_loader.mdx
│   ├── guides/
│   │   ├── architecture_overview.mdx
│   │   ├── avoid_blocking.mdx
│   │   ├── code_examples/
│   │   │   ├── avoid_blocking/
│   │   │   │   ├── default_fingerprint_generator_with_args.py
│   │   │   │   └── playwright_with_fingerprint_generator.py
│   │   │   ├── creating_web_archive/
│   │   │   │   ├── manual_archiving_parsel_crawler.py
│   │   │   │   ├── manual_archiving_playwright_crawler.py
│   │   │   │   └── simple_pw_through_proxy_pywb_server.py
│   │   │   ├── error_handling/
│   │   │   │   ├── change_handle_error_status.py
│   │   │   │   ├── disable_retry.py
│   │   │   │   └── handle_proxy_error.py
│   │   │   ├── http_clients/
│   │   │   │   ├── parsel_curl_impersonate_example.py
│   │   │   │   ├── parsel_httpx_example.py
│   │   │   │   └── parsel_impit_example.py
│   │   │   ├── http_crawlers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── beautifulsoup_example.py
│   │   │   │   ├── custom_crawler_example.py
│   │   │   │   ├── http_example.py
│   │   │   │   ├── lexbor_parser.py
│   │   │   │   ├── lxml_parser.py
│   │   │   │   ├── lxml_saxonche_parser.py
│   │   │   │   ├── parsel_example.py
│   │   │   │   ├── pyquery_parser.py
│   │   │   │   ├── scrapling_parser.py
│   │   │   │   ├── selectolax_adaptive_run.py
│   │   │   │   ├── selectolax_context.py
│   │   │   │   ├── selectolax_crawler.py
│   │   │   │   ├── selectolax_crawler_run.py
│   │   │   │   └── selectolax_parser.py
│   │   │   ├── login_crawler/
│   │   │   │   ├── http_login.py
│   │   │   │   └── playwright_login.py
│   │   │   ├── playwright_crawler/
│   │   │   │   ├── browser_configuration_example.py
│   │   │   │   ├── browser_pool_page_hooks_example.py
│   │   │   │   ├── multiple_launch_example.py
│   │   │   │   ├── navigation_hooks_example.py
│   │   │   │   └── plugin_browser_configuration_example.py
│   │   │   ├── playwright_crawler_adaptive/
│   │   │   │   ├── handler.py
│   │   │   │   ├── init_beautifulsoup.py
│   │   │   │   ├── init_parsel.py
│   │   │   │   ├── init_prediction.py
│   │   │   │   └── pre_nav_hooks.py
│   │   │   ├── playwright_crawler_stagehand/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── browser_classes.py
│   │   │   │   ├── stagehand_run.py
│   │   │   │   └── support_classes.py
│   │   │   ├── proxy_management/
│   │   │   │   ├── inspecting_bs_example.py
│   │   │   │   ├── inspecting_pw_example.py
│   │   │   │   ├── integration_bs_example.py
│   │   │   │   ├── integration_pw_example.py
│   │   │   │   ├── quick_start_example.py
│   │   │   │   ├── session_bs_example.py
│   │   │   │   ├── session_pw_example.py
│   │   │   │   ├── tiers_bs_example.py
│   │   │   │   └── tiers_pw_example.py
│   │   │   ├── request_loaders/
│   │   │   │   ├── rl_basic_example.py
│   │   │   │   ├── rl_basic_example_with_persist.py
│   │   │   │   ├── rl_tandem_example.py
│   │   │   │   ├── rl_tandem_example_explicit.py
│   │   │   │   ├── sitemap_basic_example.py
│   │   │   │   ├── sitemap_example_with_persist.py
│   │   │   │   ├── sitemap_tandem_example.py
│   │   │   │   └── sitemap_tandem_example_explicit.py
│   │   │   ├── request_router/
│   │   │   │   ├── adaptive_crawler_handlers.py
│   │   │   │   ├── basic_request_handlers.py
│   │   │   │   ├── custom_router_default_only.py
│   │   │   │   ├── error_handler.py
│   │   │   │   ├── failed_request_handler.py
│   │   │   │   ├── http_pre_navigation.py
│   │   │   │   ├── playwright_pre_navigation.py
│   │   │   │   └── simple_default_handler.py
│   │   │   ├── running_in_web_server/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── crawler.py
│   │   │   │   └── server.py
│   │   │   ├── scaling_crawlers/
│   │   │   │   ├── max_tasks_per_minute_example.py
│   │   │   │   └── min_and_max_concurrency_example.py
│   │   │   ├── service_locator/
│   │   │   │   ├── service_conflicts.py
│   │   │   │   ├── service_crawler_configuration.py
│   │   │   │   ├── service_crawler_event_manager.py
│   │   │   │   ├── service_crawler_storage_client.py
│   │   │   │   ├── service_locator_configuration.py
│   │   │   │   ├── service_locator_event_manager.py
│   │   │   │   ├── service_locator_storage_client.py
│   │   │   │   ├── service_storage_configuration.py
│   │   │   │   └── service_storage_storage_client.py
│   │   │   ├── session_management/
│   │   │   │   ├── multi_sessions_http.py
│   │   │   │   ├── one_session_http.py
│   │   │   │   ├── sm_basic.py
│   │   │   │   ├── sm_beautifulsoup.py
│   │   │   │   ├── sm_http.py
│   │   │   │   ├── sm_parsel.py
│   │   │   │   ├── sm_playwright.py
│   │   │   │   └── sm_standalone.py
│   │   │   ├── storage_clients/
│   │   │   │   ├── custom_storage_client_example.py
│   │   │   │   ├── file_system_storage_client_basic_example.py
│   │   │   │   ├── file_system_storage_client_configuration_example.py
│   │   │   │   ├── memory_storage_client_basic_example.py
│   │   │   │   ├── redis_storage_client_basic_example.py
│   │   │   │   ├── redis_storage_client_configuration_example.py
│   │   │   │   ├── registering_storage_clients_example.py
│   │   │   │   ├── sql_storage_client_basic_example.py
│   │   │   │   └── sql_storage_client_configuration_example.py
│   │   │   ├── storages/
│   │   │   │   ├── cleaning_do_not_purge_example.py
│   │   │   │   ├── cleaning_purge_explicitly_example.py
│   │   │   │   ├── dataset_basic_example.py
│   │   │   │   ├── dataset_with_crawler_example.py
│   │   │   │   ├── dataset_with_crawler_explicit_example.py
│   │   │   │   ├── helper_add_requests_example.py
│   │   │   │   ├── helper_enqueue_links_example.py
│   │   │   │   ├── kvs_basic_example.py
│   │   │   │   ├── kvs_with_crawler_example.py
│   │   │   │   ├── kvs_with_crawler_explicit_example.py
│   │   │   │   ├── opening.py
│   │   │   │   ├── rq_basic_example.py
│   │   │   │   ├── rq_with_crawler_example.py
│   │   │   │   └── rq_with_crawler_explicit_example.py
│   │   │   └── trace_and_monitor_crawlers/
│   │   │       └── instrument_crawler.py
│   │   ├── crawler_login.mdx
│   │   ├── creating_web_archive.mdx
│   │   ├── error_handling.mdx
│   │   ├── http_clients.mdx
│   │   ├── http_crawlers.mdx
│   │   ├── playwright_crawler.mdx
│   │   ├── playwright_crawler_adaptive.mdx
│   │   ├── playwright_crawler_stagehand.mdx
│   │   ├── proxy_management.mdx
│   │   ├── request_loaders.mdx
│   │   ├── request_router.mdx
│   │   ├── running_in_web_server.mdx
│   │   ├── scaling_crawlers.mdx
│   │   ├── service_locator.mdx
│   │   ├── session_management.mdx
│   │   ├── storage_clients.mdx
│   │   ├── storages.mdx
│   │   └── trace_and_monitor_crawlers.mdx
│   ├── introduction/
│   │   ├── 01_setting_up.mdx
│   │   ├── 02_first_crawler.mdx
│   │   ├── 03_adding_more_urls.mdx
│   │   ├── 04_real_world_project.mdx
│   │   ├── 05_crawling.mdx
│   │   ├── 06_scraping.mdx
│   │   ├── 07_saving_data.mdx
│   │   ├── 08_refactoring.mdx
│   │   ├── 09_running_in_cloud.mdx
│   │   ├── code_examples/
│   │   │   ├── 02_bs.py
│   │   │   ├── 02_bs_better.py
│   │   │   ├── 02_request_queue.py
│   │   │   ├── 03_enqueue_strategy.py
│   │   │   ├── 03_finding_new_links.py
│   │   │   ├── 03_globs.py
│   │   │   ├── 03_original_code.py
│   │   │   ├── 03_transform_request.py
│   │   │   ├── 04_sanity_check.py
│   │   │   ├── 05_crawling_detail.py
│   │   │   ├── 05_crawling_listing.py
│   │   │   ├── 06_scraping.py
│   │   │   ├── 07_final_code.py
│   │   │   ├── 07_first_code.py
│   │   │   ├── 08_main.py
│   │   │   ├── 08_routes.py
│   │   │   ├── 09_apify_sdk.py
│   │   │   ├── __init__.py
│   │   │   └── routes.py
│   │   └── index.mdx
│   ├── pyproject.toml
│   ├── quick-start/
│   │   ├── code_examples/
│   │   │   ├── beautifulsoup_crawler_example.py
│   │   │   ├── parsel_crawler_example.py
│   │   │   ├── playwright_crawler_example.py
│   │   │   └── playwright_crawler_headful_example.py
│   │   └── index.mdx
│   └── upgrading/
│       ├── upgrading_to_v0x.md
│       └── upgrading_to_v1.md
├── pyproject.toml
├── renovate.json
├── src/
│   └── crawlee/
│       ├── __init__.py
│       ├── _autoscaling/
│       │   ├── __init__.py
│       │   ├── _types.py
│       │   ├── autoscaled_pool.py
│       │   ├── py.typed
│       │   ├── snapshotter.py
│       │   └── system_status.py
│       ├── _cli.py
│       ├── _consts.py
│       ├── _log_config.py
│       ├── _request.py
│       ├── _service_locator.py
│       ├── _types.py
│       ├── _utils/
│       │   ├── __init__.py
│       │   ├── blocked.py
│       │   ├── byte_size.py
│       │   ├── console.py
│       │   ├── context.py
│       │   ├── crypto.py
│       │   ├── docs.py
│       │   ├── file.py
│       │   ├── globs.py
│       │   ├── html_to_text.py
│       │   ├── models.py
│       │   ├── raise_if_too_many_kwargs.py
│       │   ├── recoverable_state.py
│       │   ├── recurring_task.py
│       │   ├── requests.py
│       │   ├── robots.py
│       │   ├── sitemap.py
│       │   ├── system.py
│       │   ├── time.py
│       │   ├── try_import.py
│       │   ├── urls.py
│       │   ├── wait.py
│       │   └── web.py
│       ├── browsers/
│       │   ├── __init__.py
│       │   ├── _browser_controller.py
│       │   ├── _browser_plugin.py
│       │   ├── _browser_pool.py
│       │   ├── _playwright_browser.py
│       │   ├── _playwright_browser_controller.py
│       │   ├── _playwright_browser_plugin.py
│       │   ├── _types.py
│       │   └── py.typed
│       ├── configuration.py
│       ├── crawlers/
│       │   ├── __init__.py
│       │   ├── _abstract_http/
│       │   │   ├── __init__.py
│       │   │   ├── _abstract_http_crawler.py
│       │   │   ├── _abstract_http_parser.py
│       │   │   ├── _http_crawling_context.py
│       │   │   └── py.typed
│       │   ├── _adaptive_playwright/
│       │   │   ├── __init__.py
│       │   │   ├── _adaptive_playwright_crawler.py
│       │   │   ├── _adaptive_playwright_crawler_statistics.py
│       │   │   ├── _adaptive_playwright_crawling_context.py
│       │   │   ├── _rendering_type_predictor.py
│       │   │   ├── _result_comparator.py
│       │   │   └── _utils.py
│       │   ├── _basic/
│       │   │   ├── __init__.py
│       │   │   ├── _basic_crawler.py
│       │   │   ├── _basic_crawling_context.py
│       │   │   ├── _context_pipeline.py
│       │   │   ├── _context_utils.py
│       │   │   ├── _logging_utils.py
│       │   │   └── py.typed
│       │   ├── _beautifulsoup/
│       │   │   ├── __init__.py
│       │   │   ├── _beautifulsoup_crawler.py
│       │   │   ├── _beautifulsoup_crawling_context.py
│       │   │   ├── _beautifulsoup_parser.py
│       │   │   ├── _utils.py
│       │   │   └── py.typed
│       │   ├── _http/
│       │   │   ├── __init__.py
│       │   │   ├── _http_crawler.py
│       │   │   └── _http_parser.py
│       │   ├── _parsel/
│       │   │   ├── __init__.py
│       │   │   ├── _parsel_crawler.py
│       │   │   ├── _parsel_crawling_context.py
│       │   │   ├── _parsel_parser.py
│       │   │   └── _utils.py
│       │   ├── _playwright/
│       │   │   ├── __init__.py
│       │   │   ├── _playwright_crawler.py
│       │   │   ├── _playwright_crawling_context.py
│       │   │   ├── _playwright_http_client.py
│       │   │   ├── _playwright_post_nav_crawling_context.py
│       │   │   ├── _playwright_pre_nav_crawling_context.py
│       │   │   ├── _types.py
│       │   │   └── _utils.py
│       │   ├── _types.py
│       │   └── py.typed
│       ├── errors.py
│       ├── events/
│       │   ├── __init__.py
│       │   ├── _event_manager.py
│       │   ├── _local_event_manager.py
│       │   ├── _types.py
│       │   └── py.typed
│       ├── fingerprint_suite/
│       │   ├── __init__.py
│       │   ├── _browserforge_adapter.py
│       │   ├── _consts.py
│       │   ├── _fingerprint_generator.py
│       │   ├── _header_generator.py
│       │   ├── _types.py
│       │   └── py.typed
│       ├── http_clients/
│       │   ├── __init__.py
│       │   ├── _base.py
│       │   ├── _curl_impersonate.py
│       │   ├── _httpx.py
│       │   └── _impit.py
│       ├── otel/
│       │   ├── __init__.py
│       │   └── crawler_instrumentor.py
│       ├── project_template/
│       │   ├── cookiecutter.json
│       │   ├── hooks/
│       │   │   ├── post_gen_project.py
│       │   │   └── pre_gen_project.py
│       │   ├── templates/
│       │   │   ├── main.py
│       │   │   ├── main_beautifulsoup.py
│       │   │   ├── main_parsel.py
│       │   │   ├── main_playwright.py
│       │   │   ├── main_playwright_camoufox.py
│       │   │   ├── main_playwright_chrome.py
│       │   │   ├── main_playwright_firefox.py
│       │   │   ├── main_playwright_webkit.py
│       │   │   ├── routes_beautifulsoup.py
│       │   │   ├── routes_parsel.py
│       │   │   └── routes_playwright.py
│       │   └── {{cookiecutter.project_name}}/
│       │       ├── .dockerignore
│       │       ├── Dockerfile
│       │       ├── README.md
│       │       ├── pyproject.toml
│       │       ├── requirements.txt
│       │       └── {{cookiecutter.__package_name}}/
│       │           ├── __init__.py
│       │           ├── __main__.py
│       │           ├── main.py
│       │           └── routes.py
│       ├── proxy_configuration.py
│       ├── py.typed
│       ├── request_loaders/
│       │   ├── __init__.py
│       │   ├── _request_list.py
│       │   ├── _request_loader.py
│       │   ├── _request_manager.py
│       │   ├── _request_manager_tandem.py
│       │   └── _sitemap_request_loader.py
│       ├── router.py
│       ├── sessions/
│       │   ├── __init__.py
│       │   ├── _cookies.py
│       │   ├── _models.py
│       │   ├── _session.py
│       │   ├── _session_pool.py
│       │   └── py.typed
│       ├── statistics/
│       │   ├── __init__.py
│       │   ├── _error_snapshotter.py
│       │   ├── _error_tracker.py
│       │   ├── _models.py
│       │   └── _statistics.py
│       ├── storage_clients/
│       │   ├── __init__.py
│       │   ├── _base/
│       │   │   ├── __init__.py
│       │   │   ├── _dataset_client.py
│       │   │   ├── _key_value_store_client.py
│       │   │   ├── _request_queue_client.py
│       │   │   ├── _storage_client.py
│       │   │   └── py.typed
│       │   ├── _file_system/
│       │   │   ├── __init__.py
│       │   │   ├── _dataset_client.py
│       │   │   ├── _key_value_store_client.py
│       │   │   ├── _request_queue_client.py
│       │   │   ├── _storage_client.py
│       │   │   ├── _utils.py
│       │   │   └── py.typed
│       │   ├── _memory/
│       │   │   ├── __init__.py
│       │   │   ├── _dataset_client.py
│       │   │   ├── _key_value_store_client.py
│       │   │   ├── _request_queue_client.py
│       │   │   ├── _storage_client.py
│       │   │   └── py.typed
│       │   ├── _redis/
│       │   │   ├── __init__.py
│       │   │   ├── _client_mixin.py
│       │   │   ├── _dataset_client.py
│       │   │   ├── _key_value_store_client.py
│       │   │   ├── _request_queue_client.py
│       │   │   ├── _storage_client.py
│       │   │   ├── _utils.py
│       │   │   ├── lua_scripts/
│       │   │   │   ├── atomic_bloom_add_requests.lua
│       │   │   │   ├── atomic_fetch_request.lua
│       │   │   │   ├── atomic_set_add_requests.lua
│       │   │   │   └── reclaim_stale_requests.lua
│       │   │   └── py.typed
│       │   ├── _sql/
│       │   │   ├── __init__.py
│       │   │   ├── _client_mixin.py
│       │   │   ├── _dataset_client.py
│       │   │   ├── _db_models.py
│       │   │   ├── _key_value_store_client.py
│       │   │   ├── _request_queue_client.py
│       │   │   ├── _storage_client.py
│       │   │   └── py.typed
│       │   ├── models.py
│       │   └── py.typed
│       └── storages/
│           ├── __init__.py
│           ├── _base.py
│           ├── _dataset.py
│           ├── _key_value_store.py
│           ├── _request_queue.py
│           ├── _storage_instance_manager.py
│           ├── _utils.py
│           └── py.typed
├── tests/
│   ├── __init__.py
│   ├── e2e/
│   │   ├── __init__.py
│   │   ├── conftest.py
│   │   └── project_template/
│   │       ├── test_static_crawlers_templates.py
│   │       └── utils.py
│   └── unit/
│       ├── README.md
│       ├── __init__.py
│       ├── _autoscaling/
│       │   ├── test_autoscaled_pool.py
│       │   ├── test_snapshotter.py
│       │   └── test_system_status.py
│       ├── _statistics/
│       │   ├── test_error_tracker.py
│       │   ├── test_periodic_logging.py
│       │   ├── test_persistence.py
│       │   ├── test_request_max_duration.py
│       │   └── test_request_processing_record.py
│       ├── _utils/
│       │   ├── test_byte_size.py
│       │   ├── test_console.py
│       │   ├── test_crypto.py
│       │   ├── test_file.py
│       │   ├── test_globs.py
│       │   ├── test_html_to_text.py
│       │   ├── test_measure_time.py
│       │   ├── test_raise_if_too_many_kwargs.py
│       │   ├── test_recurring_task.py
│       │   ├── test_requests.py
│       │   ├── test_robots.py
│       │   ├── test_shared_timeout.py
│       │   ├── test_sitemap.py
│       │   ├── test_system.py
│       │   ├── test_timedelta_ms.py
│       │   └── test_urls.py
│       ├── browsers/
│       │   ├── test_browser_pool.py
│       │   ├── test_playwright_browser.py
│       │   ├── test_playwright_browser_controller.py
│       │   └── test_playwright_browser_plugin.py
│       ├── conftest.py
│       ├── crawlers/
│       │   ├── _adaptive_playwright/
│       │   │   ├── test_adaptive_playwright_crawler.py
│       │   │   ├── test_adaptive_playwright_crawler_statistics.py
│       │   │   ├── test_adaptive_playwright_crawling_context.py
│       │   │   └── test_predictor.py
│       │   ├── _basic/
│       │   │   ├── test_basic_crawler.py
│       │   │   └── test_context_pipeline.py
│       │   ├── _beautifulsoup/
│       │   │   └── test_beautifulsoup_crawler.py
│       │   ├── _http/
│       │   │   └── test_http_crawler.py
│       │   ├── _parsel/
│       │   │   └── test_parsel_crawler.py
│       │   └── _playwright/
│       │       ├── test_playwright_crawler.py
│       │       └── test_utils.py
│       ├── events/
│       │   ├── test_event_manager.py
│       │   └── test_local_event_manager.py
│       ├── fingerprint_suite/
│       │   ├── test_adapters.py
│       │   └── test_header_generator.py
│       ├── http_clients/
│       │   ├── test_http_clients.py
│       │   └── test_httpx.py
│       ├── otel/
│       │   └── test_crawler_instrumentor.py
│       ├── proxy_configuration/
│       │   ├── test_new_proxy_info.py
│       │   └── test_tiers.py
│       ├── request_loaders/
│       │   ├── test_request_list.py
│       │   └── test_sitemap_request_loader.py
│       ├── server.py
│       ├── server_endpoints.py
│       ├── server_static/
│       │   └── test.js
│       ├── sessions/
│       │   ├── test_cookies.py
│       │   ├── test_models.py
│       │   ├── test_session.py
│       │   └── test_session_pool.py
│       ├── storage_clients/
│       │   ├── _file_system/
│       │   │   ├── test_fs_dataset_client.py
│       │   │   ├── test_fs_kvs_client.py
│       │   │   └── test_fs_rq_client.py
│       │   ├── _memory/
│       │   │   ├── test_memory_dataset_client.py
│       │   │   ├── test_memory_kvs_client.py
│       │   │   └── test_memory_rq_client.py
│       │   ├── _redis/
│       │   │   ├── test_redis_dataset_client.py
│       │   │   ├── test_redis_kvs_client.py
│       │   │   └── test_redis_rq_client.py
│       │   └── _sql/
│       │       ├── test_sql_dataset_client.py
│       │       ├── test_sql_kvs_client.py
│       │       └── test_sql_rq_client.py
│       ├── storages/
│       │   ├── conftest.py
│       │   ├── test_dataset.py
│       │   ├── test_key_value_store.py
│       │   ├── test_request_manager_tandem.py
│       │   ├── test_request_queue.py
│       │   └── test_storage_instance_manager.py
│       ├── test_cli.py
│       ├── test_configuration.py
│       ├── test_log_config.py
│       ├── test_router.py
│       ├── test_service_locator.py
│       └── utils.py
├── typos.toml
└── website/
    ├── .eslintrc.json
    ├── .yarnrc.yml
    ├── babel.config.js
    ├── build_api_reference.sh
    ├── docusaurus.config.js
    ├── generate_module_shortcuts.py
    ├── package.json
    ├── patches/
    │   ├── @docusaurus+core+3.4.0.patch
    │   └── @docusaurus+core+3.5.2.patch
    ├── roa-loader/
    │   ├── index.js
    │   └── package.json
    ├── sidebars.js
    ├── src/
    │   ├── components/
    │   │   ├── ApiLink.jsx
    │   │   ├── Button.jsx
    │   │   ├── Button.module.css
    │   │   ├── CopyButton.jsx
    │   │   ├── CopyButton.module.css
    │   │   ├── Gradients.jsx
    │   │   ├── Highlights.jsx
    │   │   ├── Highlights.module.css
    │   │   ├── Homepage/
    │   │   │   ├── HomepageCliExample.jsx
    │   │   │   ├── HomepageCliExample.module.css
    │   │   │   ├── HomepageCtaSection.jsx
    │   │   │   ├── HomepageCtaSection.module.css
    │   │   │   ├── HomepageHeroSection.jsx
    │   │   │   ├── HomepageHeroSection.module.css
    │   │   │   ├── LanguageInfoWidget.jsx
    │   │   │   ├── LanguageInfoWidget.module.css
    │   │   │   ├── LanguageSwitch.jsx
    │   │   │   ├── LanguageSwitch.module.css
    │   │   │   ├── RiverSection.jsx
    │   │   │   ├── RiverSection.module.css
    │   │   │   ├── ThreeCardsWithIcon.jsx
    │   │   │   └── ThreeCardsWithIcon.module.css
    │   │   ├── LLMButtons.jsx
    │   │   ├── LLMButtons.module.css
    │   │   ├── RunnableCodeBlock.jsx
    │   │   └── RunnableCodeBlock.module.css
    │   ├── css/
    │   │   └── custom.css
    │   ├── pages/
    │   │   ├── home_page_example.py
    │   │   ├── index.js
    │   │   └── index.module.css
    │   ├── plugins/
    │   │   └── docusaurus-plugin-segment/
    │   │       ├── index.js
    │   │       └── segment.js
    │   └── theme/
    │       ├── ColorModeToggle/
    │       │   ├── index.js
    │       │   └── styles.module.css
    │       ├── DocItem/
    │       │   ├── Content/
    │       │   │   ├── index.js
    │       │   │   └── styles.module.css
    │       │   └── Layout/
    │       │       ├── index.js
    │       │       └── styles.module.css
    │       ├── Footer/
    │       │   ├── LinkItem/
    │       │   │   ├── index.js
    │       │   │   └── index.module.css
    │       │   ├── index.js
    │       │   └── index.module.css
    │       ├── MDXComponents/
    │       │   └── A.js
    │       ├── Navbar/
    │       │   ├── Content/
    │       │   │   ├── index.js
    │       │   │   └── styles.module.css
    │       │   ├── Logo/
    │       │   │   ├── index.js
    │       │   │   └── index.module.css
    │       │   └── MobileSidebar/
    │       │       ├── Header/
    │       │       │   ├── index.js
    │       │       │   └── index.module.css
    │       │       ├── Layout/
    │       │       │   └── index.js
    │       │       ├── PrimaryMenu/
    │       │       │   └── index.js
    │       │       └── index.js
    │       └── NavbarItem/
    │           └── ComponentTypes.js
    ├── static/
    │   ├── .nojekyll
    │   ├── js/
    │   │   └── custom.js
    │   └── robots.txt
    ├── tools/
    │   ├── docs-prettier.config.js
    │   ├── utils/
    │   │   └── externalLink.js
    │   └── website_gif/
    │       └── website_gif.mjs
    └── tsconfig.eslint.json

================================================
FILE CONTENTS
================================================

================================================
FILE: .editorconfig
================================================
# Top-most EditorConfig file; editors stop searching parent directories here.
root = true

# Defaults applied to every file.
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

# Makefiles require tab indentation.
[Makefile]
indent_style = tab

# YAML files use 2-space indentation.
# Keep the brace list free of whitespace: a space after the comma becomes part of the pattern.
[*.{yaml,yml}]
indent_size = 2


================================================
FILE: .github/CODEOWNERS
================================================
# Documentation codeowner
# `**` is needed so the rules also cover files in nested directories
# (e.g. docs/guides/*.mdx); a single `*` does not cross directory boundaries.

/docs/**/*.md @TC-MO
/docs/**/*.mdx @TC-MO


================================================
FILE: .github/pull_request_template.md
================================================
### Description

<!-- The purpose of the PR, list of the changes, ... -->

- TODO

### Issues

<!-- If applicable, reference any related GitHub issues -->

- Closes: #TODO

### Testing

<!-- Describe the testing process for these changes -->

- TODO

### Checklist

- [ ] CI passed


================================================
FILE: .github/workflows/_check_code.yaml
================================================
# Code quality checks: workflow lint, spell check, Python lint, and Python type check.
name: Code checks

on:
  # Runs when manually triggered from the GitHub UI.
  workflow_dispatch:

  # Runs when invoked by another workflow.
  workflow_call:

# Default token permissions for all jobs: read-only access to repository contents.
permissions:
  contents: read

jobs:
  # Runs actionlint against the checked-out repository.
  actions_lint_check:
    name: Actions lint check
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
      - name: Run actionlint
        uses: rhysd/actionlint@v1.7.11

  # Runs the `typos` spell checker against the checked-out repository.
  spell_check:
    name: Spell check
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
      - name: Check spelling with typos
        uses: crate-ci/typos@v1

  # Delegates to the shared reusable lint workflow in apify/workflows for each listed Python version.
  lint_check:
    name: Lint check
    uses: apify/workflows/.github/workflows/python_lint_check.yaml@main
    with:
      python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]'

  # Delegates to the shared reusable type-check workflow in apify/workflows for each listed Python version.
  type_check:
    name: Type check
    uses: apify/workflows/.github/workflows/python_type_check.yaml@main
    with:
      python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]'


================================================
FILE: .github/workflows/_check_docs.yaml
================================================
# Documentation checks; the actual work is delegated to a shared reusable workflow.
name: Doc checks

on:
  # Runs when manually triggered from the GitHub UI.
  workflow_dispatch:

  # Runs when invoked by another workflow.
  workflow_call:

# Default token permissions for all jobs: read-only access to repository contents.
permissions:
  contents: read

jobs:
  # Calls the shared docs-check workflow from apify/workflows (pinned to its `main` branch).
  doc_checks:
    name: Doc checks
    uses: apify/workflows/.github/workflows/python_docs_check.yaml@main


================================================
FILE: .github/workflows/_release_docs.yaml
================================================
# Builds the Docusaurus documentation site and publishes it to GitHub Pages,
# then triggers a CloudFront cache invalidation workflow in another repository.
name: Doc release

on:
  # Runs when manually triggered from the GitHub UI.
  workflow_dispatch:

  # Runs when invoked by another workflow.
  workflow_call:
    inputs:
      # Git ref to check out and build the docs from.
      ref:
        required: true
        type: string

# Default token permissions; the release job widens them below.
permissions:
  contents: read

env:
  NODE_VERSION: 22
  PYTHON_VERSION: 3.14
  # In a called (reusable) workflow the `github` context belongs to the caller, so
  # `github.event_name` is never 'workflow_call'. Use the input directly when it is set;
  # on manual dispatch `inputs.ref` is empty and the triggering ref is used instead.
  CHECKOUT_REF: ${{ inputs.ref || github.ref }}

jobs:
  release_docs:
    name: Doc release
    environment:
      name: github-pages
    # Job-level permissions for the GitHub Pages upload and deployment steps below.
    permissions:
      contents: write
      pages: write
      id-token: write
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          token: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}
          ref: ${{ env.CHECKOUT_REF }}

      - name: Set up Node
        uses: actions/setup-node@v6
        with:
          node-version: ${{ env.NODE_VERSION }}

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Set up uv package manager
        uses: astral-sh/setup-uv@v7
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Python dependencies
        run: uv run poe install-dev

      - name: Build Docusaurus docs
        run: uv run poe build-docs
        env:
          APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }}
          SEGMENT_TOKEN: ${{ secrets.SEGMENT_TOKEN }}

      - name: Set up GitHub Pages
        uses: actions/configure-pages@v5

      - name: Upload GitHub Pages artifact
        uses: actions/upload-pages-artifact@v4
        with:
          path: ./website/build

      - name: Deploy artifact to GitHub Pages
        uses: actions/deploy-pages@v4

      # Triggers a workflow in a different repository, hence the service-account token.
      - name: Invalidate CloudFront cache
        run: |
          gh workflow run invalidate-cloudfront.yml \
            --repo apify/apify-docs-private \
            --field deployment=crawlee-web
          echo "✅ CloudFront cache invalidation workflow triggered successfully"
        env:
          GITHUB_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}


================================================
FILE: .github/workflows/_tests.yaml
================================================
name: Tests

on:
  # Runs when manually triggered from the GitHub UI.
  workflow_dispatch:

  # Runs when invoked by another workflow.
  workflow_call:

permissions:
  contents: read

jobs:
  unit_tests:
    name: Unit tests
    # Unit tests are delegated to the reusable workflow in the shared apify/workflows repository.
    uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main
    with:
      # Test matrix, passed as JSON-encoded arrays.
      operating_systems: '["ubuntu-latest", "windows-latest", "macos-latest"]'
      python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]'
      # The single OS / Python combination named for Codecov.
      operating_system_for_codecov: ubuntu-latest
      python_version_for_codecov: '3.14'
      tests_concurrency: '8'
    # Forward all of the caller's secrets to the shared workflow.
    secrets: inherit


================================================
FILE: .github/workflows/manual_release_stable.yaml
================================================
name: Stable release

on:
  # Runs when manually triggered from the GitHub UI, with options to specify the type of release.
  workflow_dispatch:
    inputs:
      release_type:
        description: Release type
        required: true
        type: choice
        default: auto
        options:
          - auto
          - custom
          - patch
          - minor
          - major
      custom_version:
        description: The custom version to bump to (only for "custom" type)
        required: false
        type: string
        default: ""

concurrency:
  group: release
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  # Entry gate of the release: `release_prepare` needs this job, and the remaining jobs are chained behind it.
  code_checks:
    name: Code checks
    uses: ./.github/workflows/_check_code.yaml

  release_prepare:
    name: Release prepare
    runs-on: ubuntu-latest
    needs:
      - code_checks
    # Re-export the step outputs so that the downstream release jobs can read them.
    outputs:
      version_number: ${{ steps.release_prepare.outputs.version_number }}
      tag_name: ${{ steps.release_prepare.outputs.tag_name }}
      changelog: ${{ steps.release_prepare.outputs.changelog }}
      release_notes: ${{ steps.release_prepare.outputs.release_notes }}
    steps:
      - name: Release prepare
        id: release_prepare
        uses: apify/workflows/git-cliff-release@main
        with:
          existing_changelog_path: CHANGELOG.md
          # Both values come straight from the manual dispatch form.
          release_type: ${{ inputs.release_type }}
          custom_version: ${{ inputs.custom_version }}

  # Calls the shared bump-and-changelog workflow with the version and changelog computed by `release_prepare`.
  # Later jobs read its `changelog_commitish` output to know which commit to release from.
  changelog_update:
    name: Changelog update
    needs: [release_prepare]
    # Write access to repository contents is granted to the called workflow
    # (presumably so it can commit the version bump and changelog; the shared workflow is not visible here).
    permissions:
      contents: write
    uses: apify/workflows/.github/workflows/python_bump_and_update_changelog.yaml@main
    with:
      version_number: ${{ needs.release_prepare.outputs.version_number }}
      changelog: ${{ needs.release_prepare.outputs.changelog }}
    secrets: inherit

  github_release:
    name: GitHub release
    runs-on: ubuntu-latest
    needs:
      - release_prepare
      - changelog_update
    permissions:
      contents: write
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      # Creates the release for the prepared tag, targeting the commit reported by the changelog update.
      - name: GitHub release
        uses: softprops/action-gh-release@v2
        with:
          name: ${{ needs.release_prepare.outputs.version_number }}
          tag_name: ${{ needs.release_prepare.outputs.tag_name }}
          target_commitish: ${{ needs.changelog_update.outputs.changelog_commitish }}
          body: ${{ needs.release_prepare.outputs.release_notes }}

  # Builds the distribution from the changelog-update commit and publishes it to PyPI.
  pypi_publish:
    name: PyPI publish
    needs: [release_prepare, changelog_update]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      id-token: write # Required for OIDC authentication.
    environment:
      name: pypi
      url: https://pypi.org/project/crawlee
    steps:
      - name: Prepare distribution
        uses: apify/workflows/prepare-pypi-distribution@main
        with:
          package_name: crawlee
          # Empty for the stable release; the prerelease publish in on_master.yaml passes "yes" instead.
          is_prerelease: ""
          version_number: ${{ needs.release_prepare.outputs.version_number }}
          ref: ${{ needs.changelog_update.outputs.changelog_commitish }}
      # Publishes the package to PyPI using the official PyPA GitHub action with OIDC authentication.
      - name: Publish package to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

      # TODO: add a job for publishing the package to Conda.
      # https://github.com/apify/crawlee-python/issues/104

  doc_release:
    name: Doc release
    needs:
      - changelog_update
      - pypi_publish
    uses: ./.github/workflows/_release_docs.yaml
    with:
      # Build the docs from the changelog-update commit so the refreshed changelog is part of the site.
      ref: ${{ needs.changelog_update.outputs.changelog_commitish }}
    secrets: inherit
    permissions:
      id-token: write
      pages: write
      contents: write


================================================
FILE: .github/workflows/on_issue.yaml
================================================
name: CI (issue)

on:
  # Runs when a new issue is opened.
  issues:
    types:
      - opened

permissions:
  contents: read

jobs:
  label_issues:
    name: Add labels
    runs-on: ubuntu-latest
    # Adding a label needs write access to issues; the workflow-level default only grants `contents: read`.
    permissions:
      issues: write

    steps:
      # Add the "t-tooling" label to all new issues.
      - uses: actions/github-script@v8
        with:
          script: |
            // Await the request so the script only completes once the label has been applied and a
            // failed API call rejects the script itself instead of surfacing as a detached rejection.
            await github.rest.issues.addLabels({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              labels: ["t-tooling"]
            })


================================================
FILE: .github/workflows/on_master.yaml
================================================
name: CI (master)

on:
  push:
    branches:
      - master
    tags-ignore:
      - "**" # Ignore all tags to avoid duplicate executions triggered by tag pushes.

concurrency:
  group: release
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  # Runs for every push to master; `doc_release` below waits for it.
  doc_checks:
    name: Doc checks
    uses: ./.github/workflows/_check_docs.yaml

  doc_release:
    name: Doc release
    # Only commits whose message starts with "docs" publish the docs from this job.
    if: ${{ startsWith(github.event.head_commit.message, 'docs') }}
    needs:
      - doc_checks
    uses: ./.github/workflows/_release_docs.yaml
    with:
      # Build from the ref that triggered this workflow.
      ref: ${{ github.ref }}
    secrets: inherit
    permissions:
      id-token: write
      pages: write
      contents: write

  # Runs for every push to master; `release_prepare` below needs it.
  code_checks:
    name: Code checks
    uses: ./.github/workflows/_check_code.yaml

  tests:
    name: Tests
    # Commits whose message starts with "docs" do not run the test suite.
    if: ${{ !startsWith(github.event.head_commit.message, 'docs') }}
    secrets: inherit
    uses: ./.github/workflows/_tests.yaml

  # Computes the prerelease version, tag and changelog that the jobs below publish.
  release_prepare:
    # Run this only for "feat", "fix", "perf", "refactor" and "style" commits.
    # The check is a plain prefix match on the head commit message.
    if: >-
      startsWith(github.event.head_commit.message, 'feat') ||
      startsWith(github.event.head_commit.message, 'fix') ||
      startsWith(github.event.head_commit.message, 'perf') ||
      startsWith(github.event.head_commit.message, 'refactor') ||
      startsWith(github.event.head_commit.message, 'style')
    name: Release prepare
    needs: [code_checks, tests]
    runs-on: ubuntu-latest
    # Step outputs re-exported for `changelog_update` and `pypi_publish`.
    outputs:
      version_number: ${{ steps.release_prepare.outputs.version_number }}
      tag_name: ${{ steps.release_prepare.outputs.tag_name }}
      changelog: ${{ steps.release_prepare.outputs.changelog }}
    steps:
      - uses: apify/workflows/git-cliff-release@main
        id: release_prepare
        name: Release prepare
        with:
          # Pushes to master always request a prerelease; stable releases go through the manual workflow.
          release_type: prerelease
          existing_changelog_path: CHANGELOG.md

  changelog_update:
    name: Changelog update
    needs:
      - release_prepare
    # Hands the prepared version and changelog to the shared bump-and-changelog workflow.
    uses: apify/workflows/.github/workflows/python_bump_and_update_changelog.yaml@main
    secrets: inherit
    with:
      changelog: ${{ needs.release_prepare.outputs.changelog }}
      version_number: ${{ needs.release_prepare.outputs.version_number }}
    permissions:
      contents: write

  # Builds the prerelease distribution from the changelog-update commit and publishes it to PyPI.
  pypi_publish:
    name: PyPI publish
    needs: [release_prepare, changelog_update]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      id-token: write # Required for OIDC authentication.
    environment:
      name: pypi
      url: https://pypi.org/project/crawlee
    steps:
      - name: Prepare distribution
        uses: apify/workflows/prepare-pypi-distribution@main
        with:
          package_name: crawlee
          # "yes" here; the stable release in manual_release_stable.yaml passes an empty string instead.
          is_prerelease: "yes"
          version_number: ${{ needs.release_prepare.outputs.version_number }}
          ref: ${{ needs.changelog_update.outputs.changelog_commitish }}

      # No credentials are passed to the publish action; the job relies on the `id-token` permission above.
      - name: Publish package to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

  # Second docs release path in this workflow: runs after a package has been published, whereas
  # `doc_release` above handles commits whose message starts with "docs".
  doc_release_post_publish:
    name: Doc release post publish
    needs: [changelog_update, pypi_publish]
    permissions:
      contents: write
      pages: write
      id-token: write
    uses: ./.github/workflows/_release_docs.yaml
    with:
      # Use the ref from the changelog update to include the updated changelog.
      ref: ${{ needs.changelog_update.outputs.changelog_commitish }}
    secrets: inherit


================================================
FILE: .github/workflows/on_pull_request.yaml
================================================
name: CI (PR)

on:
  # Runs whenever a pull request is opened or updated.
  pull_request:

permissions:
  contents: read
  pull-requests: read

jobs:
  pr_title_check:
    name: PR title check
    runs-on: ubuntu-latest
    steps:
      # Validates the pull request title with amannn/action-semantic-pull-request.
      - uses: amannn/action-semantic-pull-request@v6.1.1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  # The remaining jobs call this repository's reusable workflows; none of them declares `needs`.
  doc_checks:
    name: Doc checks
    uses: ./.github/workflows/_check_docs.yaml

  code_checks:
    name: Code checks
    uses: ./.github/workflows/_check_code.yaml

  tests:
    name: Tests
    uses: ./.github/workflows/_tests.yaml
    # Only the tests workflow receives the repository secrets.
    secrets: inherit


================================================
FILE: .github/workflows/on_schedule_tests.yaml
================================================
name: Scheduled tests

on:
  # Runs when manually triggered from the GitHub UI.
  workflow_dispatch:

  # Runs on a daily schedule at 06:00 UTC.
  schedule:
    - cron: '0 6 * * *'

concurrency:
  group: scheduled-tests
  cancel-in-progress: false

permissions:
  contents: read

env:
  NODE_VERSION: 22
  # Quoted so the version stays a string; an unquoted value such as 3.10 would be parsed as the number 3.1.
  PYTHON_VERSION: "3.14"
  TESTS_CONCURRENCY: 1

jobs:
  # Runs the project-template end-to-end tests once per crawler type / HTTP client / package manager combination.
  end_to_end_tests:
    name: End-to-end tests
    strategy:
      # A failing combination does not cancel the others.
      fail-fast: false
      # The matrix below expands to 7 x 2 x 3 = 42 jobs, of which at most 12 run at the same time.
      max-parallel: 12
      matrix:
        crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup"]
        http-client: ["httpx", "curl_impersonate"]
        package-manager: ["pip", "uv", "poetry"]

    runs-on: "ubuntu-latest"

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Setup node
        uses: actions/setup-node@v6
        with:
          node-version: ${{ env.NODE_VERSION }}

      # Installs the Apify CLI globally through npm.
      - name: Install dependencies
        run: npm install -g apify-cli

      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v6
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      # Poetry is installed so that crawlee in the poetry.lock of poetry-based templates can be patched
      # with a custom wheel file.
      - name: Install poetry
        run: pipx install poetry

      - name: Set up uv package manager
        uses: astral-sh/setup-uv@v7
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      # Sync the project, but no need to install the browsers into the test runner environment.
      - name: Install Python dependencies
        run: uv run poe install-sync

      # The three matrix values are joined with "and" into a single `-m` expression for the poe task
      # (presumably forwarded to pytest as a marker filter; the task definition is not visible here).
      - name: Run templates end-to-end tests
        run: uv run poe e2e-templates-tests -m "${{ matrix.http-client }} and ${{ matrix.crawler-type }} and ${{ matrix.package-manager }}"
        env:
          APIFY_TEST_USER_API_TOKEN: ${{ secrets.APIFY_TEST_USER_API_TOKEN }}


================================================
FILE: .gitignore
================================================
# AI assistant files
.agent
.agents
.ai
.aider
.claude
.codeium
.continue
.copilot
.cursor
.gemini
.llm
.llms
.openai
.serena
.windsurf
.zed-ai
AGENTS.local.md
CLAUDE.local.md
GEMINI.local.md

# Cache
__pycache__
.pytest_cache
.ruff_cache
.ty_cache
.uv-cache

# Virtual envs
.direnv
.env
.envrc
.python-version
.venv

# Other Python tools
.ropeproject

# Mise
mise.toml
.mise.toml

# Egg and build artifacts
*.egg-info/
*.egg
dist/
build/

# Coverage reports
.coverage*
htmlcov
coverage-unit.xml
coverage-integration.xml

# IDE, editors
*~
.DS_Store
.idea
.nvim.lua
.vscode
.zed
Session.vim

# Docs
docs/changelog.md

# Website build artifacts, node dependencies
website/build
website/node_modules
website/.yarn
website/.docusaurus
website/api-typedoc-generated.json
website/apify-shared-docspec-dump.jsonl
website/docspec-dump.jsonl
website/module_shortcuts.json
website/typedoc-types*
# npm lockfile (we use yarn)
website/package-lock.json

# Default directory for memory storage
storage/

# Tmp dir
tmp/


================================================
FILE: .markdownlint.yaml
================================================
# Enable every markdownlint rule by default; the entries below adjust individual rules.
default: true
# MD013: maximum line length.
line-length:
  line_length: 120
# MD007: indentation of nested unordered list items.
MD007:
  indent: 4
# MD004: unordered lists use `-` as the bullet marker.
MD004:
  style: dash
# MD033: inline HTML is allowed.
no-inline-html: false


================================================
FILE: .pre-commit-config.yaml
================================================
repos:
  # Both hooks are defined locally and run the project's own poe tasks through uv.
  - repo: local
    hooks:
      - id: lint-check
        name: Lint check
        entry: uv run poe lint
        # `system`: the command runs as-is, without a pre-commit managed environment.
        language: system
        # The staged file names are not appended to the command.
        pass_filenames: false

      - id: type-check
        name: Type check
        entry: uv run poe type-check
        language: system
        pass_filenames: false


================================================
FILE: .rules.md
================================================
# Coding guidelines

This file provides guidance to programming agents when working with code in this repository.

## Development Commands

All commands use `uv` (package manager) and `poe` (task runner):

```bash
# Install all dependencies (dev + extras + pre-commit + playwright)
uv run poe install-dev

# Run full check suite (lint + type-check + unit tests)
uv run poe check-code

# Linting (ruff format check + ruff check)
uv run poe lint

# Auto-fix formatting
uv run poe format

# Type checking (ty)
uv run poe type-check

# Run all unit tests
uv run poe unit-tests

# Run a single test file
uv run pytest tests/unit/path/to/test_file.py

# Run a single test by name
uv run pytest tests/unit/path/to/test_file.py::test_name -v

# Run tests with coverage XML report
uv run poe unit-tests-cov

# Build package
uv run poe build

# Clean build artifacts
uv run poe clean
```

Note: `uv run poe unit-tests` first runs tests marked `@pytest.mark.run_alone` in isolation, then runs the rest with `-x` (fail-fast) and parallelism via `pytest-xdist`.

## Code Style

- **Linter/formatter**: Ruff with `select = ["ALL"]` and specific ignores
- **Line length**: 120 characters
- **Quotes**: Single quotes (double for docstrings)
- **Docstrings**: Google format (enforced by Ruff)
- **Type checker**: ty (Astral's type checker), target Python 3.10
- **Async mode**: pytest-asyncio in `auto` mode (no need for `@pytest.mark.asyncio`)
- **Commit format**: Conventional Commits (`feat:`, `fix:`, `docs:`, `refactor:`, `test:`, etc.)

## Architecture

### Crawler Hierarchy

```text
BasicCrawler[TCrawlingContext, TStatisticsState]
├── AbstractHttpCrawler  →  HttpCrawler, BeautifulSoupCrawler, ParselCrawler
├── PlaywrightCrawler
└── AdaptivePlaywrightCrawler (extends PlaywrightCrawler)
```

- **BasicCrawler** (`src/crawlee/crawlers/_basic/`): Core request lifecycle, autoscaling pool, retries, session management, router dispatch. Generic over `TCrawlingContext`.
- **AbstractHttpCrawler** (`src/crawlee/crawlers/_abstract_http/`): Adds HTTP client integration, response parsing, pre-navigation hooks. Generic over parser result type.
- **PlaywrightCrawler** (`src/crawlee/crawlers/_playwright/`): Browser-based crawling with Playwright.

### Context Pipeline (Middleware Pattern)

Contexts are progressively enhanced through `ContextPipeline` middleware:

```text
BasicCrawlingContext → HttpCrawlingContext → ParsedHttpCrawlingContext → BeautifulSoupCrawlingContext
```

Each middleware is an async generator that wraps the next handler, enabling setup/teardown around request processing.

### Storage Layer

Three-tier design:

- **High-level**: `Dataset`, `KeyValueStore`, `RequestQueue` in `src/crawlee/storages/`
- **Storage clients** (`src/crawlee/storage_clients/`): `FileSystemStorageClient` (default), `MemoryStorageClient`, `SqlStorageClient`, `RedisStorageClient`
- **Instance caching**: `StorageInstanceManager` is a global singleton that caches storage instances by ID/name

### Service Locator

`src/crawlee/_service_locator.py` is a global singleton managing `Configuration`, `EventManager`, `StorageClient`, and `StorageInstanceManager`. Prevents double-initialization with `ServiceConflictError`.

### HTTP Clients

Pluggable via `HttpClient` interface in `src/crawlee/http_clients/`:

- `ImpitHttpClient` (default), `HttpxHttpClient`, `CurlImpersonateHttpClient`
- Each provides `crawl()` (for crawler pipeline) and `send_request()` (for in-handler use)

### Request Model

`Request` (`src/crawlee/_request.py`) uses `unique_key` for deduplication. Lifecycle states: `UNPROCESSED → DONE`. Crawlee-specific metadata stored in `user_data['__crawlee']`.

### Router

```python
@crawler.router.default_handler
async def handler(context: BeautifulSoupCrawlingContext): ...

@crawler.router.handler(label='detail')
async def detail(context: BeautifulSoupCrawlingContext): ...
```

Requests are routed by their `label` field; unmatched requests go to the default handler.

### Key Directories

- `src/crawlee/crawlers/` - All crawler implementations
- `src/crawlee/storages/` - Dataset, KVS, RequestQueue
- `src/crawlee/storage_clients/` - Backend implementations
- `src/crawlee/http_clients/` - HTTP client implementations
- `src/crawlee/browsers/` - Playwright browser pool and plugins
- `src/crawlee/sessions/` - Session management with cookie persistence
- `src/crawlee/events/` - Event system (persist state, progress, aborting)
- `src/crawlee/_autoscaling/` - Autoscaled pool for concurrency control
- `src/crawlee/fingerprint_suite/` - Anti-bot fingerprint generation
- `src/crawlee/project_template/` - CLI scaffolding template (excluded from linting)
- `tests/unit/` - Unit tests
- `tests/e2e/` - End-to-end tests (require `apify-cli` + API token)


================================================
FILE: CHANGELOG.md
================================================
# Changelog

All notable changes to this project will be documented in this file.

## [1.6.0](https://github.com/apify/crawlee-python/releases/tag/v1.6.0) (2026-03-20)

### 🚀 Features

- Allow extracting and enqueuing non-href links ([#1781](https://github.com/apify/crawlee-python/pull/1781)) ([6db365d](https://github.com/apify/crawlee-python/commit/6db365d1625206d8d691256c9cd4b44a821238bb)) by [@kozlice](https://github.com/kozlice)
- Add `post_navigation_hooks` to crawlers ([#1795](https://github.com/apify/crawlee-python/pull/1795)) ([38ceda6](https://github.com/apify/crawlee-python/commit/38ceda635a18cb2f14efc7c8e8b67f3adb7e53fd)) by [@Mantisus](https://github.com/Mantisus)
- Add page lifecycle hooks to `BrowserPool` ([#1791](https://github.com/apify/crawlee-python/pull/1791)) ([6f2ac13](https://github.com/apify/crawlee-python/commit/6f2ac13fea4cfa8a65e6e41430d3e8d28cc3a787)) by [@Mantisus](https://github.com/Mantisus)
- Expose `BrowserType` and `CrawleePage` ([#1798](https://github.com/apify/crawlee-python/pull/1798)) ([b50b9f2](https://github.com/apify/crawlee-python/commit/b50b9f2a8396dcee2bd7eaf76c94d24912c2bc5f)) by [@Mantisus](https://github.com/Mantisus)
- Expose `use_state` in `BasicCrawler` ([#1799](https://github.com/apify/crawlee-python/pull/1799)) ([d121873](https://github.com/apify/crawlee-python/commit/d121873a7f5902b911dd04b4aa9eaf75a8449323)) by [@Mantisus](https://github.com/Mantisus)

### 🐛 Bug Fixes

- **redis:** Do not remove handled request data from request queue ([#1787](https://github.com/apify/crawlee-python/pull/1787)) ([3008c61](https://github.com/apify/crawlee-python/commit/3008c61dcbe07ccdf3c43f198b37582cc1356c9a)) by [@kozlice](https://github.com/kozlice)
- **redis:** Update actual `Request` state in request queue Redis storage client ([#1789](https://github.com/apify/crawlee-python/pull/1789)) ([787231c](https://github.com/apify/crawlee-python/commit/787231cebeb863ee2b4395964a79a37053dbec01)) by [@Mantisus](https://github.com/Mantisus)


## [1.5.0](https://github.com/apify/crawlee-python/releases/tag/v1.5.0) (2026-03-06)

### 🚀 Features

- Use specialized Playwright docker images in templates ([#1757](https://github.com/apify/crawlee-python/pull/1757)) ([747c0cf](https://github.com/apify/crawlee-python/commit/747c0cf4a82296a2e3ea5cac5ef4c9578ea62a0c)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1756](https://github.com/apify/crawlee-python/issues/1756)
- Add `discover_valid_sitemaps` utility ([#1777](https://github.com/apify/crawlee-python/pull/1777)) ([872447b](https://github.com/apify/crawlee-python/commit/872447b60bbdb3926068064a971492807b1bdfbb)) by [@Mantisus](https://github.com/Mantisus), closes [#1740](https://github.com/apify/crawlee-python/issues/1740)

### 🐛 Bug Fixes

- Prevent list modification during iteration in BrowserPool ([#1703](https://github.com/apify/crawlee-python/pull/1703)) ([70309d9](https://github.com/apify/crawlee-python/commit/70309d9bf568d268a26b3ba6392be2b6ff284c65)) by [@vdusek](https://github.com/vdusek)
- Fix `max_requests_per_crawl` excluding failed requests ([#1766](https://github.com/apify/crawlee-python/pull/1766)) ([d6bb0b4](https://github.com/apify/crawlee-python/commit/d6bb0b4a9dc5dd6668d076fbfa1b5e748deaee0d)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1765](https://github.com/apify/crawlee-python/issues/1765)
- **playwright:** Dispose of `APIResponse` body for `send_request` ([#1771](https://github.com/apify/crawlee-python/pull/1771)) ([29d301b](https://github.com/apify/crawlee-python/commit/29d301bf9d7795f2fbaddb99235a7157b880f60c)) by [@kozlice](https://github.com/kozlice)
- Return `None` from `add_request` when storage client fails to enqueue request ([#1775](https://github.com/apify/crawlee-python/pull/1775)) ([944753a](https://github.com/apify/crawlee-python/commit/944753a71956c30f3ce0896ffa24be7de5348933)) by [@Mantisus](https://github.com/Mantisus)
- Re-use pre-existing browser context in `PlaywrightBrowserController` ([#1778](https://github.com/apify/crawlee-python/pull/1778)) ([4487543](https://github.com/apify/crawlee-python/commit/44875433df83d433aa69ada458b91df3ad569f5e)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1776](https://github.com/apify/crawlee-python/issues/1776)


## [1.4.0](https://github.com/apify/crawlee-python/releases/tag/v1.4.0) (2026-02-17)

### 🚀 Features

- Dynamic memory snapshots ([#1715](https://github.com/apify/crawlee-python/pull/1715)) ([568a7b1](https://github.com/apify/crawlee-python/commit/568a7b186dedda19ad814ee8af3cd8e256cc4ad9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1704](https://github.com/apify/crawlee-python/issues/1704)
- Add `MySQL` and `MariaDB` support for `SqlStorageClient` ([#1749](https://github.com/apify/crawlee-python/pull/1749)) ([202b500](https://github.com/apify/crawlee-python/commit/202b5009ea5d35ea779eb5b8db1fc575f90ca7bb)) by [@Mantisus](https://github.com/Mantisus)

### 🐛 Bug Fixes

- Make log levels consistent in ServiceLocator ([#1746](https://github.com/apify/crawlee-python/pull/1746)) ([4163413](https://github.com/apify/crawlee-python/commit/4163413049485b035c38efd6a4a7d41502a44cfc)) by [@janbuchar](https://github.com/janbuchar)
- Fix `PlaywrightCrawler` unintentionally setting the global configuration ([#1747](https://github.com/apify/crawlee-python/pull/1747)) ([fa58438](https://github.com/apify/crawlee-python/commit/fa58438026eb72a6002c8d494725bf4e48b4407e)) by [@Pijukatel](https://github.com/Pijukatel)
- Fix `Snapshotter` handling of out of order samples ([#1735](https://github.com/apify/crawlee-python/pull/1735)) ([387c712](https://github.com/apify/crawlee-python/commit/387c712306055d901b1c0df4a9666967f039aefd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1734](https://github.com/apify/crawlee-python/issues/1734)

### ⚡ Performance

- Optimize metadata records processing in `SqlStorageClient` ([#1551](https://github.com/apify/crawlee-python/pull/1551)) ([df1347a](https://github.com/apify/crawlee-python/commit/df1347aacf05c05980000d15b36b65996119ea86)) by [@Mantisus](https://github.com/Mantisus), closes [#1533](https://github.com/apify/crawlee-python/issues/1533)


## [1.3.2](https://github.com/apify/crawlee-python/releases/tag/v1.3.2) (2026-02-09)

### 🐛 Bug Fixes

- Use `max()` instead of `min()` for `request_max_duration` statistic ([#1701](https://github.com/apify/crawlee-python/pull/1701)) ([85c4335](https://github.com/apify/crawlee-python/commit/85c43351a05ada1369b720061f6f1a7e158340b6)) by [@vdusek](https://github.com/vdusek)
- Prevent mutation of default URL patterns list in `block_requests` ([#1702](https://github.com/apify/crawlee-python/pull/1702)) ([fcf9adb](https://github.com/apify/crawlee-python/commit/fcf9adb6a0cfeaa87ca482372d4e066584eb28d6)) by [@vdusek](https://github.com/vdusek)
- Keep None values for `user_data` in `Request` ([#1707](https://github.com/apify/crawlee-python/pull/1707)) ([3c575bc](https://github.com/apify/crawlee-python/commit/3c575bc2b0f1c89c99d134ad3a3fa7455ccc6910)) by [@Mantisus](https://github.com/Mantisus), closes [#1706](https://github.com/apify/crawlee-python/issues/1706)
- Respect `max_open_pages_per_browser` limit for `PlaywrightBrowserController` on concurrent `new_page` calls ([#1712](https://github.com/apify/crawlee-python/pull/1712)) ([2e5534b](https://github.com/apify/crawlee-python/commit/2e5534b98913d5cbd6b721b2423d063772024417)) by [@Mantisus](https://github.com/Mantisus)


## [1.3.1](https://github.com/apify/crawlee-python/releases/tag/v1.3.1) (2026-01-30)

### 🐛 Bug Fixes

- Reset all counters in metadata with `purge` for `RequestQueue` ([#1686](https://github.com/apify/crawlee-python/pull/1686)) ([ee09260](https://github.com/apify/crawlee-python/commit/ee0926084589f1b6e15840b6185ec5433be3b72f)) by [@Mantisus](https://github.com/Mantisus), closes [#1682](https://github.com/apify/crawlee-python/issues/1682)
- Set default `http3=False` for `ImpitHttpClient` ([#1685](https://github.com/apify/crawlee-python/pull/1685)) ([3f390f6](https://github.com/apify/crawlee-python/commit/3f390f677540a3905038d7db6a6d1efad32fd045)) by [@Mantisus](https://github.com/Mantisus), closes [#1683](https://github.com/apify/crawlee-python/issues/1683)
- Prevent get_request from permanently blocking requests ([#1684](https://github.com/apify/crawlee-python/pull/1684)) ([da416f9](https://github.com/apify/crawlee-python/commit/da416f98fb453904d62e7d29d8f24611ffb3ba8d)) by [@Mirza-Samad-Ahmed-Baig](https://github.com/Mirza-Samad-Ahmed-Baig)
- Do not share state between different crawlers unless requested ([#1669](https://github.com/apify/crawlee-python/pull/1669)) ([64c246b](https://github.com/apify/crawlee-python/commit/64c246bedea14f86e607d23adc5bec644c578364)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1627](https://github.com/apify/crawlee-python/issues/1627)


## [1.3.0](https://github.com/apify/crawlee-python/releases/tag/v1.3.0) (2026-01-20)

### 🚀 Features

- Expose `AdaptivePlaywrightCrawlerStatisticState` for `AdaptivePlaywrightCrawler` ([#1635](https://github.com/apify/crawlee-python/pull/1635)) ([1bb4bcb](https://github.com/apify/crawlee-python/commit/1bb4bcb4ccbec347ad9c14f70e9e946d48e3c38e)) by [@Mantisus](https://github.com/Mantisus)

### 🐛 Bug Fixes

- Prevent race condition in concurrent storage creation ([#1626](https://github.com/apify/crawlee-python/pull/1626)) ([7f17a43](https://github.com/apify/crawlee-python/commit/7f17a4347d5884962767e757a92ec173688fed7b)) by [@Mantisus](https://github.com/Mantisus), closes [#1621](https://github.com/apify/crawlee-python/issues/1621)
- Create correct statistics for `AdaptivePlaywrightCrawler` on initialization with a custom parser ([#1637](https://github.com/apify/crawlee-python/pull/1637)) ([bff7260](https://github.com/apify/crawlee-python/commit/bff726055dd0d7e07a2c546b15cbee22abd85960)) by [@Mantisus](https://github.com/Mantisus), closes [#1630](https://github.com/apify/crawlee-python/issues/1630)
- Fix adding extra link for `EnqueueLinksFunction` with `limit` ([#1674](https://github.com/apify/crawlee-python/pull/1674)) ([71d7867](https://github.com/apify/crawlee-python/commit/71d7867b14f7f07cac06899f5da006091af4a954)) by [@Mantisus](https://github.com/Mantisus), closes [#1673](https://github.com/apify/crawlee-python/issues/1673)


## [1.2.1](https://github.com/apify/crawlee-python/releases/tag/v1.2.1) (2025-12-16)

### 🐛 Bug Fixes

- Fix short error summary ([#1605](https://github.com/apify/crawlee-python/pull/1605)) ([b751208](https://github.com/apify/crawlee-python/commit/b751208d9a56e9d923e4559baeba35e2eede0450)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1602](https://github.com/apify/crawlee-python/issues/1602)
- Freeze core `Request` fields ([#1603](https://github.com/apify/crawlee-python/pull/1603)) ([ae6d86b](https://github.com/apify/crawlee-python/commit/ae6d86b8c82900116032596201d94cd7875aaadc)) by [@Mantisus](https://github.com/Mantisus)
- Respect `enqueue_strategy` after redirects in `enqueue_links` ([#1607](https://github.com/apify/crawlee-python/pull/1607)) ([700df91](https://github.com/apify/crawlee-python/commit/700df91bc9be1299388030a3e48e4dbc6f5b85a0)) by [@Mantisus](https://github.com/Mantisus), closes [#1606](https://github.com/apify/crawlee-python/issues/1606)
- Protect `Request` from partial mutations on request handler failure ([#1585](https://github.com/apify/crawlee-python/pull/1585)) ([a69caf8](https://github.com/apify/crawlee-python/commit/a69caf87edecc755287c53c8cc0ca4725af5d411)) by [@Mantisus](https://github.com/Mantisus), closes [#1514](https://github.com/apify/crawlee-python/issues/1514)


## [1.2.0](https://github.com/apify/crawlee-python/releases/tag/v1.2.0) (2025-12-08)

### 🚀 Features

- Add additional kwargs to Crawler's `export_data` ([#1597](https://github.com/apify/crawlee-python/pull/1597)) ([5977f37](https://github.com/apify/crawlee-python/commit/5977f376b93a7c0d4dd53f0d331a4b04fedba2c6)) by [@vdusek](https://github.com/vdusek), closes [#526](https://github.com/apify/crawlee-python/issues/526)
- Add `goto_options` for `PlaywrightCrawler` ([#1599](https://github.com/apify/crawlee-python/pull/1599)) ([0b82f3b](https://github.com/apify/crawlee-python/commit/0b82f3b6fb175223ea2aa5b348afcd5fdb767972)) by [@Mantisus](https://github.com/Mantisus), closes [#1576](https://github.com/apify/crawlee-python/issues/1576)

### 🐛 Bug Fixes

- Only apply requestHandlerTimeout to request handler ([#1474](https://github.com/apify/crawlee-python/pull/1474)) ([0dfb6c2](https://github.com/apify/crawlee-python/commit/0dfb6c2a13b6650736245fa39b3fbff397644df7)) by [@janbuchar](https://github.com/janbuchar)
- Handle the case when `error_handler` returns `Request` ([#1595](https://github.com/apify/crawlee-python/pull/1595)) ([8a961a2](https://github.com/apify/crawlee-python/commit/8a961a2b07d0d33a7302dbb13c17f3d90999d390)) by [@Mantisus](https://github.com/Mantisus)
- Align `Request.state` transitions with `Request` lifecycle ([#1601](https://github.com/apify/crawlee-python/pull/1601)) ([383225f](https://github.com/apify/crawlee-python/commit/383225f9f055d95ffb1302b8cf96f42ec264f1fc)) by [@Mantisus](https://github.com/Mantisus)


## [1.1.1](https://github.com/apify/crawlee-python/releases/tag/v1.1.1) (2025-12-02)

### 🐛 Bug Fixes

- Unify separators in `unique_key` construction ([#1569](https://github.com/apify/crawlee-python/pull/1569)) ([af46a37](https://github.com/apify/crawlee-python/commit/af46a3733b059a8052489296e172f005def953f7)) by [@vdusek](https://github.com/vdusek), closes [#1512](https://github.com/apify/crawlee-python/issues/1512)
- Fix `same-domain` strategy ignoring public suffix ([#1572](https://github.com/apify/crawlee-python/pull/1572)) ([3d018b2](https://github.com/apify/crawlee-python/commit/3d018b21a28a4bee493829783057188d6106a69b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1571](https://github.com/apify/crawlee-python/issues/1571)
- Make context helpers work in `FailedRequestHandler` and `ErrorHandler` ([#1570](https://github.com/apify/crawlee-python/pull/1570)) ([b830019](https://github.com/apify/crawlee-python/commit/b830019350830ac33075316061659e2854f7f4a5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1532](https://github.com/apify/crawlee-python/issues/1532)
- Fix non-ASCII character corruption in `FileSystemStorageClient` on systems without UTF-8 default encoding ([#1580](https://github.com/apify/crawlee-python/pull/1580)) ([f179f86](https://github.com/apify/crawlee-python/commit/f179f8671b0b6af9264450e4fef7e49d1cecd2bd)) by [@Mantisus](https://github.com/Mantisus), closes [#1579](https://github.com/apify/crawlee-python/issues/1579)
- Respect `<base>` when enqueuing ([#1590](https://github.com/apify/crawlee-python/pull/1590)) ([de517a1](https://github.com/apify/crawlee-python/commit/de517a1629cc29b20568143eb64018f216d4ba33)) by [@Mantisus](https://github.com/Mantisus), closes [#1589](https://github.com/apify/crawlee-python/issues/1589)


## [1.1.0](https://github.com/apify/crawlee-python/releases/tag/v1.1.0) (2025-11-18)

### 🚀 Features

- Add `chrome` `BrowserType` for `PlaywrightCrawler` to use the Chrome browser ([#1487](https://github.com/apify/crawlee-python/pull/1487)) ([b06937b](https://github.com/apify/crawlee-python/commit/b06937bbc3afe3c936b554bfc503365c1b2c526b)) by [@Mantisus](https://github.com/Mantisus), closes [#1071](https://github.com/apify/crawlee-python/issues/1071)
- Add `RedisStorageClient` based on Redis v8.0+ ([#1406](https://github.com/apify/crawlee-python/pull/1406)) ([d08d13d](https://github.com/apify/crawlee-python/commit/d08d13d39203c24ab61fe254b0956d6744db3b5f)) by [@Mantisus](https://github.com/Mantisus)
- Add support for Python 3.14 ([#1553](https://github.com/apify/crawlee-python/pull/1553)) ([89e9130](https://github.com/apify/crawlee-python/commit/89e9130cabee0fbc974b29c26483b7fa0edf627c)) by [@Mantisus](https://github.com/Mantisus)
- Add `transform_request_function` parameter for `SitemapRequestLoader` ([#1525](https://github.com/apify/crawlee-python/pull/1525)) ([dc90127](https://github.com/apify/crawlee-python/commit/dc901271849b239ba2a947e8ebff8e1815e8c4fb)) by [@Mantisus](https://github.com/Mantisus)

### 🐛 Bug Fixes

- Improve indexing of the `request_queue_records` table for `SqlRequestQueueClient` ([#1527](https://github.com/apify/crawlee-python/pull/1527)) ([6509534](https://github.com/apify/crawlee-python/commit/65095346a9d8b703b10c91e0510154c3c48a4176)) by [@Mantisus](https://github.com/Mantisus), closes [#1526](https://github.com/apify/crawlee-python/issues/1526)
- Improve error handling for `RobotsTxtFile.load` ([#1524](https://github.com/apify/crawlee-python/pull/1524)) ([596a311](https://github.com/apify/crawlee-python/commit/596a31184914a254b3e7a81fd2f48ea8eda7db49)) by [@Mantisus](https://github.com/Mantisus)
- Fix `crawler_runtime` being updated only at the end of the run instead of during it ([#1540](https://github.com/apify/crawlee-python/pull/1540)) ([0d6c3f6](https://github.com/apify/crawlee-python/commit/0d6c3f6d3337ddb6cab4873747c28cf95605d550)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1541](https://github.com/apify/crawlee-python/issues/1541)
- Ensure persist state event emission when exiting `EventManager` context ([#1562](https://github.com/apify/crawlee-python/pull/1562)) ([6a44f17](https://github.com/apify/crawlee-python/commit/6a44f172600cbcacebab899082d6efc9105c4e03)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1560](https://github.com/apify/crawlee-python/issues/1560)


## [1.0.4](https://github.com/apify/crawlee-python/releases/tag/v1.0.4) (2025-10-24)

### 🐛 Bug Fixes

- Respect `enqueue_strategy` in `enqueue_links` ([#1505](https://github.com/apify/crawlee-python/pull/1505)) ([6ee04bc](https://github.com/apify/crawlee-python/commit/6ee04bc08c50a70f2e956a79d4ce5072a726c3a8)) by [@Mantisus](https://github.com/Mantisus), closes [#1504](https://github.com/apify/crawlee-python/issues/1504)
- Exclude incorrect links before checking `robots.txt` ([#1502](https://github.com/apify/crawlee-python/pull/1502)) ([3273da5](https://github.com/apify/crawlee-python/commit/3273da5fee62ec9254666b376f382474c3532a56)) by [@Mantisus](https://github.com/Mantisus), closes [#1499](https://github.com/apify/crawlee-python/issues/1499)
- Resolve compatibility issue between `SqlStorageClient` and `AdaptivePlaywrightCrawler` ([#1496](https://github.com/apify/crawlee-python/pull/1496)) ([ce172c4](https://github.com/apify/crawlee-python/commit/ce172c425a8643a1d4c919db4f5e5a6e47e91deb)) by [@Mantisus](https://github.com/Mantisus), closes [#1495](https://github.com/apify/crawlee-python/issues/1495)
- Fix `BasicCrawler` statistics persistence ([#1490](https://github.com/apify/crawlee-python/pull/1490)) ([1eb1c19](https://github.com/apify/crawlee-python/commit/1eb1c19aa6f9dda4a0e3f7eda23f77a554f95076)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1501](https://github.com/apify/crawlee-python/issues/1501)
- Save context state in result for `AdaptivePlaywrightCrawler` after isolated processing in `SubCrawler` ([#1488](https://github.com/apify/crawlee-python/pull/1488)) ([62b7c70](https://github.com/apify/crawlee-python/commit/62b7c70b54085fc65a660062028014f4502beba9)) by [@Mantisus](https://github.com/Mantisus), closes [#1483](https://github.com/apify/crawlee-python/issues/1483)


## [1.0.3](https://github.com/apify/crawlee-python/releases/tag/v1.0.3) (2025-10-17)

### 🐛 Bug Fixes

- Add support for Pydantic v2.12 ([#1471](https://github.com/apify/crawlee-python/pull/1471)) ([35c1108](https://github.com/apify/crawlee-python/commit/35c110878c2f445a2866be2522ea8703e9b371dd)) by [@Mantisus](https://github.com/Mantisus), closes [#1464](https://github.com/apify/crawlee-python/issues/1464)
- Fix database version warning message ([#1485](https://github.com/apify/crawlee-python/pull/1485)) ([18a545e](https://github.com/apify/crawlee-python/commit/18a545ee8add92e844acd0068f9cb8580a82e1c9)) by [@Mantisus](https://github.com/Mantisus)
- Fix `reclaim_request` in `SqlRequestQueueClient` to correctly update the request state ([#1486](https://github.com/apify/crawlee-python/pull/1486)) ([1502469](https://github.com/apify/crawlee-python/commit/150246957f8f7f1ceb77bb77e3a02a903c50cae1)) by [@Mantisus](https://github.com/Mantisus), closes [#1484](https://github.com/apify/crawlee-python/issues/1484)
- Fix `KeyValueStore.auto_saved_value` failing in some scenarios ([#1438](https://github.com/apify/crawlee-python/pull/1438)) ([b35dee7](https://github.com/apify/crawlee-python/commit/b35dee78180e57161b826641d45a61b8d8f6ef51)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1354](https://github.com/apify/crawlee-python/issues/1354)


## [1.0.2](https://github.com/apify/crawlee-python/releases/tag/v1.0.2) (2025-10-08)

### 🐛 Bug Fixes

- Use Self type in the open() method of storage clients ([#1462](https://github.com/apify/crawlee-python/pull/1462)) ([4ec6f6c](https://github.com/apify/crawlee-python/commit/4ec6f6c08f81632197f602ff99151338b3eba6e7)) by [@janbuchar](https://github.com/janbuchar)
- Add storages name validation ([#1457](https://github.com/apify/crawlee-python/pull/1457)) ([84de11a](https://github.com/apify/crawlee-python/commit/84de11a3a603503076f5b7df487c9abab68a9015)) by [@Mantisus](https://github.com/Mantisus), closes [#1434](https://github.com/apify/crawlee-python/issues/1434)
- Pin pydantic version to `<2.12.0` to avoid compatibility issues ([#1467](https://github.com/apify/crawlee-python/pull/1467)) ([f11b86f](https://github.com/apify/crawlee-python/commit/f11b86f7ed57f98e83dc1b52f15f2017a919bf59)) by [@vdusek](https://github.com/vdusek)


## [1.0.1](https://github.com/apify/crawlee-python/releases/tag/v1.0.1) (2025-10-06)

### 🐛 Bug Fixes

- Fix memory leak in `PlaywrightCrawler` on browser context creation ([#1446](https://github.com/apify/crawlee-python/pull/1446)) ([bb181e5](https://github.com/apify/crawlee-python/commit/bb181e58d8070fba38e62d6e57fe981a00e5f035)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1443](https://github.com/apify/crawlee-python/issues/1443)
- Update templates to handle optional httpx client ([#1440](https://github.com/apify/crawlee-python/pull/1440)) ([c087efd](https://github.com/apify/crawlee-python/commit/c087efd39baedf46ca3e5cae1ddc1acd6396e6c1)) by [@Pijukatel](https://github.com/Pijukatel)


## [1.0.0](https://github.com/apify/crawlee-python/releases/tag/v1.0.0) (2025-09-29)

- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v1) for more details.
- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v1) to ensure a smooth update.

### 🚀 Features

- Add utility for loading and parsing sitemaps and `SitemapRequestLoader` ([#1169](https://github.com/apify/crawlee-python/pull/1169)) ([66599f8](https://github.com/apify/crawlee-python/commit/66599f8d085f3a8622e130019b6fdce2325737de)) by [@Mantisus](https://github.com/Mantisus), closes [#1161](https://github.com/apify/crawlee-python/issues/1161)
- Add periodic status logging and `status_message_callback` parameter for customization ([#1265](https://github.com/apify/crawlee-python/pull/1265)) ([b992fb2](https://github.com/apify/crawlee-python/commit/b992fb2a457dedd20fc3014d7a4a8afe14602342)) by [@Mantisus](https://github.com/Mantisus), closes [#96](https://github.com/apify/crawlee-python/issues/96)
- Add crawlee-cli option to skip project installation ([#1294](https://github.com/apify/crawlee-python/pull/1294)) ([4d5aef0](https://github.com/apify/crawlee-python/commit/4d5aef05613d10c1442fe449d1cf0f63392c98e3)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1122](https://github.com/apify/crawlee-python/issues/1122)
- Improve `Crawlee` CLI help text ([#1297](https://github.com/apify/crawlee-python/pull/1297)) ([afbe10f](https://github.com/apify/crawlee-python/commit/afbe10f15d93353f5bc551bf9f193414179d0dd7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1295](https://github.com/apify/crawlee-python/issues/1295)
- Add basic `OpenTelemetry` instrumentation ([#1255](https://github.com/apify/crawlee-python/pull/1255)) ([a92d8b3](https://github.com/apify/crawlee-python/commit/a92d8b3f843ee795bba7e14710bb1faa1fdbf292)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1254](https://github.com/apify/crawlee-python/issues/1254)
- Add `ImpitHttpClient` HTTP client using the `impit` library ([#1151](https://github.com/apify/crawlee-python/pull/1151)) ([0d0d268](https://github.com/apify/crawlee-python/commit/0d0d2681a4379c0e7ba54c49c86dabfef641610f)) by [@Mantisus](https://github.com/Mantisus)
- Prevent overloading system memory when running locally ([#1270](https://github.com/apify/crawlee-python/pull/1270)) ([30de3bd](https://github.com/apify/crawlee-python/commit/30de3bd7722cbc34db9fc582b4bda7dc2dfa90ff)) by [@janbuchar](https://github.com/janbuchar), closes [#1232](https://github.com/apify/crawlee-python/issues/1232)
- Expose `PlaywrightPersistentBrowser` class ([#1314](https://github.com/apify/crawlee-python/pull/1314)) ([b5fa955](https://github.com/apify/crawlee-python/commit/b5fa95508d7c099ff3a342577f338439283a975f)) by [@Mantisus](https://github.com/Mantisus)
- Add `impit` option for Crawlee CLI ([#1312](https://github.com/apify/crawlee-python/pull/1312)) ([508d7ce](https://github.com/apify/crawlee-python/commit/508d7ce4d998f37ab2adcf9c057c3c635a69f863)) by [@Mantisus](https://github.com/Mantisus)
- Persist RequestList state ([#1274](https://github.com/apify/crawlee-python/pull/1274)) ([cc68014](https://github.com/apify/crawlee-python/commit/cc680147ba3cc8b35b9da70274e53e6f5dd92434)) by [@janbuchar](https://github.com/janbuchar), closes [#99](https://github.com/apify/crawlee-python/issues/99)
- Persist `DefaultRenderingTypePredictor` state ([#1340](https://github.com/apify/crawlee-python/pull/1340)) ([fad4c25](https://github.com/apify/crawlee-python/commit/fad4c25fc712915c4a45b24e3290b6f5dbd8a683)) by [@Mantisus](https://github.com/Mantisus), closes [#1272](https://github.com/apify/crawlee-python/issues/1272)
- Persist the `SitemapRequestLoader` state ([#1347](https://github.com/apify/crawlee-python/pull/1347)) ([27ef9ad](https://github.com/apify/crawlee-python/commit/27ef9ad194552ea9f1321d91a7a52054be9a8a51)) by [@Mantisus](https://github.com/Mantisus), closes [#1269](https://github.com/apify/crawlee-python/issues/1269)
- Add support for NDU storages ([#1401](https://github.com/apify/crawlee-python/pull/1401)) ([5dbd212](https://github.com/apify/crawlee-python/commit/5dbd212663e7abc37535713f4c6e3a5bbf30a12e)) by [@vdusek](https://github.com/vdusek), closes [#1175](https://github.com/apify/crawlee-python/issues/1175)
- Add RQ id, name, alias args to `add_requests` and `enqueue_links` methods ([#1413](https://github.com/apify/crawlee-python/pull/1413)) ([1cae2bc](https://github.com/apify/crawlee-python/commit/1cae2bca0b1508fcb3cb419dc239caf33e20a7ef)) by [@Mantisus](https://github.com/Mantisus), closes [#1402](https://github.com/apify/crawlee-python/issues/1402)
- Add `SqlStorageClient` based on `sqlalchemy` v2+ ([#1339](https://github.com/apify/crawlee-python/pull/1339)) ([07c75a0](https://github.com/apify/crawlee-python/commit/07c75a078b443b58bfaaeb72eb2aa1439458dc47)) by [@Mantisus](https://github.com/Mantisus), closes [#307](https://github.com/apify/crawlee-python/issues/307)

### 🐛 Bug Fixes

- Fix memory estimation not working on MacOS ([#1330](https://github.com/apify/crawlee-python/pull/1330)) ([ab020eb](https://github.com/apify/crawlee-python/commit/ab020eb821a75723225b652d64babd84c368183f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1329](https://github.com/apify/crawlee-python/issues/1329)
- Fix retry count to not count the original request ([#1328](https://github.com/apify/crawlee-python/pull/1328)) ([74fa1d9](https://github.com/apify/crawlee-python/commit/74fa1d936cb3c29cf62d87862a96b4266694af2f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1326](https://github.com/apify/crawlee-python/issues/1326)
- [**breaking**] Remove unused `stats` field from `RequestQueueMetadata` ([#1331](https://github.com/apify/crawlee-python/pull/1331)) ([0a63bef](https://github.com/apify/crawlee-python/commit/0a63bef514b0bdcd3d6f208b386f706d0fe561e6)) by [@vdusek](https://github.com/vdusek)
- Ignore unknown parameters passed in cookies ([#1336](https://github.com/apify/crawlee-python/pull/1336)) ([50d3ef7](https://github.com/apify/crawlee-python/commit/50d3ef7540551383d26d40f3404b435bde35b47d)) by [@Mantisus](https://github.com/Mantisus), closes [#1333](https://github.com/apify/crawlee-python/issues/1333)
- Fix `timeout` for `stream` method in `ImpitHttpClient` ([#1352](https://github.com/apify/crawlee-python/pull/1352)) ([54b693b](https://github.com/apify/crawlee-python/commit/54b693b838f135a596e1e9493b565bc558b19a3a)) by [@Mantisus](https://github.com/Mantisus)
- Include reason in the session rotation warning logs ([#1363](https://github.com/apify/crawlee-python/pull/1363)) ([d6d7a45](https://github.com/apify/crawlee-python/commit/d6d7a45dd64a906419d9552c45062d726cbb1a0f)) by [@vdusek](https://github.com/vdusek), closes [#1318](https://github.com/apify/crawlee-python/issues/1318)
- Improve crawler statistics logging ([#1364](https://github.com/apify/crawlee-python/pull/1364)) ([1eb6da5](https://github.com/apify/crawlee-python/commit/1eb6da5dd85870124593dcad877284ccaed9c0ce)) by [@vdusek](https://github.com/vdusek), closes [#1317](https://github.com/apify/crawlee-python/issues/1317)
- Do not add a request that is already in progress to `MemoryRequestQueueClient` ([#1384](https://github.com/apify/crawlee-python/pull/1384)) ([3af326c](https://github.com/apify/crawlee-python/commit/3af326c9dfa8fffd56a42ca42981374613739e39)) by [@Mantisus](https://github.com/Mantisus), closes [#1383](https://github.com/apify/crawlee-python/issues/1383)
- Save `RequestQueueState` for `FileSystemRequestQueueClient` in default KVS ([#1411](https://github.com/apify/crawlee-python/pull/1411)) ([6ee60a0](https://github.com/apify/crawlee-python/commit/6ee60a08ac1f9414e1b792f4935cc3799cb5089a)) by [@Mantisus](https://github.com/Mantisus), closes [#1410](https://github.com/apify/crawlee-python/issues/1410)
- Set default desired concurrency for non-browser crawlers to 10 ([#1419](https://github.com/apify/crawlee-python/pull/1419)) ([1cc9401](https://github.com/apify/crawlee-python/commit/1cc940197600d2539bda967880d7f9d241eb8c3e)) by [@vdusek](https://github.com/vdusek)

### 🚜 Refactor

- [**breaking**] Introduce new storage client system ([#1194](https://github.com/apify/crawlee-python/pull/1194)) ([de1c03f](https://github.com/apify/crawlee-python/commit/de1c03f70dbd4ae1773fd49c632b3cfcfab82c26)) by [@vdusek](https://github.com/vdusek), closes [#92](https://github.com/apify/crawlee-python/issues/92), [#147](https://github.com/apify/crawlee-python/issues/147), [#783](https://github.com/apify/crawlee-python/issues/783), [#1247](https://github.com/apify/crawlee-python/issues/1247)
- [**breaking**] Split `BrowserType` literal into two different literals based on context ([#1070](https://github.com/apify/crawlee-python/pull/1070)) ([72b5698](https://github.com/apify/crawlee-python/commit/72b5698fa0647ea02b08da5651736cc37c4c0f6a)) by [@Pijukatel](https://github.com/Pijukatel)
- [**breaking**] Change method `HttpResponse.read` from sync to async ([#1296](https://github.com/apify/crawlee-python/pull/1296)) ([83fa8a4](https://github.com/apify/crawlee-python/commit/83fa8a416b6d2d4e27c678b9bf99bd1b8799f57b)) by [@Mantisus](https://github.com/Mantisus)
- [**breaking**] Replace `HttpxHttpClient` with `ImpitHttpClient` as default HTTP client ([#1307](https://github.com/apify/crawlee-python/pull/1307)) ([c803a97](https://github.com/apify/crawlee-python/commit/c803a976776a76846866d533e3a3ee8144e248c4)) by [@Mantisus](https://github.com/Mantisus), closes [#1079](https://github.com/apify/crawlee-python/issues/1079)
- [**breaking**] Change Dataset unwind parameter to accept list of strings ([#1357](https://github.com/apify/crawlee-python/pull/1357)) ([862a203](https://github.com/apify/crawlee-python/commit/862a20398f00fe91802fe7a1ccd58b05aee053a1)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Remove `Request.id` field ([#1366](https://github.com/apify/crawlee-python/pull/1366)) ([32f3580](https://github.com/apify/crawlee-python/commit/32f3580e9775a871924ab1233085d0c549c4cd52)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1358](https://github.com/apify/crawlee-python/issues/1358)
- [**breaking**] Refactor storage creation and caching, configuration and services ([#1386](https://github.com/apify/crawlee-python/pull/1386)) ([04649bd](https://github.com/apify/crawlee-python/commit/04649bde60d46b2bc18ae4f6e3fd9667d02a9cef)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1379](https://github.com/apify/crawlee-python/issues/1379)


## [0.6.12](https://github.com/apify/crawlee-python/releases/tag/v0.6.12) (2025-07-30)

### 🚀 Features

- Add `retire_browser_after_page_count` parameter for `BrowserPool` ([#1266](https://github.com/apify/crawlee-python/pull/1266)) ([603aa2b](https://github.com/apify/crawlee-python/commit/603aa2b192ef4bc42d88244bd009fffdb0614c06)) by [@Mantisus](https://github.com/Mantisus)

### 🐛 Bug Fixes

- Use `perf_counter_ns` for request duration tracking ([#1260](https://github.com/apify/crawlee-python/pull/1260)) ([9e92f6b](https://github.com/apify/crawlee-python/commit/9e92f6b54400ce5004fbab770e2e4ac42f73148f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1256](https://github.com/apify/crawlee-python/issues/1256)
- Fix memory estimation not working on macOS ([#1330](https://github.com/apify/crawlee-python/pull/1330)) ([8558954](https://github.com/apify/crawlee-python/commit/8558954feeb7d5e91378186974a29851fedae9c8)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1329](https://github.com/apify/crawlee-python/issues/1329)
- Fix retry count to not count the original request ([#1328](https://github.com/apify/crawlee-python/pull/1328)) ([1aff3aa](https://github.com/apify/crawlee-python/commit/1aff3aaf0cdbe452a3731192449a445e5b2d7a63)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1326](https://github.com/apify/crawlee-python/issues/1326)
- Ignore unknown parameters passed in cookies ([#1336](https://github.com/apify/crawlee-python/pull/1336)) ([0f2610c](https://github.com/apify/crawlee-python/commit/0f2610c0ee1154dc004de60fc57fe7c9f478166a)) by [@Mantisus](https://github.com/Mantisus), closes [#1333](https://github.com/apify/crawlee-python/issues/1333)


## [0.6.11](https://github.com/apify/crawlee-python/releases/tag/v0.6.11) (2025-06-23)

### 🚀 Features

- Add `stream` method for `HttpClient` ([#1241](https://github.com/apify/crawlee-python/pull/1241)) ([95c68b0](https://github.com/apify/crawlee-python/commit/95c68b0b2d0bf9e093c1d0ee1002625172f7a868)) by [@Mantisus](https://github.com/Mantisus)

### 🐛 Bug Fixes

- Fix `ClientSnapshot` overload calculation ([#1228](https://github.com/apify/crawlee-python/pull/1228)) ([a4fc1b6](https://github.com/apify/crawlee-python/commit/a4fc1b6e83143650666108c289c084ea0463b80c)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1207](https://github.com/apify/crawlee-python/issues/1207)
- Use `PSS` instead of `RSS` to estimate children process memory usage on Linux ([#1210](https://github.com/apify/crawlee-python/pull/1210)) ([436032f](https://github.com/apify/crawlee-python/commit/436032f2de5f7d7fa1016033f1bb224159a8e6bf)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1206](https://github.com/apify/crawlee-python/issues/1206)
- Do not raise an error when checking `same-domain` if there is no hostname in the URL ([#1251](https://github.com/apify/crawlee-python/pull/1251)) ([a6c3aab](https://github.com/apify/crawlee-python/commit/a6c3aabf5f8341f215275077b6768a56118bc656)) by [@Mantisus](https://github.com/Mantisus)


## [0.6.10](https://github.com/apify/crawlee-python/releases/tag/v0.6.10) (2025-06-02)

### 🐛 Bug Fixes

- Allow config change on `PlaywrightCrawler` ([#1186](https://github.com/apify/crawlee-python/pull/1186)) ([f17bf31](https://github.com/apify/crawlee-python/commit/f17bf31456b702631aa7e0c26d4f07fd5eb7d1bd)) by [@mylank](https://github.com/mylank), closes [#1185](https://github.com/apify/crawlee-python/issues/1185)
- Add `payload` to `SendRequestFunction` to support `POST` request ([#1202](https://github.com/apify/crawlee-python/pull/1202)) ([e7449f2](https://github.com/apify/crawlee-python/commit/e7449f206c580cb8383a66e4c9ff5f67c5ce8409)) by [@Mantisus](https://github.com/Mantisus)
- Fix match check for specified enqueue strategy for requests with redirect ([#1199](https://github.com/apify/crawlee-python/pull/1199)) ([d84c30c](https://github.com/apify/crawlee-python/commit/d84c30cbd7c94d6525d3b6e8e86b379050454c0e)) by [@Mantisus](https://github.com/Mantisus), closes [#1198](https://github.com/apify/crawlee-python/issues/1198)
- Set `WindowsSelectorEventLoopPolicy` only for curl-impersonate template without `playwright` ([#1209](https://github.com/apify/crawlee-python/pull/1209)) ([f3b839f](https://github.com/apify/crawlee-python/commit/f3b839ffc2ccc1b889b6d5928f35f57b725e27f1)) by [@Mantisus](https://github.com/Mantisus), closes [#1204](https://github.com/apify/crawlee-python/issues/1204)
- Add support for non-GET requests in `PlaywrightCrawler` ([#1208](https://github.com/apify/crawlee-python/pull/1208)) ([dbb9f44](https://github.com/apify/crawlee-python/commit/dbb9f44c71af15e1f86766fa0ba68281dd85fd9e)) by [@Mantisus](https://github.com/Mantisus), closes [#1201](https://github.com/apify/crawlee-python/issues/1201)
- Respect `EnqueueLinksKwargs` for `extract_links` function ([#1213](https://github.com/apify/crawlee-python/pull/1213)) ([c9907d6](https://github.com/apify/crawlee-python/commit/c9907d6ff4c3a4a719b279cea77694c00a5a963d)) by [@Mantisus](https://github.com/Mantisus), closes [#1212](https://github.com/apify/crawlee-python/issues/1212)


## [0.6.9](https://github.com/apify/crawlee-python/releases/tag/v0.6.9) (2025-05-02)

### 🚀 Features

- Add an internal `HttpClient` to be used in `send_request` for `PlaywrightCrawler` using `APIRequestContext` bound to the browser context ([#1134](https://github.com/apify/crawlee-python/pull/1134)) ([e794f49](https://github.com/apify/crawlee-python/commit/e794f4985d3a018ee76d634fe2b2c735fb450272)) by [@Mantisus](https://github.com/Mantisus), closes [#928](https://github.com/apify/crawlee-python/issues/928)
- Make timeout error log cleaner ([#1170](https://github.com/apify/crawlee-python/pull/1170)) ([78ea9d2](https://github.com/apify/crawlee-python/commit/78ea9d23e0b2d73286043b68393e462f636625c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1158](https://github.com/apify/crawlee-python/issues/1158)
- Add `on_skipped_request` decorator, to process links skipped according to `robots.txt` rules ([#1166](https://github.com/apify/crawlee-python/pull/1166)) ([bd16f14](https://github.com/apify/crawlee-python/commit/bd16f14a834eebf485aea6b6a83f2b18bf16b504)) by [@Mantisus](https://github.com/Mantisus), closes [#1160](https://github.com/apify/crawlee-python/issues/1160)

### 🐛 Bug Fixes

- Fix handling of errors without `args` in `_get_error_message` for `ErrorTracker` ([#1181](https://github.com/apify/crawlee-python/pull/1181)) ([21944d9](https://github.com/apify/crawlee-python/commit/21944d908b8404d2ad6c182104e7a8c27be12a6e)) by [@Mantisus](https://github.com/Mantisus), closes [#1179](https://github.com/apify/crawlee-python/issues/1179)
- Temporarily add `certifi<=2025.1.31` dependency ([#1183](https://github.com/apify/crawlee-python/pull/1183)) ([25ff961](https://github.com/apify/crawlee-python/commit/25ff961990f9abc9d0673ba6573dfcf46dd6e53f)) by [@Pijukatel](https://github.com/Pijukatel)


## [0.6.8](https://github.com/apify/crawlee-python/releases/tag/v0.6.8) (2025-04-25)

### 🚀 Features

- Handle unprocessed requests in `add_requests_batched` ([#1159](https://github.com/apify/crawlee-python/pull/1159)) ([7851175](https://github.com/apify/crawlee-python/commit/7851175304d63e455223b25853021cfbe15d68bd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#456](https://github.com/apify/crawlee-python/issues/456)
- Add `respect_robots_txt_file` option ([#1162](https://github.com/apify/crawlee-python/pull/1162)) ([c23f365](https://github.com/apify/crawlee-python/commit/c23f365bfd263b4357edf82c14a7c6ff8dee45e4)) by [@Mantisus](https://github.com/Mantisus)

### 🐛 Bug Fixes

- Update `UnprocessedRequest` to match actual data ([#1155](https://github.com/apify/crawlee-python/pull/1155)) ([a15a1f3](https://github.com/apify/crawlee-python/commit/a15a1f3528c7cbcf78d3bda5a236bcee1d492764)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1150](https://github.com/apify/crawlee-python/issues/1150)
- Fix the order in which cookies are saved to the `SessionCookies` and the handler is executed for `PlaywrightCrawler` ([#1163](https://github.com/apify/crawlee-python/pull/1163)) ([82ff69a](https://github.com/apify/crawlee-python/commit/82ff69acd8e409f56be56dd061aae0f854ec25b4)) by [@Mantisus](https://github.com/Mantisus)
- Call `failed_request_handler` for `SessionError` when session rotation count exceeds maximum ([#1147](https://github.com/apify/crawlee-python/pull/1147)) ([b3637b6](https://github.com/apify/crawlee-python/commit/b3637b68ec7eae9de7f1b923fa2f68885da64b90)) by [@Mantisus](https://github.com/Mantisus)


## [0.6.7](https://github.com/apify/crawlee-python/releases/tag/v0.6.7) (2025-04-17)

### 🚀 Features

- Add `ErrorSnapshotter` to `ErrorTracker` ([#1125](https://github.com/apify/crawlee-python/pull/1125)) ([9666092](https://github.com/apify/crawlee-python/commit/9666092c6a59ac4d34409038d5476e5b6fb58a26)) by [@Pijukatel](https://github.com/Pijukatel), closes [#151](https://github.com/apify/crawlee-python/issues/151)

### 🐛 Bug Fixes

- Improve validation errors in Crawlee CLI ([#1140](https://github.com/apify/crawlee-python/pull/1140)) ([f2d33df](https://github.com/apify/crawlee-python/commit/f2d33dff178a3d3079eb3807feb9645a25cc7a93)) by [@vdusek](https://github.com/vdusek), closes [#1138](https://github.com/apify/crawlee-python/issues/1138)
- Disable logger propagation to prevent duplicate logs ([#1156](https://github.com/apify/crawlee-python/pull/1156)) ([0b3648d](https://github.com/apify/crawlee-python/commit/0b3648d5d399f0af23520f7fb8ee635d38b512c4)) by [@vdusek](https://github.com/vdusek)


## [0.6.6](https://github.com/apify/crawlee-python/releases/tag/v0.6.6) (2025-04-03)

### 🚀 Features

- Add `statistics_log_format` parameter to `BasicCrawler` ([#1061](https://github.com/apify/crawlee-python/pull/1061)) ([635ae4a](https://github.com/apify/crawlee-python/commit/635ae4a56c65e434783ca721f4164203f465abf0)) by [@Mantisus](https://github.com/Mantisus), closes [#700](https://github.com/apify/crawlee-python/issues/700)
- Add Session binding capability via `session_id` in `Request` ([#1086](https://github.com/apify/crawlee-python/pull/1086)) ([cda7b31](https://github.com/apify/crawlee-python/commit/cda7b314ffda3104e4fd28a5e85c9e238d8866a4)) by [@Mantisus](https://github.com/Mantisus), closes [#1076](https://github.com/apify/crawlee-python/issues/1076)
- Add `requests` argument to `EnqueueLinksFunction` ([#1024](https://github.com/apify/crawlee-python/pull/1024)) ([fc8444c](https://github.com/apify/crawlee-python/commit/fc8444c245c7607d3e378a4835d7d3355c4059be)) by [@Pijukatel](https://github.com/Pijukatel)

### 🐛 Bug Fixes

- Add port for `same-origin` strategy check ([#1096](https://github.com/apify/crawlee-python/pull/1096)) ([9e24598](https://github.com/apify/crawlee-python/commit/9e245987d0aab0ba9c763689f12958b5a332db46)) by [@Mantisus](https://github.com/Mantisus)
- Fix handling of loading empty `metadata` file for queue ([#1042](https://github.com/apify/crawlee-python/pull/1042)) ([b00876e](https://github.com/apify/crawlee-python/commit/b00876e8dcb30a12d3737bd31237da9daada46bb)) by [@Mantisus](https://github.com/Mantisus), closes [#1029](https://github.com/apify/crawlee-python/issues/1029)
- Update favicon ([#1114](https://github.com/apify/crawlee-python/pull/1114)) ([eba900f](https://github.com/apify/crawlee-python/commit/eba900fc1e8d918c6fc464657c53004a3e0fe668)) by [@baldasseva](https://github.com/baldasseva)
- **website:** Use correct image source ([#1115](https://github.com/apify/crawlee-python/pull/1115)) ([ee7806f](https://github.com/apify/crawlee-python/commit/ee7806fc2f9b7b590d9668cc9f86009a898a3da6)) by [@baldasseva](https://github.com/baldasseva)


## [0.6.5](https://github.com/apify/crawlee-python/releases/tag/v0.6.5) (2025-03-13)

### 🐛 Bug Fixes

- Update to `browserforge` workaround ([#1075](https://github.com/apify/crawlee-python/pull/1075)) ([2378cf8](https://github.com/apify/crawlee-python/commit/2378cf84ab1ed06473049a9ddfca2ba6f166306d)) by [@Pijukatel](https://github.com/Pijukatel)


## [0.6.4](https://github.com/apify/crawlee-python/releases/tag/v0.6.4) (2025-03-12)

### 🐛 Bug Fixes

- Add a thread check before setting `add_signal_handler` ([#1068](https://github.com/apify/crawlee-python/pull/1068)) ([6983bda](https://github.com/apify/crawlee-python/commit/6983bda2dbc202b3ecbf7db62b11deee007b4b5f)) by [@Mantisus](https://github.com/Mantisus)
- Temporary workaround for `browserforge` import time code execution ([#1073](https://github.com/apify/crawlee-python/pull/1073)) ([17d914f](https://github.com/apify/crawlee-python/commit/17d914f78242078f88c07d686a567d1091255eb1)) by [@Pijukatel](https://github.com/Pijukatel)


## [0.6.3](https://github.com/apify/crawlee-python/releases/tag/v0.6.3) (2025-03-07)

### 🚀 Features

- Add project template with `uv` package manager ([#1057](https://github.com/apify/crawlee-python/pull/1057)) ([9ec06e5](https://github.com/apify/crawlee-python/commit/9ec06e58032aa11af46ac9cd1ea7bb002a18eb13)) by [@Mantisus](https://github.com/Mantisus), closes [#1053](https://github.com/apify/crawlee-python/issues/1053)
- Use fingerprint generator in `PlaywrightCrawler` by default ([#1060](https://github.com/apify/crawlee-python/pull/1060)) ([09cec53](https://github.com/apify/crawlee-python/commit/09cec532911043623eeb475aa8552c70bd94f8b7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1054](https://github.com/apify/crawlee-python/issues/1054)

### 🐛 Bug Fixes

- Update project templates for Poetry v2.x compatibility ([#1049](https://github.com/apify/crawlee-python/pull/1049)) ([96dc2f9](https://github.com/apify/crawlee-python/commit/96dc2f9b53b0a2d0f1d0c73d10e5244114e849ff)) by [@Mantisus](https://github.com/Mantisus), closes [#954](https://github.com/apify/crawlee-python/issues/954)
- Remove tmp folder for PlaywrightCrawler in non-headless mode ([#1046](https://github.com/apify/crawlee-python/pull/1046)) ([3a7f444](https://github.com/apify/crawlee-python/commit/3a7f444fb7ee9a0ab1867c8c9586b15aab1e7df2)) by [@Mantisus](https://github.com/Mantisus)


## [0.6.2](https://github.com/apify/crawlee-python/releases/tag/v0.6.2) (2025-03-05)

### 🚀 Features

- Extend ErrorTracker with error grouping ([#1014](https://github.com/apify/crawlee-python/pull/1014)) ([561de5c](https://github.com/apify/crawlee-python/commit/561de5c6b76af386cad5ac804a22fb7af227e460)) by [@Pijukatel](https://github.com/Pijukatel)


## [0.6.1](https://github.com/apify/crawlee-python/releases/tag/v0.6.1) (2025-03-03)

### 🐛 Bug Fixes

- Add `browserforge` to mandatory dependencies ([#1044](https://github.com/apify/crawlee-python/pull/1044)) ([ddfbde8](https://github.com/apify/crawlee-python/commit/ddfbde89dd3e3cbef0f3954936f4a41c3d6df909)) by [@Pijukatel](https://github.com/Pijukatel)


## [0.6.0](https://github.com/apify/crawlee-python/releases/tag/v0.6.0) (2025-03-03)

- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v06) for more details.
- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v06) to ensure a smooth update.

### 🚀 Features

- Integrate browserforge fingerprints ([#829](https://github.com/apify/crawlee-python/pull/829)) ([2b156b4](https://github.com/apify/crawlee-python/commit/2b156b4ba688f9111195422e6058dff30eb1f782)) by [@Pijukatel](https://github.com/Pijukatel), closes [#549](https://github.com/apify/crawlee-python/issues/549)
- Add AdaptivePlaywrightCrawler ([#872](https://github.com/apify/crawlee-python/pull/872)) ([5ba70b6](https://github.com/apify/crawlee-python/commit/5ba70b6e846a908a55db461ab0c85e3946f2bc7c)) by [@Pijukatel](https://github.com/Pijukatel)
- Implement `_snapshot_client` for `Snapshotter` ([#957](https://github.com/apify/crawlee-python/pull/957)) ([ba4d384](https://github.com/apify/crawlee-python/commit/ba4d384228d030c20c580ed01fae0e78af3a9543)) by [@Mantisus](https://github.com/Mantisus), closes [#60](https://github.com/apify/crawlee-python/issues/60)
- Add adaptive context helpers ([#964](https://github.com/apify/crawlee-python/pull/964)) ([e248f17](https://github.com/apify/crawlee-python/commit/e248f17fad7b6d1fc5e23a0a1e961db66068a411)) by [@Pijukatel](https://github.com/Pijukatel), closes [#249](https://github.com/apify/crawlee-python/issues/249)
- [**breaking**] Enable additional status codes arguments to PlaywrightCrawler ([#959](https://github.com/apify/crawlee-python/pull/959)) ([87cf446](https://github.com/apify/crawlee-python/commit/87cf446a7cbaa900e28abd93d4c8a2e0d1747059)) by [@Pijukatel](https://github.com/Pijukatel), closes [#953](https://github.com/apify/crawlee-python/issues/953)
- Replace `HeaderGenerator` implementation by `browserforge` implementation ([#960](https://github.com/apify/crawlee-python/pull/960)) ([c2f8c93](https://github.com/apify/crawlee-python/commit/c2f8c93a4ad57c4ede354545bf925bf3707899c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#937](https://github.com/apify/crawlee-python/issues/937)

### 🐛 Bug Fixes

- Fix playwright template and dockerfile ([#972](https://github.com/apify/crawlee-python/pull/972)) ([c33b34d](https://github.com/apify/crawlee-python/commit/c33b34dd6e253b1261c700857bb5c4bbec6d5c14)) by [@janbuchar](https://github.com/janbuchar), closes [#969](https://github.com/apify/crawlee-python/issues/969)
- Fix installing dependencies via pip in project template ([#977](https://github.com/apify/crawlee-python/pull/977)) ([1e3b8eb](https://github.com/apify/crawlee-python/commit/1e3b8eb1cdb57bf2f7256e8ae5f0706b0afc3ba9)) by [@janbuchar](https://github.com/janbuchar), closes [#975](https://github.com/apify/crawlee-python/issues/975)
- Fix default migration storage ([#1018](https://github.com/apify/crawlee-python/pull/1018)) ([6a0c4d9](https://github.com/apify/crawlee-python/commit/6a0c4d94593f7e94f24eee8a97fc7bc83c4d02e1)) by [@Pijukatel](https://github.com/Pijukatel), closes [#991](https://github.com/apify/crawlee-python/issues/991)
- Fix logger name for http based loggers ([#1023](https://github.com/apify/crawlee-python/pull/1023)) ([bfb3944](https://github.com/apify/crawlee-python/commit/bfb394446351c8f3b9879a9905607f7c929f2542)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1021](https://github.com/apify/crawlee-python/issues/1021)
- Remove allow_redirects override in CurlImpersonateHttpClient ([#1017](https://github.com/apify/crawlee-python/pull/1017)) ([01d855a](https://github.com/apify/crawlee-python/commit/01d855a43389a6b4b16ec74767624fa7eb13151f)) by [@2tunnels](https://github.com/2tunnels), closes [#1016](https://github.com/apify/crawlee-python/issues/1016)
- Remove follow_redirects override in HttpxHttpClient ([#1015](https://github.com/apify/crawlee-python/pull/1015)) ([88afda3](https://github.com/apify/crawlee-python/commit/88afda33e77be84bc91ad1239740b8e661bef2a2)) by [@2tunnels](https://github.com/2tunnels), closes [#1013](https://github.com/apify/crawlee-python/issues/1013)
- Fix flaky test_common_headers_and_user_agent ([#1030](https://github.com/apify/crawlee-python/pull/1030)) ([58aa70e](https://github.com/apify/crawlee-python/commit/58aa70e9600d313b823a1376ab9b36fb416c1c4a)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1027](https://github.com/apify/crawlee-python/issues/1027)

### 🚜 Refactor

- [**breaking**] Remove unused config properties ([#978](https://github.com/apify/crawlee-python/pull/978)) ([4b7fe29](https://github.com/apify/crawlee-python/commit/4b7fe2930540a5fbd753135e3ce29dc80f80c543)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Remove Base prefix from abstract class names ([#980](https://github.com/apify/crawlee-python/pull/980)) ([8ccb5d4](https://github.com/apify/crawlee-python/commit/8ccb5d41a1dae9b02088b433266ac89bd089561a)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Change default `incognito context` to `persistent context` for `Playwright` ([#985](https://github.com/apify/crawlee-python/pull/985)) ([f01520d](https://github.com/apify/crawlee-python/commit/f01520d22b31af9f0f13ca162cc47e6aa9744c6d)) by [@Mantisus](https://github.com/Mantisus), closes [#721](https://github.com/apify/crawlee-python/issues/721), [#963](https://github.com/apify/crawlee-python/issues/963)
- [**breaking**] Change `Session` cookies from `dict` to `SessionCookies` with `CookieJar` ([#984](https://github.com/apify/crawlee-python/pull/984)) ([6523b3a](https://github.com/apify/crawlee-python/commit/6523b3ade0eed53b0363ddce250c557024339b5e)) by [@Mantisus](https://github.com/Mantisus), closes [#710](https://github.com/apify/crawlee-python/issues/710), [#933](https://github.com/apify/crawlee-python/issues/933)
- [**breaking**] Replace enum with literal for `EnqueueStrategy` ([#1019](https://github.com/apify/crawlee-python/pull/1019)) ([d2481ef](https://github.com/apify/crawlee-python/commit/d2481ef71d3539979c5b1129387e72b4126fe366)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Update status code handling ([#1028](https://github.com/apify/crawlee-python/pull/1028)) ([6b59471](https://github.com/apify/crawlee-python/commit/6b5947125e63abdfff481b0669398fc9a7293e55)) by [@Mantisus](https://github.com/Mantisus), closes [#830](https://github.com/apify/crawlee-python/issues/830), [#998](https://github.com/apify/crawlee-python/issues/998)
- [**breaking**] Move `cli` dependencies to optional dependencies ([#1011](https://github.com/apify/crawlee-python/pull/1011)) ([4382959](https://github.com/apify/crawlee-python/commit/43829590c6b4efd1dc9b833373f82a842a0a1a8e)) by [@Mantisus](https://github.com/Mantisus), closes [#703](https://github.com/apify/crawlee-python/issues/703), [#1010](https://github.com/apify/crawlee-python/issues/1010)


## [0.5.4](https://github.com/apify/crawlee-python/releases/tag/v0.5.4) (2025-02-05)

### 🚀 Features

- Add support for `use_incognito_pages` in `browser_launch_options` for `PlaywrightCrawler` ([#941](https://github.com/apify/crawlee-python/pull/941)) ([eae3a33](https://github.com/apify/crawlee-python/commit/eae3a33a1842ebbdac5f9c51866a4be4bcf1ae2c)) by [@Mantisus](https://github.com/Mantisus)

### 🐛 Bug Fixes

- Fix session management with retire ([#947](https://github.com/apify/crawlee-python/pull/947)) ([caee03f](https://github.com/apify/crawlee-python/commit/caee03fe3a43cc1d7a8d3f9e19b42df1bdb1c0aa)) by [@Mantisus](https://github.com/Mantisus)
- Fix templates - poetry-plugin-export version and camoufox template name ([#952](https://github.com/apify/crawlee-python/pull/952)) ([7addea6](https://github.com/apify/crawlee-python/commit/7addea6605359cceba208e16ec9131724bdb3e9b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#951](https://github.com/apify/crawlee-python/issues/951)
- Fix conversion of relative links to absolute in `enqueue_links` for responses with redirects ([#956](https://github.com/apify/crawlee-python/pull/956)) ([694102e](https://github.com/apify/crawlee-python/commit/694102e163bb9021a4830d2545d153f6f8f3de90)) by [@Mantisus](https://github.com/Mantisus), closes [#955](https://github.com/apify/crawlee-python/issues/955)
- Fix `CurlImpersonateHttpClient` cookies handler ([#946](https://github.com/apify/crawlee-python/pull/946)) ([ed415c4](https://github.com/apify/crawlee-python/commit/ed415c433da2a40b0ee62534f0730d0737e991b8)) by [@Mantisus](https://github.com/Mantisus)


## [0.5.3](https://github.com/apify/crawlee-python/releases/tag/v0.5.3) (2025-01-31)

### 🚀 Features

- Add keep_alive flag to `crawler.__init__` ([#921](https://github.com/apify/crawlee-python/pull/921)) ([7a82d0c](https://github.com/apify/crawlee-python/commit/7a82d0cbdbe6c8739d4bf6a9b014e31f07e5a520)) by [@Pijukatel](https://github.com/Pijukatel), closes [#891](https://github.com/apify/crawlee-python/issues/891)
- Add `block_requests` helper for `PlaywrightCrawler` ([#919](https://github.com/apify/crawlee-python/pull/919)) ([1030459](https://github.com/apify/crawlee-python/commit/103045994908f80cffee5ccfff91a040e0042f48)) by [@Mantisus](https://github.com/Mantisus), closes [#848](https://github.com/apify/crawlee-python/issues/848)
- Return request handlers from decorator methods to allow further decoration ([#934](https://github.com/apify/crawlee-python/pull/934)) ([9ec0aae](https://github.com/apify/crawlee-python/commit/9ec0aae54e2a340d29c893567ae80bf8bd4510a9)) by [@mylank](https://github.com/mylank)
- Add `transform_request_function` for `enqueue_links` ([#923](https://github.com/apify/crawlee-python/pull/923)) ([6b15957](https://github.com/apify/crawlee-python/commit/6b159578f612251e6d2253a72b6521430f4f9b09)) by [@Mantisus](https://github.com/Mantisus), closes [#894](https://github.com/apify/crawlee-python/issues/894)
- Add `time_remaining_secs` property to `MIGRATING` event data ([#940](https://github.com/apify/crawlee-python/pull/940)) ([b44501b](https://github.com/apify/crawlee-python/commit/b44501bcadbd12673a8f47aa92f12da8e404f60b)) by [@fnesveda](https://github.com/fnesveda)
- Add LogisticalRegressionPredictor - rendering type predictor for adaptive crawling ([#930](https://github.com/apify/crawlee-python/pull/930)) ([8440499](https://github.com/apify/crawlee-python/commit/8440499468db115a4c478e9bcdb692554d1655c5)) by [@Pijukatel](https://github.com/Pijukatel)

### 🐛 Bug Fixes

- Fix crawler not retrying user handler if there was timeout in the handler ([#909](https://github.com/apify/crawlee-python/pull/909)) ([f4090ef](https://github.com/apify/crawlee-python/commit/f4090ef0ea0281d53dab16a77ceea2ef6ac43d76)) by [@Pijukatel](https://github.com/Pijukatel), closes [#907](https://github.com/apify/crawlee-python/issues/907)
- Optimize memory consumption for `HttpxHttpClient`, fix proxy handling ([#905](https://github.com/apify/crawlee-python/pull/905)) ([d7ad480](https://github.com/apify/crawlee-python/commit/d7ad480834263ae0480049cb0a8db4dfc3946d8d)) by [@Mantisus](https://github.com/Mantisus), closes [#895](https://github.com/apify/crawlee-python/issues/895)
- Fix `BrowserPool` and `PlaywrightBrowserPlugin` closure ([#932](https://github.com/apify/crawlee-python/pull/932)) ([997543d](https://github.com/apify/crawlee-python/commit/997543d2fa5afba49929f4407ee95d7a4933a50d)) by [@Mantisus](https://github.com/Mantisus)


## [0.5.2](https://github.com/apify/crawlee-python/releases/tag/v0.5.2) (2025-01-17)

### 🐛 Bug Fixes

- Avoid `use_state` race conditions. Remove key argument to `use_state` ([#868](https://github.com/apify/crawlee-python/pull/868)) ([000b976](https://github.com/apify/crawlee-python/commit/000b9761211502d86a893a31e3ca21998a6e3b99)) by [@Pijukatel](https://github.com/Pijukatel), closes [#856](https://github.com/apify/crawlee-python/issues/856)
- Restore proxy functionality for PlaywrightCrawler broken in v0.5 ([#889](https://github.com/apify/crawlee-python/pull/889)) ([908c944](https://github.com/apify/crawlee-python/commit/908c944ff9b1fc8ed7eb35f0078a1de71e34d5c5)) by [@Mantisus](https://github.com/Mantisus), closes [#887](https://github.com/apify/crawlee-python/issues/887)
- Fix the usage of Configuration ([#899](https://github.com/apify/crawlee-python/pull/899)) ([0f1cf6f](https://github.com/apify/crawlee-python/commit/0f1cf6f0b52c92ca4e465a2a01f8111cd9ab42ec)) by [@vdusek](https://github.com/vdusek), closes [#670](https://github.com/apify/crawlee-python/issues/670)


## [0.5.1](https://github.com/apify/crawlee-python/releases/tag/v0.5.1) (2025-01-07)

### 🐛 Bug Fixes

- Make result of RequestList.is_empty independent of fetch_next_request calls ([#876](https://github.com/apify/crawlee-python/pull/876)) ([d50249e](https://github.com/apify/crawlee-python/commit/d50249ecbfe2a04f508fcdc3261e050349bd0da2)) by [@janbuchar](https://github.com/janbuchar)


## [0.5.0](https://github.com/apify/crawlee-python/releases/tag/v0.5.0) (2025-01-02)

- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v05) for more details.
- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v05) to ensure a smooth update.

### 🚀 Features

- Add possibility to use None as no proxy in tiered proxies ([#760](https://github.com/apify/crawlee-python/pull/760)) ([0fbd017](https://github.com/apify/crawlee-python/commit/0fbd01723b9fe2e3410e0f358cab2f22848b08d0)) by [@Pijukatel](https://github.com/Pijukatel), closes [#687](https://github.com/apify/crawlee-python/issues/687)
- Add `use_state` context method ([#682](https://github.com/apify/crawlee-python/pull/682)) ([868b41e](https://github.com/apify/crawlee-python/commit/868b41ebd4c8003fa60ab07887577d0fb85b6ecc)) by [@Mantisus](https://github.com/Mantisus), closes [#191](https://github.com/apify/crawlee-python/issues/191)
- Add pre-navigation hooks router to AbstractHttpCrawler ([#791](https://github.com/apify/crawlee-python/pull/791)) ([0f23205](https://github.com/apify/crawlee-python/commit/0f23205923065074c522b3de9d47218a204dfa78)) by [@Pijukatel](https://github.com/Pijukatel), closes [#635](https://github.com/apify/crawlee-python/issues/635)
- Add example of how to integrate Camoufox into PlaywrightCrawler ([#789](https://github.com/apify/crawlee-python/pull/789)) ([246cfc4](https://github.com/apify/crawlee-python/commit/246cfc4ebc8bce1d15e1dddd62d652bd65869328)) by [@Pijukatel](https://github.com/Pijukatel), closes [#684](https://github.com/apify/crawlee-python/issues/684)
- Expose event types, improve on/emit signature, allow parameterless listeners ([#800](https://github.com/apify/crawlee-python/pull/800)) ([c102c4c](https://github.com/apify/crawlee-python/commit/c102c4c894a00b09adfd5f4911563c81cf3e98b4)) by [@janbuchar](https://github.com/janbuchar), closes [#561](https://github.com/apify/crawlee-python/issues/561)
- Add stop method to BasicCrawler ([#807](https://github.com/apify/crawlee-python/pull/807)) ([6d01af4](https://github.com/apify/crawlee-python/commit/6d01af4231d02b4349a8719f5ed18d812843fde5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#651](https://github.com/apify/crawlee-python/issues/651)
- Add `html_to_text` helper function ([#792](https://github.com/apify/crawlee-python/pull/792)) ([2b9d970](https://github.com/apify/crawlee-python/commit/2b9d97009dd653870681bb3cadbb46b214ff1a73)) by [@Pijukatel](https://github.com/Pijukatel), closes [#659](https://github.com/apify/crawlee-python/issues/659)
- [**breaking**] Implement `RequestManagerTandem`, remove `add_request` from `RequestList`, accept any iterable in `RequestList` constructor ([#777](https://github.com/apify/crawlee-python/pull/777)) ([4172652](https://github.com/apify/crawlee-python/commit/4172652079e5e91190c1cc5e2138fd41a7c84a6b)) by [@janbuchar](https://github.com/janbuchar)

### 🐛 Bug Fixes

- Fix circular import in `KeyValueStore` ([#805](https://github.com/apify/crawlee-python/pull/805)) ([8bdf49d](https://github.com/apify/crawlee-python/commit/8bdf49d1cb2a94b66f69fd1b77063a4113517fae)) by [@Mantisus](https://github.com/Mantisus), closes [#804](https://github.com/apify/crawlee-python/issues/804)
- [**breaking**] Refactor service usage to rely on `service_locator` ([#691](https://github.com/apify/crawlee-python/pull/691)) ([1d31c6c](https://github.com/apify/crawlee-python/commit/1d31c6c7e7a9ec7cee5b2de900568d9f77db65ba)) by [@vdusek](https://github.com/vdusek), closes [#369](https://github.com/apify/crawlee-python/issues/369), [#539](https://github.com/apify/crawlee-python/issues/539), [#699](https://github.com/apify/crawlee-python/issues/699)
- Pass `verify` in httpx client ([#802](https://github.com/apify/crawlee-python/pull/802)) ([074d083](https://github.com/apify/crawlee-python/commit/074d0836b55e52f13726e7cd1c21602623fda4fc)) by [@Mantisus](https://github.com/Mantisus), closes [#798](https://github.com/apify/crawlee-python/issues/798)
- Fix `page_options` for `PlaywrightBrowserPlugin` ([#796](https://github.com/apify/crawlee-python/pull/796)) ([bd3bdd4](https://github.com/apify/crawlee-python/commit/bd3bdd4046c2ddea62feb77322033cad50f382dd)) by [@Mantisus](https://github.com/Mantisus), closes [#755](https://github.com/apify/crawlee-python/issues/755)
- Fix event migrating handler in `RequestQueue` ([#825](https://github.com/apify/crawlee-python/pull/825)) ([fd6663f](https://github.com/apify/crawlee-python/commit/fd6663f903bc7eecd1000da89e06197b43dfb962)) by [@Mantisus](https://github.com/Mantisus), closes [#815](https://github.com/apify/crawlee-python/issues/815)
- Respect user configuration when handling status codes ([#812](https://github.com/apify/crawlee-python/pull/812)) ([8daf4bd](https://github.com/apify/crawlee-python/commit/8daf4bd49c1b09a0924f827daedebf7600ac609b)) by [@Mantisus](https://github.com/Mantisus), closes [#708](https://github.com/apify/crawlee-python/issues/708), [#756](https://github.com/apify/crawlee-python/issues/756)
- `abort-on-error` for successive runs ([#834](https://github.com/apify/crawlee-python/pull/834)) ([0cea673](https://github.com/apify/crawlee-python/commit/0cea67387bf366800b447de784af580159b199ee)) by [@Mantisus](https://github.com/Mantisus)
- Relax ServiceLocator restrictions ([#837](https://github.com/apify/crawlee-python/pull/837)) ([aa3667f](https://github.com/apify/crawlee-python/commit/aa3667f344d78945df3eca77431e1409f43f8bb5)) by [@janbuchar](https://github.com/janbuchar), closes [#806](https://github.com/apify/crawlee-python/issues/806)
- Fix typo in exports ([#841](https://github.com/apify/crawlee-python/pull/841)) ([8fa6ac9](https://github.com/apify/crawlee-python/commit/8fa6ac994fe4f3f6430cb796a0c6a732c93c672b)) by [@janbuchar](https://github.com/janbuchar)

### 🚜 Refactor

- [**breaking**] Refactor HttpCrawler, BeautifulSoupCrawler, ParselCrawler inheritance ([#746](https://github.com/apify/crawlee-python/pull/746)) ([9d3c269](https://github.com/apify/crawlee-python/commit/9d3c2697c91ce93028ca86a91d85d465d36c1ad7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#350](https://github.com/apify/crawlee-python/issues/350)
- [**breaking**] Remove `json_` and `order_no` from `Request` ([#788](https://github.com/apify/crawlee-python/pull/788)) ([5381d13](https://github.com/apify/crawlee-python/commit/5381d13aa51a757fc1906f400788555df090a1af)) by [@Mantisus](https://github.com/Mantisus), closes [#94](https://github.com/apify/crawlee-python/issues/94)
- [**breaking**] Rename PwPreNavContext to PwPreNavCrawlingContext ([#827](https://github.com/apify/crawlee-python/pull/827)) ([84b61a3](https://github.com/apify/crawlee-python/commit/84b61a3d25bee42faed4e81cd156663f251b3d3d)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Rename PlaywrightCrawler kwargs: browser_options, page_options ([#831](https://github.com/apify/crawlee-python/pull/831)) ([ffc6048](https://github.com/apify/crawlee-python/commit/ffc6048e9dc5c5e862271fa50c48bb0fb6f0a18f)) by [@Pijukatel](https://github.com/Pijukatel)
- [**breaking**] Update the crawlers & storage clients structure ([#828](https://github.com/apify/crawlee-python/pull/828)) ([0ba04d1](https://github.com/apify/crawlee-python/commit/0ba04d1633881043928a408678932c46fb90e21f)) by [@vdusek](https://github.com/vdusek), closes [#764](https://github.com/apify/crawlee-python/issues/764)


## [0.4.5](https://github.com/apify/crawlee-python/releases/tag/v0.4.5) (2024-12-06)

### 🚀 Features

- Improve project bootstrapping ([#538](https://github.com/apify/crawlee-python/pull/538)) ([367899c](https://github.com/apify/crawlee-python/commit/367899cbad5021674f6e41c4dd7eb2266fe043aa)) by [@janbuchar](https://github.com/janbuchar), closes [#317](https://github.com/apify/crawlee-python/issues/317), [#414](https://github.com/apify/crawlee-python/issues/414), [#495](https://github.com/apify/crawlee-python/issues/495), [#511](https://github.com/apify/crawlee-python/issues/511)

### 🐛 Bug Fixes

- Add upper bound of HTTPX version ([#775](https://github.com/apify/crawlee-python/pull/775)) ([b59e34d](https://github.com/apify/crawlee-python/commit/b59e34d6301e26825d88608152ffb337ef602a9f)) by [@vdusek](https://github.com/vdusek)
- Fix incorrect use of desired concurrency ratio ([#780](https://github.com/apify/crawlee-python/pull/780)) ([d1f8bfb](https://github.com/apify/crawlee-python/commit/d1f8bfb68ce2ef13b550ce415a3689858112a4c7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#759](https://github.com/apify/crawlee-python/issues/759)
- Remove pydantic constraint `<2.10.0` and update timedelta validator, serializer type hints ([#757](https://github.com/apify/crawlee-python/pull/757)) ([c0050c0](https://github.com/apify/crawlee-python/commit/c0050c0ee76e5deb28f174ecf276b0e6abf68b9d)) by [@Pijukatel](https://github.com/Pijukatel)


## [0.4.4](https://github.com/apify/crawlee-python/releases/tag/v0.4.4) (2024-11-29)

### 🚀 Features

- Expose browser_options and page_options to PlaywrightCrawler ([#730](https://github.com/apify/crawlee-python/pull/730)) ([dbe85b9](https://github.com/apify/crawlee-python/commit/dbe85b90e59def281cfc6617a0eb869a4adf2fc0)) by [@vdusek](https://github.com/vdusek), closes [#719](https://github.com/apify/crawlee-python/issues/719)
- Add `abort_on_error` property ([#731](https://github.com/apify/crawlee-python/pull/731)) ([6dae03a](https://github.com/apify/crawlee-python/commit/6dae03a68a2d23c68c78d8d44611d43e40eb9404)) by [@Mantisus](https://github.com/Mantisus), closes [#704](https://github.com/apify/crawlee-python/issues/704)

### 🐛 Bug Fixes

- Fix init of context managers and context handling in `BasicCrawler` ([#714](https://github.com/apify/crawlee-python/pull/714)) ([486fe6d](https://github.com/apify/crawlee-python/commit/486fe6d6cd56cb560ab51a32ec0286d9e32267cb)) by [@vdusek](https://github.com/vdusek)


## [0.4.3](https://github.com/apify/crawlee-python/releases/tag/v0.4.3) (2024-11-21)

### 🐛 Bug Fixes

- Pydantic 2.10.0 issues ([#716](https://github.com/apify/crawlee-python/pull/716)) ([8d8b3fc](https://github.com/apify/crawlee-python/commit/8d8b3fcff8be10edf5351f5324c7ba112c1d2ba0)) by [@Pijukatel](https://github.com/Pijukatel)


## [0.4.2](https://github.com/apify/crawlee-python/releases/tag/v0.4.2) (2024-11-20)

### 🐛 Bug Fixes

- Respect custom HTTP headers in `PlaywrightCrawler` ([#685](https://github.com/apify/crawlee-python/pull/685)) ([a84125f](https://github.com/apify/crawlee-python/commit/a84125f031347426de44b8f015c87882c8f96f72)) by [@Mantisus](https://github.com/Mantisus)
- Fix payload serialization in Request and fix docs for POST requests ([#683](https://github.com/apify/crawlee-python/pull/683)) ([e8b4d2d](https://github.com/apify/crawlee-python/commit/e8b4d2d4989fd9967403b828c914cb7ae2ef9b8b)) by [@Mantisus](https://github.com/Mantisus), closes [#668](https://github.com/apify/crawlee-python/issues/668)
- Accept string payload in the Request constructor ([#697](https://github.com/apify/crawlee-python/pull/697)) ([19f5add](https://github.com/apify/crawlee-python/commit/19f5addc0223d68389eea47864830c709335ab6e)) by [@vdusek](https://github.com/vdusek)
- Fix snapshots handling ([#692](https://github.com/apify/crawlee-python/pull/692)) ([4016c0d](https://github.com/apify/crawlee-python/commit/4016c0d8121a8950ab1df22188eac838a011c39f)) by [@Pijukatel](https://github.com/Pijukatel)


## [0.4.1](https://github.com/apify/crawlee-python/releases/tag/v0.4.1) (2024-11-11)

### 🚀 Features

- Add `max_crawl_depth` option to `BasicCrawler` ([#637](https://github.com/apify/crawlee-python/pull/637)) ([77deaa9](https://github.com/apify/crawlee-python/commit/77deaa964e2c1e74af1c5117a13d8d8257f0e27e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#460](https://github.com/apify/crawlee-python/issues/460)
- Add BeautifulSoupParser type alias ([#674](https://github.com/apify/crawlee-python/pull/674)) ([b2cf88f](https://github.com/apify/crawlee-python/commit/b2cf88ffea8d75808c9210850a03fcc70b0b9e3d)) by [@Pijukatel](https://github.com/Pijukatel)

### 🐛 Bug Fixes

- Fix total_size usage in memory size monitoring ([#661](https://github.com/apify/crawlee-python/pull/661)) ([c2a3239](https://github.com/apify/crawlee-python/commit/c2a32397eecd5cc7f412c2af7269b004a8b2eaf2)) by [@janbuchar](https://github.com/janbuchar)
- Add HttpHeaders to module exports ([#664](https://github.com/apify/crawlee-python/pull/664)) ([f0c5ca7](https://github.com/apify/crawlee-python/commit/f0c5ca717d9f9e304d375da2c23552c26ca870da)) by [@vdusek](https://github.com/vdusek), closes [#663](https://github.com/apify/crawlee-python/issues/663)
- Fix unhandled ValueError in request handler result processing ([#666](https://github.com/apify/crawlee-python/pull/666)) ([0a99d7f](https://github.com/apify/crawlee-python/commit/0a99d7f693245eb9a065016fb6f2d268f6956805)) by [@janbuchar](https://github.com/janbuchar)
- Fix BaseDatasetClient.iter_items type hints ([#680](https://github.com/apify/crawlee-python/pull/680)) ([a968b1b](https://github.com/apify/crawlee-python/commit/a968b1be6fceb56676b0198a044c8fceac7c92a6)) by [@Pijukatel](https://github.com/Pijukatel)


## [0.4.0](https://github.com/apify/crawlee-python/releases/tag/v0.4.0) (2024-11-01)

- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v04) to ensure a smooth update.

### 🚀 Features

- [**breaking**] Add headers in unique key computation ([#609](https://github.com/apify/crawlee-python/pull/609)) ([6c4746f](https://github.com/apify/crawlee-python/commit/6c4746fa8ff86952a812b32a1d70dc910e76b43e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#548](https://github.com/apify/crawlee-python/issues/548)
- Add `pre_navigation_hooks` to `PlaywrightCrawler` ([#631](https://github.com/apify/crawlee-python/pull/631)) ([5dd5b60](https://github.com/apify/crawlee-python/commit/5dd5b60e2a44d5bd3748b613790e1bee3232d6f3)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#427](https://github.com/apify/crawlee-python/issues/427)
- Add `always_enqueue` option to bypass URL deduplication ([#621](https://github.com/apify/crawlee-python/pull/621)) ([4e59fa4](https://github.com/apify/crawlee-python/commit/4e59fa46daaec05e52262cf62c26f28ddcd772af)) by [@Rutam21](https://github.com/Rutam21), closes [#547](https://github.com/apify/crawlee-python/issues/547)
- Split and add extra configuration to export_data method ([#580](https://github.com/apify/crawlee-python/pull/580)) ([6751635](https://github.com/apify/crawlee-python/commit/6751635e1785a4a27f60092c82f5dd0c40193d52)) by [@deshansh](https://github.com/deshansh), closes [#526](https://github.com/apify/crawlee-python/issues/526)

### 🐛 Bug Fixes

- Use strip in headers normalization ([#614](https://github.com/apify/crawlee-python/pull/614)) ([a15b21e](https://github.com/apify/crawlee-python/commit/a15b21e51deaf2b67738f95bc2b15c1c16d1775f)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Merge payload and data fields of Request ([#542](https://github.com/apify/crawlee-python/pull/542)) ([d06fcef](https://github.com/apify/crawlee-python/commit/d06fcef3fee44616ded5f587b9c7313b82a57cc7)) by [@vdusek](https://github.com/vdusek), closes [#560](https://github.com/apify/crawlee-python/issues/560)
- Default ProxyInfo port if httpx.URL port is None ([#619](https://github.com/apify/crawlee-python/pull/619)) ([8107a6f](https://github.com/apify/crawlee-python/commit/8107a6f97e8f16a330e7d02d3fc6ea34c5f78d77)) by [@steffansafey](https://github.com/steffansafey), closes [#618](https://github.com/apify/crawlee-python/issues/618)

### ⚙️ Miscellaneous Tasks

- [**breaking**] Remove Request.query_params field ([#639](https://github.com/apify/crawlee-python/pull/639)) ([6ec0ec4](https://github.com/apify/crawlee-python/commit/6ec0ec4fa0cef9b8bf893e70d99f068675c9c54c)) by [@vdusek](https://github.com/vdusek), closes [#615](https://github.com/apify/crawlee-python/issues/615)


## [0.3.9](https://github.com/apify/crawlee-python/releases/tag/v0.3.9) (2024-10-23)

### 🚀 Features

- Key-value store context helpers ([#584](https://github.com/apify/crawlee-python/pull/584)) ([fc15622](https://github.com/apify/crawlee-python/commit/fc156222c3747fc4cc7bd7666a21769845c7d0d5)) by [@janbuchar](https://github.com/janbuchar)
- Added get_public_url method to KeyValueStore ([#572](https://github.com/apify/crawlee-python/pull/572)) ([3a4ba8f](https://github.com/apify/crawlee-python/commit/3a4ba8f459903b6288aec40de2c3ca862e36abec)) by [@akshay11298](https://github.com/akshay11298), closes [#514](https://github.com/apify/crawlee-python/issues/514)

### 🐛 Bug Fixes

- Workaround for JSON value typing problems ([#581](https://github.com/apify/crawlee-python/pull/581)) ([403496a](https://github.com/apify/crawlee-python/commit/403496a53c12810351139a6e073238143ecc5930)) by [@janbuchar](https://github.com/janbuchar), closes [#563](https://github.com/apify/crawlee-python/issues/563)


## [0.3.8](https://github.com/apify/crawlee-python/releases/tag/v0.3.8) (2024-10-02)

### 🚀 Features

- Mask Playwright's "headless" headers ([#545](https://github.com/apify/crawlee-python/pull/545)) ([d1445e4](https://github.com/apify/crawlee-python/commit/d1445e4858fd804bb4a2e35efa1d2f5254d8df6b)) by [@vdusek](https://github.com/vdusek), closes [#401](https://github.com/apify/crawlee-python/issues/401)
- Add new model for `HttpHeaders` ([#544](https://github.com/apify/crawlee-python/pull/544)) ([854f2c1](https://github.com/apify/crawlee-python/commit/854f2c1e2e09cf398e04b1e153534282add1247e)) by [@vdusek](https://github.com/vdusek)

### 🐛 Bug Fixes

- Call `error_handler` for `SessionError` ([#557](https://github.com/apify/crawlee-python/pull/557)) ([e75ac4b](https://github.com/apify/crawlee-python/commit/e75ac4b70cd48a4ca9f8245cea3c5f3c188b8824)) by [@vdusek](https://github.com/vdusek), closes [#546](https://github.com/apify/crawlee-python/issues/546)
- Extend from `StrEnum` in `RequestState` to fix serialization ([#556](https://github.com/apify/crawlee-python/pull/556)) ([6bf35ba](https://github.com/apify/crawlee-python/commit/6bf35ba4a6913819706ebd1d2c1156a4c62f944e)) by [@vdusek](https://github.com/vdusek), closes [#551](https://github.com/apify/crawlee-python/issues/551)
- Add equality check to UserData model ([#562](https://github.com/apify/crawlee-python/pull/562)) ([899a25c](https://github.com/apify/crawlee-python/commit/899a25ca63f570b3c4d8d56c85a838b371fd3924)) by [@janbuchar](https://github.com/janbuchar)


## [0.3.7](https://github.com/apify/crawlee-python/releases/tag/v0.3.7) (2024-09-25)

### 🐛 Bug Fixes

- Improve `Request.user_data` serialization ([#540](https://github.com/apify/crawlee-python/pull/540)) ([de29c0e](https://github.com/apify/crawlee-python/commit/de29c0e6b737a9d2544c5382472618dde76eb2a5)) by [@janbuchar](https://github.com/janbuchar), closes [#524](https://github.com/apify/crawlee-python/issues/524)
- Adopt new version of curl-cffi ([#543](https://github.com/apify/crawlee-python/pull/543)) ([f6fcf48](https://github.com/apify/crawlee-python/commit/f6fcf48d99bfcb4b8e75c5c9c38dc8c265164a10)) by [@vdusek](https://github.com/vdusek)


## [0.3.6](https://github.com/apify/crawlee-python/releases/tag/v0.3.6) (2024-09-19)

### 🚀 Features

- Add HTTP/2 support for HTTPX client ([#513](https://github.com/apify/crawlee-python/pull/513)) ([0eb0a33](https://github.com/apify/crawlee-python/commit/0eb0a33411096011198e52c393f35730f1a0b6ac)) by [@vdusek](https://github.com/vdusek), closes [#512](https://github.com/apify/crawlee-python/issues/512)
- Expose extended unique key when creating a new Request ([#515](https://github.com/apify/crawlee-python/pull/515)) ([1807f41](https://github.com/apify/crawlee-python/commit/1807f419e47a815dd706d09acb0f3b3af8cfc691)) by [@vdusek](https://github.com/vdusek)
- Add header generator and integrate it into HTTPX client ([#530](https://github.com/apify/crawlee-python/pull/530)) ([b63f9f9](https://github.com/apify/crawlee-python/commit/b63f9f98c6613e095546ef544eab271d433e3379)) by [@vdusek](https://github.com/vdusek), closes [#402](https://github.com/apify/crawlee-python/issues/402)

### 🐛 Bug Fixes

- Use explicitly UTF-8 encoding in local storage ([#533](https://github.com/apify/crawlee-python/pull/533)) ([a3a0ab2](https://github.com/apify/crawlee-python/commit/a3a0ab2f6809b7a06319a77dfbf289df78638dea)) by [@vdusek](https://github.com/vdusek), closes [#532](https://github.com/apify/crawlee-python/issues/532)


## [0.3.5](https://github.com/apify/crawlee-python/releases/tag/v0.3.5) (2024-09-10)

### 🚀 Features

- Memory usage limit configuration via environment variables ([#502](https://github.com/apify/crawlee-python/pull/502)) ([c62e554](https://github.com/apify/crawlee-python/commit/c62e5545de6a1836f0514ebd3dd695e4fd856844)) by [@janbuchar](https://github.com/janbuchar)

### 🐛 Bug Fixes

- HTTP clients detect 4xx as errors by default ([#498](https://github.com/apify/crawlee-python/pull/498)) ([1895dca](https://github.com/apify/crawlee-python/commit/1895dca538f415feca37b4a030525c7c0d32f114)) by [@vdusek](https://github.com/vdusek), closes [#496](https://github.com/apify/crawlee-python/issues/496)
- Correctly handle log level configuration ([#508](https://github.com/apify/crawlee-python/pull/508)) ([7ea8fe6](https://github.com/apify/crawlee-python/commit/7ea8fe69f4a6146a1e417bebff60c08a85e2ca27)) by [@janbuchar](https://github.com/janbuchar)


## [0.3.4](https://github.com/apify/crawlee-python/releases/tag/v0.3.4) (2024-09-05)

### 🐛 Bug Fixes

- Expose basic crawling context ([#501](https://github.com/apify/crawlee-python/pull/501)) ([b484535](https://github.com/apify/crawlee-python/commit/b484535dbacc5d206a026f55a1d3e58edd375e91)) by [@vdusek](https://github.com/vdusek)


## [0.3.3](https://github.com/apify/crawlee-python/releases/tag/v0.3.3) (2024-09-05)

### 🐛 Bug Fixes

- Deduplicate requests by unique key before submitting them to the queue ([#499](https://github.com/apify/crawlee-python/pull/499)) ([6a3e0e7](https://github.com/apify/crawlee-python/commit/6a3e0e78490851c43cefb0497ce34ca52a31a25c)) by [@janbuchar](https://github.com/janbuchar)


## [0.3.2](https://github.com/apify/crawlee-python/releases/tag/v0.3.2) (2024-09-02)

### 🐛 Bug Fixes

- Double incrementation of `item_count` ([#443](https://github.com/apify/crawlee-python/pull/443)) ([cd9adf1](https://github.com/apify/crawlee-python/commit/cd9adf15731e8c4a39cb142b6d1a62909cafdc51)) by [@cadlagtrader](https://github.com/cadlagtrader), closes [#442](https://github.com/apify/crawlee-python/issues/442)
- Field alias in `BatchRequestsOperationResponse` ([#485](https://github.com/apify/crawlee-python/pull/485)) ([126a862](https://github.com/apify/crawlee-python/commit/126a8629cb5b989a0f9fe22156fb09731a34acd2)) by [@janbuchar](https://github.com/janbuchar)
- JSON handling with Parsel ([#490](https://github.com/apify/crawlee-python/pull/490)) ([ebf5755](https://github.com/apify/crawlee-python/commit/ebf575539ffb631ae131a1b801cec8f21dd0cf4c)) by [@janbuchar](https://github.com/janbuchar), closes [#488](https://github.com/apify/crawlee-python/issues/488)


## [0.3.1](https://github.com/apify/crawlee-python/releases/tag/v0.3.1) (2024-08-30)

### 🚀 Features

- Curl http client selects chrome impersonation by default ([#473](https://github.com/apify/crawlee-python/pull/473)) ([82dc939](https://github.com/apify/crawlee-python/commit/82dc93957b1a380ea975564dea5c6ba4639be548)) by [@vdusek](https://github.com/vdusek)


## [0.3.0](https://github.com/apify/crawlee-python/releases/tag/v0.3.0) (2024-08-27)

- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v03) to ensure a smooth update.

### 🚀 Features

- Implement ParselCrawler that adds support for Parsel ([#348](https://github.com/apify/crawlee-python/pull/348)) ([a3832e5](https://github.com/apify/crawlee-python/commit/a3832e527f022f32cce4a80055da3b7967b74522)) by [@asymness](https://github.com/asymness), closes [#335](https://github.com/apify/crawlee-python/issues/335)
- Add support for filling a web form ([#453](https://github.com/apify/crawlee-python/pull/453)) ([5a125b4](https://github.com/apify/crawlee-python/commit/5a125b464b2619000b92dacad4c3a7faa1869f29)) by [@vdusek](https://github.com/vdusek), closes [#305](https://github.com/apify/crawlee-python/issues/305)

### 🐛 Bug Fixes

- Remove indentation from statistics logging and print the data in tables ([#322](https://github.com/apify/crawlee-python/pull/322)) ([359b515](https://github.com/apify/crawlee-python/commit/359b515d647f064886f91441c2c01d3099e21035)) by [@TymeeK](https://github.com/TymeeK), closes [#306](https://github.com/apify/crawlee-python/issues/306)
- Remove redundant log, fix format ([#408](https://github.com/apify/crawlee-python/pull/408)) ([8d27e39](https://github.com/apify/crawlee-python/commit/8d27e3928c605d6eceb51a948453a15024fa2aa2)) by [@janbuchar](https://github.com/janbuchar)
- Dequeue items from RequestQueue in the correct order ([#411](https://github.com/apify/crawlee-python/pull/411)) ([96fc33e](https://github.com/apify/crawlee-python/commit/96fc33e2cc4631cae3c50dad9eace6407103a2a9)) by [@janbuchar](https://github.com/janbuchar)
- Support relative URLs and skip values that are not URLs ([#431](https://github.com/apify/crawlee-python/pull/431)) ([ccd8145](https://github.com/apify/crawlee-python/commit/ccd81454166ece68391cdffedb8efe9e663361d9)) by [@black7375](https://github.com/black7375), closes [#417](https://github.com/apify/crawlee-python/issues/417)
- Typo in ProlongRequestLockResponse ([#458](https://github.com/apify/crawlee-python/pull/458)) ([30ccc3a](https://github.com/apify/crawlee-python/commit/30ccc3a4763bc3706a3bbeaedc95f9648f5ba09a)) by [@janbuchar](https://github.com/janbuchar)
- Add missing `__all__` to top-level `__init__.py` file ([#463](https://github.com/apify/crawlee-python/pull/463)) ([353a1ce](https://github.com/apify/crawlee-python/commit/353a1ce28cd38c97ffb36dc1e6b0e86d3aef1a48)) by [@janbuchar](https://github.com/janbuchar)

### 🚜 Refactor

- [**breaking**] RequestQueue and service management overhaul ([#429](https://github.com/apify/crawlee-python/pull/429)) ([b155a9f](https://github.com/apify/crawlee-python/commit/b155a9f602a163e891777bef5608072fb5d0156f)) by [@janbuchar](https://github.com/janbuchar), closes [#83](https://github.com/apify/crawlee-python/issues/83), [#174](https://github.com/apify/crawlee-python/issues/174), [#203](https://github.com/apify/crawlee-python/issues/203), [#423](https://github.com/apify/crawlee-python/issues/423)
- [**breaking**] Declare private and public interface ([#456](https://github.com/apify/crawlee-python/pull/456)) ([d6738df](https://github.com/apify/crawlee-python/commit/d6738df30586934e8d1aba50b9cd437a0ea40400)) by [@vdusek](https://github.com/vdusek)


## [0.2.1](https://github.com/apify/crawlee-python/releases/tag/v0.2.1) (2024-08-05)

### 🐛 Bug Fixes

- Do not import curl impersonate in http clients init ([#396](https://github.com/apify/crawlee-python/pull/396)) ([3bb8009](https://github.com/apify/crawlee-python/commit/3bb80093e61c1615f869ecd5ab80b061e0e5db36)) by [@vdusek](https://github.com/vdusek)


## [0.2.0](https://github.com/apify/crawlee-python/releases/tag/v0.2.0) (2024-08-05)

### 🚀 Features

- Add new curl impersonate HTTP client ([#387](https://github.com/apify/crawlee-python/pull/387)) ([9c06260](https://github.com/apify/crawlee-python/commit/9c06260c0ee958522caa9322001a3186e9e43af4)) by [@vdusek](https://github.com/vdusek), closes [#292](https://github.com/apify/crawlee-python/issues/292)
- **playwright:** `infinite_scroll` helper ([#393](https://github.com/apify/crawlee-python/pull/393)) ([34f74bd](https://github.com/apify/crawlee-python/commit/34f74bdcffb42a6c876a856e1c89923d9b3e60bd)) by [@janbuchar](https://github.com/janbuchar)


## [0.1.2](https://github.com/apify/crawlee-python/releases/tag/v0.1.2) (2024-07-30)

### 🚀 Features

- Add URL validation ([#343](https://github.com/apify/crawlee-python/pull/343)) ([1514538](https://github.com/apify/crawlee-python/commit/15145388009c85ab54dc72ea8f2d07efd78f80fd)) by [@vdusek](https://github.com/vdusek), closes [#300](https://github.com/apify/crawlee-python/issues/300)

### 🐛 Bug Fixes

- Minor log fix ([#341](https://github.com/apify/crawlee-python/pull/341)) ([0688bf1](https://github.com/apify/crawlee-python/commit/0688bf1860534ab6b2a85dc850bf3d56507ab154)) by [@souravjain540](https://github.com/souravjain540)
- Also use error_handler for context pipeline errors ([#331](https://github.com/apify/crawlee-python/pull/331)) ([7a66445](https://github.com/apify/crawlee-python/commit/7a664456b45c7e429b4c90aaf1c09d5796b93e3d)) by [@janbuchar](https://github.com/janbuchar), closes [#296](https://github.com/apify/crawlee-python/issues/296)
- Strip whitespace from href in enqueue_links ([#346](https://github.com/apify/crawlee-python/pull/346)) ([8a3174a](https://github.com/apify/crawlee-python/commit/8a3174aed24f9eb4f9ac415a79a58685a081cde2)) by [@janbuchar](https://github.com/janbuchar), closes [#337](https://github.com/apify/crawlee-python/issues/337)
- Warn instead of crashing when an empty dataset is being exported ([#342](https://github.com/apify/crawlee-python/pull/342)) ([22b95d1](https://github.com/apify/crawlee-python/commit/22b95d1948d4acd23a010898fa6af2f491e7f514)) by [@janbuchar](https://github.com/janbuchar), closes [#334](https://github.com/apify/crawlee-python/issues/334)
- Avoid GitHub rate limiting in project bootstrapping test ([#364](https://github.com/apify/crawlee-python/pull/364)) ([992f07f](https://github.com/apify/crawlee-python/commit/992f07f266f7b8433d99e9a179f277995f81eb17)) by [@janbuchar](https://github.com/janbuchar)
- Pass crawler configuration to storages ([#375](https://github.com/apify/crawlee-python/pull/375)) ([b2d3a52](https://github.com/apify/crawlee-python/commit/b2d3a52712abe21f4a4a5db4e20c80afe72c27de)) by [@janbuchar](https://github.com/janbuchar)
- Purge request queue on repeated crawler runs ([#377](https://github.com/apify/crawlee-python/pull/377)) ([7ad3d69](https://github.com/apify/crawlee-python/commit/7ad3d6908e153c590bff72478af7ee3239a249bc)) by [@janbuchar](https://github.com/janbuchar), closes [#152](https://github.com/apify/crawlee-python/issues/152)


## [0.1.1](https://github.com/apify/crawlee-python/releases/tag/v0.1.1) (2024-07-19)

### 🚀 Features

- Expose crawler log ([#316](https://github.com/apify/crawlee-python/pull/316)) ([ae475fa](https://github.com/apify/crawlee-python/commit/ae475fa450c4fe053620d7b7eb475f3d58804674)) by [@vdusek](https://github.com/vdusek), closes [#303](https://github.com/apify/crawlee-python/issues/303)
- Integrate proxies into `PlaywrightCrawler` ([#325](https://github.com/apify/crawlee-python/pull/325)) ([2e072b6](https://github.com/apify/crawlee-python/commit/2e072b6ad7d5d82d96a7b489cafb87e7bfaf6e83)) by [@vdusek](https://github.com/vdusek)
- Blocking detection for playwright crawler ([#328](https://github.com/apify/crawlee-python/pull/328)) ([49ff6e2](https://github.com/apify/crawlee-python/commit/49ff6e25c12a97550eee718d64bb4130f9990189)) by [@vdusek](https://github.com/vdusek), closes [#239](https://github.com/apify/crawlee-python/issues/239)

### 🐛 Bug Fixes

- Pylance reportPrivateImportUsage errors ([#313](https://github.com/apify/crawlee-python/pull/313)) ([09d7203](https://github.com/apify/crawlee-python/commit/09d72034d5db8c47f461111ec093761935a3e2ef)) by [@vdusek](https://github.com/vdusek), closes [#283](https://github.com/apify/crawlee-python/issues/283)
- Set httpx logging to warning ([#314](https://github.com/apify/crawlee-python/pull/314)) ([1585def](https://github.com/apify/crawlee-python/commit/1585defffb2c0c844fab39bbc0e0b793d6169cbf)) by [@vdusek](https://github.com/vdusek), closes [#302](https://github.com/apify/crawlee-python/issues/302)
- Byte size serialization in MemoryInfo ([#245](https://github.com/apify/crawlee-python/pull/245)) ([a030174](https://github.com/apify/crawlee-python/commit/a0301746c2df076d281708344fb906e1c42e0790)) by [@janbuchar](https://github.com/janbuchar)
- Project bootstrapping in existing folder ([#318](https://github.com/apify/crawlee-python/pull/318)) ([c630818](https://github.com/apify/crawlee-python/commit/c630818538e0c37217ab73f6c6da05505ed8b364)) by [@janbuchar](https://github.com/janbuchar), closes [#301](https://github.com/apify/crawlee-python/issues/301)


## [0.1.0](https://github.com/apify/crawlee-python/releases/tag/v0.1.0) (2024-07-08)

### 🚀 Features

- Project templates ([#237](https://github.com/apify/crawlee-python/pull/237)) ([c23c12c](https://github.com/apify/crawlee-python/commit/c23c12c66688f825f74deb39702f07cc6c6bbc46)) by [@janbuchar](https://github.com/janbuchar), closes [#215](https://github.com/apify/crawlee-python/issues/215)

### 🐛 Bug Fixes

- CLI UX improvements ([#271](https://github.com/apify/crawlee-python/pull/271)) ([123d515](https://github.com/apify/crawlee-python/commit/123d515b224c663577bfe0fab387d0aa11e5e4d4)) by [@janbuchar](https://github.com/janbuchar), closes [#267](https://github.com/apify/crawlee-python/issues/267)
- Error handling in CLI and templates documentation ([#273](https://github.com/apify/crawlee-python/pull/273)) ([61083c3](https://github.com/apify/crawlee-python/commit/61083c33434d431a118538f15bfa9a68c312ab03)) by [@vdusek](https://github.com/vdusek), closes [#268](https://github.com/apify/crawlee-python/issues/268)


## [0.0.7](https://github.com/apify/crawlee-python/releases/tag/v0.0.7) (2024-06-27)

### 🐛 Bug Fixes

- Do not wait for consistency in request queue ([#235](https://github.com/apify/crawlee-python/pull/235)) ([03ff138](https://github.com/apify/crawlee-python/commit/03ff138aadaf8e915abc7fafb854fe12947b9696)) by [@vdusek](https://github.com/vdusek)
- Selector handling in BeautifulSoupCrawler enqueue_links ([#231](https://github.com/apify/crawlee-python/pull/231)) ([896501e](https://github.com/apify/crawlee-python/commit/896501edb44f801409fec95cb3e5f2bcfcb4188d)) by [@janbuchar](https://github.com/janbuchar), closes [#230](https://github.com/apify/crawlee-python/issues/230)
- Handle blocked request ([#234](https://github.com/apify/crawlee-python/pull/234)) ([f8ef79f](https://github.com/apify/crawlee-python/commit/f8ef79ffcb7410713182af716d37dbbaad66fdbc)) by [@Mantisus](https://github.com/Mantisus)
- Improve AutoscaledPool state management ([#241](https://github.com/apify/crawlee-python/pull/241)) ([fdea3d1](https://github.com/apify/crawlee-python/commit/fdea3d16b13afe70039d864de861486c760aa0ba)) by [@janbuchar](https://github.com/janbuchar), closes [#236](https://github.com/apify/crawlee-python/issues/236)


## [0.0.6](https://github.com/apify/crawlee-python/releases/tag/v0.0.6) (2024-06-25)

### 🚀 Features

- Maintain a global configuration instance ([#207](https://github.com/apify/crawlee-python/pull/207)) ([e003aa6](https://github.com/apify/crawlee-python/commit/e003aa63d859bec8199d0c890b5c9604f163ccd3)) by [@janbuchar](https://github.com/janbuchar)
- Add max requests per crawl to `BasicCrawler` ([#198](https://github.com/apify/crawlee-python/pull/198)) ([b5b3053](https://github.com/apify/crawlee-python/commit/b5b3053f43381601274e4034d07b4bf41720c7c2)) by [@vdusek](https://github.com/vdusek)
- Add support decompress *br* response content ([#226](https://github.com/apify/crawlee-python/pull/226)) ([a3547b9](https://github.com/apify/crawlee-python/commit/a3547b9c882dc5333a4fcd1223687ef85e79138d)) by [@Mantisus](https://github.com/Mantisus)
- BasicCrawler.export_data helper ([#222](https://github.com/apify/crawlee-python/pull/222)) ([237ec78](https://github.com/apify/crawlee-python/commit/237ec789b7dccc17cc57ef47ec56bcf73c6ca006)) by [@janbuchar](https://github.com/janbuchar), closes [#211](https://github.com/apify/crawlee-python/issues/211)
- Automatic logging setup ([#229](https://github.com/apify/crawlee-python/pull/229)) ([a67b72f](https://github.com/apify/crawlee-python/commit/a67b72faacd75674071bae496d59e1c60636350c)) by [@janbuchar](https://github.com/janbuchar), closes [#214](https://github.com/apify/crawlee-python/issues/214)

### 🐛 Bug Fixes

- Handling of relative URLs in add_requests ([#213](https://github.com/apify/crawlee-python/pull/213)) ([8aa8c57](https://github.com/apify/crawlee-python/commit/8aa8c57f44149caa0e01950a5d773726f261699a)) by [@janbuchar](https://github.com/janbuchar), closes [#202](https://github.com/apify/crawlee-python/issues/202), [#204](https://github.com/apify/crawlee-python/issues/204)
- Graceful exit in BasicCrawler.run ([#224](https://github.com/apify/crawlee-python/pull/224)) ([337286e](https://github.com/apify/crawlee-python/commit/337286e1b721cf61f57bc0ff3ead08df1f4f5448)) by [@janbuchar](https://github.com/janbuchar), closes [#212](https://github.com/apify/crawlee-python/issues/212)


## [0.0.5](https://github.com/apify/crawlee-python/releases/tag/v0.0.5) (2024-06-21)

### 🚀 Features

- Browser rotation and better browser abstraction ([#177](https://github.com/apify/crawlee-python/pull/177)) ([a42ae6f](https://github.com/apify/crawlee-python/commit/a42ae6f53c5e24678f04011c3684290b68684016)) by [@vdusek](https://github.com/vdusek), closes [#131](https://github.com/apify/crawlee-python/issues/131)
- Add emit persist state event to event manager ([#181](https://github.com/apify/crawlee-python/pull/181)) ([97f6c68](https://github.com/apify/crawlee-python/commit/97f6c68275b65f76c62b6d16d94354fc7f00d336)) by [@vdusek](https://github.com/vdusek)
- Batched request addition in RequestQueue ([#186](https://github.com/apify/crawlee-python/pull/186)) ([f48c806](https://github.com/apify/crawlee-python/commit/f48c8068fe16ce3dd4c46fc248733346c0621411)) by [@vdusek](https://github.com/vdusek)
- Add storage helpers to crawler & context ([#192](https://github.com/apify/crawlee-python/pull/192)) ([f8f4066](https://github.com/apify/crawlee-python/commit/f8f4066d8b32d6e7dc0d999a5aa8db75f99b43b8)) by [@vdusek](https://github.com/vdusek), closes [#98](https://github.com/apify/crawlee-python/issues/98), [#100](https://github.com/apify/crawlee-python/issues/100), [#172](https://github.com/apify/crawlee-python/issues/172)
- Handle all supported configuration options ([#199](https://github.com/apify/crawlee-python/pull/199)) ([23c901c](https://github.com/apify/crawlee-python/commit/23c901cd68cf14b4041ee03568622ee32822e94b)) by [@janbuchar](https://github.com/janbuchar), closes [#84](https://github.com/apify/crawlee-python/issues/84)
- Add Playwright's enqueue links helper ([#196](https://github.com/apify/crawlee-python/pull/196)) ([849d73c](https://github.com/apify/crawlee-python/commit/849d73cc7d137171b98f9f2ab85374e8beec0dad)) by [@vdusek](https://github.com/vdusek)

### 🐛 Bug Fixes

- Tmp path in tests is working ([#164](https://github.com/apify/crawlee-python/pull/164)) ([382b6f4](https://github.com/apify/crawlee-python/commit/382b6f48174bdac3931cc379eaf770ab06f826dc)) by [@vdusek](https://github.com/vdusek), closes [#159](https://github.com/apify/crawlee-python/issues/159)
- Add explicit err msgs for missing pckg extras during import ([#165](https://github.com/apify/crawlee-python/pull/165)) ([200ebfa](https://github.com/apify/crawlee-python/commit/200ebfa63d6e20e17c8ca29544ef7229ed0df308)) by [@vdusek](https://github.com/vdusek), closes [#155](https://github.com/apify/crawlee-python/issues/155)
- Make timedelta_ms accept string-encoded numbers ([#190](https://github.com/apify/crawlee-python/pull/190)) ([d8426ff](https://github.com/apify/crawlee-python/commit/d8426ff41e36f701af459ad17552fee39637674d)) by [@janbuchar](https://github.com/janbuchar)
- **deps:** Update dependency psutil to v6 ([#193](https://github.com/apify/crawlee-python/pull/193)) ([eb91f51](https://github.com/apify/crawlee-python/commit/eb91f51e19da406e3f9293e5336c1f85fc7885a4)) by [@renovate[bot]](https://github.com/renovate[bot])
- Improve compatibility between ProxyConfiguration and its SDK counterpart ([#201](https://github.com/apify/crawlee-python/pull/201)) ([1a76124](https://github.com/apify/crawlee-python/commit/1a76124080d561e0153a4dda0bdb0d9863c3aab6)) by [@janbuchar](https://github.com/janbuchar)
- Correct return type of storage get_info methods ([#200](https://github.com/apify/crawlee-python/pull/200)) ([332673c](https://github.com/apify/crawlee-python/commit/332673c4fb519b80846df7fb8cd8bb521538a8a4)) by [@janbuchar](https://github.com/janbuchar)
- Type error in statistics persist state ([#206](https://github.com/apify/crawlee-python/pull/206)) ([96ceef6](https://github.com/apify/crawlee-python/commit/96ceef697769cd57bd1a50b6615cf1e70549bd2d)) by [@vdusek](https://github.com/vdusek), closes [#194](https://github.com/apify/crawlee-python/issues/194)


## [0.0.4](https://github.com/apify/crawlee-python/releases/tag/v0.0.4) (2024-05-30)

### 🚀 Features

- Capture statistics about the crawler run ([#142](https://github.com/apify/crawlee-python/pull/142)) ([eeebe9b](https://github.com/apify/crawlee-python/commit/eeebe9b1e24338d68a0a55228bbfc717f4d9d295)) by [@janbuchar](https://github.com/janbuchar), closes [#97](https://github.com/apify/crawlee-python/issues/97)
- Proxy configuration ([#156](https://github.com/apify/crawlee-python/pull/156)) ([5c3753a](https://github.com/apify/crawlee-python/commit/5c3753a5527b1d01f7260b9e4c566e43f956a5e8)) by [@janbuchar](https://github.com/janbuchar), closes [#136](https://github.com/apify/crawlee-python/issues/136)
- Add first version of browser pool and playwright crawler ([#161](https://github.com/apify/crawlee-python/pull/161)) ([2d2a050](https://github.com/apify/crawlee-python/commit/2d2a0505b1c2b1529a8835163ca97d1ec2a6e44a)) by [@vdusek](https://github.com/vdusek)


## [0.0.3](https://github.com/apify/crawlee-python/releases/tag/v0.0.3) (2024-05-13)

### 🚀 Features

- AutoscaledPool implementation ([#55](https://github.com/apify/crawlee-python/pull/55)) ([621ada2](https://github.com/apify/crawlee-python/commit/621ada2bd1ba4e2346fb948dc02686e2b37e3856)) by [@janbuchar](https://github.com/janbuchar), closes [#19](https://github.com/apify/crawlee-python/issues/19)
- Add Snapshotter ([#20](https://github.com/apify/crawlee-python/pull/20)) ([492ee38](https://github.com/apify/crawlee-python/commit/492ee38c893b8f54e9583dd492576c5106e29881)) by [@vdusek](https://github.com/vdusek)
- Implement BasicCrawler ([#56](https://github.com/apify/crawlee-python/pull/56)) ([6da971f](https://github.com/apify/crawlee-python/commit/6da971fcddbf8b6795346c88e295dada28e7b1d3)) by [@janbuchar](https://github.com/janbuchar), closes [#30](https://github.com/apify/crawlee-python/issues/30)
- BeautifulSoupCrawler ([#107](https://github.com/apify/crawlee-python/pull/107)) ([4974dfa](https://github.com/apify/crawlee-python/commit/4974dfa20c7911ee073438fd388e60ba4b2c07db)) by [@janbuchar](https://github.com/janbuchar), closes [#31](https://github.com/apify/crawlee-python/issues/31)
- Add_requests and enqueue_links context helpers ([#120](https://github.com/apify/crawlee-python/pull/120)) ([dc850a5](https://github.com/apify/crawlee-python/commit/dc850a5778b105ff09e19eaecbb0a12d94798a62)) by [@janbuchar](https://github.com/janbuchar), closes [#5](https://github.com/apify/crawlee-python/issues/5)
- Use SessionPool in BasicCrawler ([#128](https://github.com/apify/crawlee-python/pull/128)) ([9fc4648](https://github.com/apify/crawlee-python/commit/9fc464837e596b3b5a7cd818b6d617550e249352)) by [@janbuchar](https://github.com/janbuchar), closes [#110](https://github.com/apify/crawlee-python/issues/110)
- Add base storage client and resource subclients ([#138](https://github.com/apify/crawlee-python/pull/138)) ([44d6597](https://github.com/apify/crawlee-python/commit/44d65974e4837576918069d7e63f8b804964971a)) by [@vdusek](https://github.com/vdusek)

### 🐛 Bug Fixes

- **deps:** Update dependency docutils to ^0.21.0 ([#101](https://github.com/apify/crawlee-python/pull/101)) ([534b613](https://github.com/apify/crawlee-python/commit/534b613f7cdfe7adf38b548ee48537db3167d1ec)) by [@renovate[bot]](https://github.com/renovate[bot])
- **deps:** Update dependency eval-type-backport to ^0.2.0 ([#124](https://github.com/apify/crawlee-python/pull/124)) ([c9e69a8](https://github.com/apify/crawlee-python/commit/c9e69a8534f4d82d9a6314947d76a86bcb744607)) by [@renovate[bot]](https://github.com/renovate[bot])
- Fire local SystemInfo events every second ([#144](https://github.com/apify/crawlee-python/pull/144)) ([f1359fa](https://github.com/apify/crawlee-python/commit/f1359fa7eea23f8153ad711287c073e45d498401)) by [@vdusek](https://github.com/vdusek)
- Storage manager & purging the defaults ([#150](https://github.com/apify/crawlee-python/pull/150)) ([851042f](https://github.com/apify/crawlee-python/commit/851042f25ad07e25651768e476f098ef0ed21914)) by [@vdusek](https://github.com/vdusek)


<!-- generated by git-cliff -->

================================================
FILE: CONTRIBUTING.md
================================================
# Development

Here you'll find a contributing guide to get started with development.

## Environment

For local development, it is required to have Python 3.10 (or a later version) installed.

We use [uv](https://docs.astral.sh/uv/) for project management. Install it and set up your IDE accordingly.

We use [Poe the Poet](https://poethepoet.natn.io/) as a task runner, similar to npm scripts in `package.json`.
All tasks are defined in `pyproject.toml` under `[tool.poe.tasks]` and can be run with `uv run poe <task>`.

### Available tasks

| Task | Description |
| ---- | ----------- |
| `install-dev` | Install development dependencies |
| `check-code` | Run lint, type-check, and unit-tests |
| `lint` | Run linter |
| `format` | Fix lint issues and format code |
| `type-check` | Run type checker |
| `unit-tests` | Run unit tests |
| `unit-tests-cov` | Run unit tests with coverage |
| `e2e-templates-tests` | Run end-to-end template tests |
| `build-docs` | Build documentation website |
| `run-docs` | Run documentation website locally |
| `build` | Build package |
| `clean` | Remove build artifacts and clean caches |

## Dependencies

To install this package and its development dependencies, run:

```sh
uv run poe install-dev
```

## Code checking

To execute all code checking tools together, run:

```sh
uv run poe check-code
```

### Linting

We utilize [ruff](https://docs.astral.sh/ruff/) for linting, which analyzes code for potential issues and enforces consistent style. Refer to `pyproject.toml` for configuration details.

To run linting:

```sh
uv run poe lint
```

### Formatting

Our automated code formatting also leverages [ruff](https://docs.astral.sh/ruff/), ensuring uniform style and addressing fixable linting issues. Configuration specifics are outlined in `pyproject.toml`.

To run formatting:

```sh
uv run poe format
```

### Type checking

Type checking is handled by [ty](https://docs.astral.sh/ty/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`.

To run type checking:

```sh
uv run poe type-check
```

### Unit tests

We use [pytest](https://docs.pytest.org/) as a testing framework with many plugins. Check `pyproject.toml` for configuration details and installed plugins.

To run unit tests:

```sh
uv run poe unit-tests
```

To run unit tests with coverage report:

```sh
uv run poe unit-tests-cov
```

## End-to-end tests

Prerequisites:

- [apify-cli](https://docs.apify.com/cli/docs/installation) installed and available in `PATH`
- Set `APIFY_TEST_USER_API_TOKEN` to your [Apify API token](https://docs.apify.com/platform/integrations/api#api-token)

To run end-to-end tests:

```sh
uv run poe e2e-templates-tests
```

## Documentation

We follow the [Google docstring format](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) for code documentation. All user-facing classes and functions must be documented. Documentation standards are enforced using [Ruff](https://docs.astral.sh/ruff/).

Our API documentation is generated from these docstrings using [pydoc-markdown](https://pypi.org/project/pydoc-markdown/) with custom post-processing. Additional content is provided through markdown files in the `docs/` directory. The final documentation is rendered using [Docusaurus](https://docusaurus.io/) and published to GitHub Pages.

To run the documentation locally, ensure you have `Node.js` 20+ installed, then run:

```sh
uv run poe run-docs
```

## Commits

We use [Conventional Commits](https://www.conventionalcommits.org/) format for commit messages. This convention is used to automatically determine version bumps during the release process.

### Available commit types

| Type | Description |
| ---- | ----------- |
| `feat` | A new feature |
| `fix` | A bug fix |
| `docs` | Documentation only changes |
| `style` | Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc) |
| `refactor` | A code change that neither fixes a bug nor adds a feature |
| `perf` | A code change that improves performance |
| `test` | Adding missing tests or correcting existing tests |
| `build` | Changes that affect the build system or external dependencies (example scopes: gulp, broccoli, npm) |
| `ci` | Changes to our CI configuration files and scripts (example scopes: Travis, Circle, BrowserStack, SauceLabs) |
| `chore` | Other changes that don't modify src or test files |
| `revert` | Reverts a previous commit |

## Release process

Publishing new versions to [PyPI](https://pypi.org/project/crawlee) is automated through GitHub Actions.

- **Beta releases**: On each commit to the master branch, a new beta release is automatically published. The version number is determined based on the latest release and conventional commits. The beta version suffix is incremented by 1 from the last beta release on PyPI.
- **Stable releases**: A stable version release may be created by triggering the `release` GitHub Actions workflow. The version number is determined based on the latest release and conventional commits (`auto` release type), or it may be overridden using the `custom` release type.

### Publishing to PyPI manually

1. **Do not do this unless absolutely necessary.** In all conceivable scenarios, you should use the `release` workflow instead.
2. **Make sure you know what you're doing.**

3. Update the version number:

- Modify the `version` field under `project` in `pyproject.toml`.

```toml
[project]
name = "crawlee"
version = "x.y.z"
```

4. Build the package:

```sh
uv run poe build
```

5. Upload to PyPI:

```sh
uv publish --token YOUR_API_TOKEN
```


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2023 Apify Technologies s.r.o.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
<h1 align="center">
    <a href="https://crawlee.dev">
        <picture>
          <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/apify/crawlee-python/master/website/static/img/crawlee-dark.svg?sanitize=true">
          <img alt="Crawlee" src="https://raw.githubusercontent.com/apify/crawlee-python/master/website/static/img/crawlee-light.svg?sanitize=true" width="500">
        </picture>
    </a>
    <br>
    <small>A web scraping and browser automation library</small>
</h1>

<p align="center">
    <a href="https://trendshift.io/repositories/11169" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11169" alt="apify%2Fcrawlee-python | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>

<p align="center">
  <a href="https://badge.fury.io/py/crawlee" rel="nofollow"><img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI package version"></a>
  <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI package downloads"></a>
  <a href="https://codecov.io/gh/apify/crawlee-python"><img src="https://codecov.io/gh/apify/crawlee-python/graph/badge.svg?token=cCju61iPQG" alt="Codecov report"></a>
  <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI Python version"></a>
  <a href="https://discord.gg/jyEM2PRvMU" rel="nofollow"><img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on Discord"></a>
</p>

Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**

Your crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it.

> 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev/python/)** 👈

We also have a TypeScript implementation of Crawlee, which you can explore and utilize for your projects. Visit our GitHub repository for more information: [Crawlee for JS/TS on GitHub](https://github.com/apify/crawlee).

## Installation

We recommend visiting the [Introduction tutorial](https://crawlee.dev/python/docs/introduction) in Crawlee documentation for more information.

Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal.

To install Crawlee with all features, run the following command:

```sh
python -m pip install 'crawlee[all]'
```

Then, install the [Playwright](https://playwright.dev/) dependencies:

```sh
playwright install
```

Verify that Crawlee is successfully installed:

```sh
python -c 'import crawlee; print(crawlee.__version__)'
```

For detailed installation instructions, see the [Setting up](https://crawlee.dev/python/docs/introduction/setting-up) documentation page.

### With Crawlee CLI

The quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared templates. First, ensure you have [uv](https://pypi.org/project/uv/) installed:

```sh
uv --help
```

If [uv](https://pypi.org/project/uv/) is not installed, follow the official [installation guide](https://docs.astral.sh/uv/getting-started/installation/).

Then, run the CLI and choose from the available templates:

```sh
uvx 'crawlee[cli]' create my-crawler
```

If you already have `crawlee` installed, you can spin it up by running:

```sh
crawlee create my-crawler
```

## Examples

Here are some practical examples to help you get started with different types of crawlers in Crawlee. Each example demonstrates how to set up and run a crawler for specific use cases, whether you need to handle simple HTML pages or interact with JavaScript-heavy sites. A crawler run will create a `storage/` directory in your current working directory.

### BeautifulSoupCrawler

The [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) downloads web pages using an HTTP library and provides HTML-parsed content to the user. By default, it uses [`HttpxHttpClient`](https://crawlee.dev/python/api/class/HttpxHttpClient) for HTTP communication and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) for parsing HTML. It is ideal for projects that require efficient extraction of data from HTML content. This crawler has very good performance since it does not use a browser. However, if you need to execute client-side JavaScript to get your content, this is not going to be enough and you will need to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler). Also, if you want to use this crawler, make sure you install `crawlee` with the `beautifulsoup` extra.

```python
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl https://crawlee.dev with `BeautifulSoupCrawler`, saving each page's URL and title."""
    # Cap the crawl at 10 requests. Remove or raise the limit to crawl all links.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    # Register the default request handler; it is called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Read the page title; pages without a <title> tag yield None.
        title_tag = context.soup.title
        page_title = title_tag.string if title_tag else None

        # Save the extracted record to the default dataset.
        await context.push_data({'url': context.request.url, 'title': page_title})

        # Add every link found on the page to the queue.
        await context.enqueue_links()

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())
```

### PlaywrightCrawler

The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) uses a headless browser to download web pages and provides an API for data extraction. It is built on [Playwright](https://playwright.dev/), an automation library designed for managing headless browsers. It excels at retrieving web pages that rely on client-side JavaScript for content generation, or tasks requiring interaction with JavaScript-driven content. For scenarios where JavaScript execution is unnecessary or higher performance is required, consider using the [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler). Also, if you want to use this crawler, make sure you install `crawlee` with the `playwright` extra.

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl https://crawlee.dev with `PlaywrightCrawler`, saving each page's URL and title."""
    # Cap the crawl at 10 requests. Remove or raise the limit to crawl all links.
    crawler = PlaywrightCrawler(max_requests_per_crawl=10)

    # Register the default request handler; it is called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Ask the browser page for its title.
        page_title = await context.page.title()

        # Save the extracted record to the default dataset.
        await context.push_data({'url': context.request.url, 'title': page_title})

        # Add every link found on the page to the queue.
        await context.enqueue_links()

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())
```

### More examples

Explore our [Examples](https://crawlee.dev/python/docs/examples) page in the Crawlee documentation for a wide range of additional use cases and demonstrations.

## Features

Why is Crawlee the preferred choice for web scraping and crawling?

### Why use Crawlee instead of just a random HTTP library with an HTML parser?

- Unified interface for **HTTP & headless browser** crawling.
- Automatic **parallel crawling** based on available system resources.
- Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking).
- Automatic **retries** on errors or when you’re getting blocked.
- Integrated **proxy rotation** and session management.
- Configurable **request routing** - direct URLs to the appropriate handlers.
- Persistent **queue for URLs** to crawl.
- Pluggable **storage** of both tabular data and files.
- Robust **error handling**.

### Why use Crawlee rather than Scrapy?

- **Asyncio-based** – Leveraging the standard [Asyncio](https://docs.python.org/3/library/asyncio.html) library, Crawlee delivers better performance and seamless compatibility with other modern asynchronous libraries.
- **Type hints** – Newer project built with modern Python, and complete type hint coverage for a better developer experience.
- **Simple integration** – Crawlee crawlers are regular Python scripts, requiring no additional launcher executor. This flexibility allows you to integrate a crawler directly into other applications.
- **State persistence** – Supports state persistence during interruptions, saving time and costs by avoiding the need to restart scraping pipelines from scratch after an issue.
- **Organized data storages** – Allows saving of multiple types of results in a single scraping run. Offers several storing options (see [datasets](https://crawlee.dev/python/api/class/Dataset) & [key-value stores](https://crawlee.dev/python/api/class/KeyValueStore)).

## Running on the Apify platform

Crawlee is open-source and runs anywhere, but since it's developed by [Apify](https://apify.com), it's easy to set up on the Apify platform and run in the cloud. Visit the [Apify SDK website](https://docs.apify.com/sdk/python/) to learn more about deploying Crawlee to the Apify platform.

## Support

If you find any bug or issue with Crawlee, please [submit an issue on GitHub](https://github.com/apify/crawlee-python/issues). For questions, you can ask on [Stack Overflow](https://stackoverflow.com/questions/tagged/apify), in GitHub Discussions or you can join our [Discord server](https://discord.com/invite/jyEM2PRvMU).

## Contributing

Your code contributions are welcome, and you'll be praised for eternity! If you have any ideas for improvements, either submit an issue or create a pull request. For contribution guidelines and the code of conduct, see [CONTRIBUTING.md](https://github.com/apify/crawlee-python/blob/master/CONTRIBUTING.md).

## License

This project is licensed under the Apache License 2.0 - see the [LICENSE](https://github.com/apify/crawlee-python/blob/master/LICENSE) file for details.


================================================
FILE: codecov.yaml
================================================
coverage:
  status:
    project:
      default:
        target: auto
        threshold: 0.10%   # tolerate up to 0.10% decrease
        informational: true # CI check reports status but never fails
    patch:
      default:
        target: 50%         # desired minimum patch coverage (reported only, see `informational`)
        informational: true  # CI check reports status but never fails


================================================
FILE: docs/deployment/apify_platform.mdx
================================================
---
id: apify-platform
title: Apify platform
description: Apify platform - large-scale and high-performance web scraping
---

import ApiLink from '@site/src/components/ApiLink';

import CodeBlock from '@theme/CodeBlock';

import LogWithConfigExample from '!!raw-loader!./code_examples/apify/log_with_config_example.py';
import CrawlerAsActorExample from '!!raw-loader!./code_examples/apify/crawler_as_actor_example.py';
import ProxyExample from '!!raw-loader!./code_examples/apify/proxy_example.py';
import ProxyAdvancedExample from '!!raw-loader!./code_examples/apify/proxy_advanced_example.py';

Apify is a [platform](https://apify.com) built to serve large-scale and high-performance web scraping and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), convenient request and result storages, [proxies](../guides/proxy-management), scheduling, webhooks and [more](https://docs.apify.com/), accessible through a [web interface](https://console.apify.com) or an [API](https://docs.apify.com/api).

While we think that the Apify platform is super cool, and it's definitely worth signing up for a [free account](https://console.apify.com/sign-up), **Crawlee is and will always be open source**, runnable locally or on any cloud infrastructure.

:::note

We do not test Crawlee in other cloud environments such as Lambda or on specific architectures such as Raspberry PI. We strive to make it work, but there are no guarantees.

:::

## Requirements

To run your Crawlee code on Apify platform, you need an Apify account. If you don't have one yet, you can sign up [here](https://console.apify.com/sign-up).

Additionally, you must have the [Apify CLI](https://docs.apify.com/cli/) installed on your computer. For installation instructions, refer to the [Installation guide](https://docs.apify.com/cli/docs/installation).

Finally, ensure that the [Apify SDK](https://docs.apify.com/sdk/python/) is installed in your project. You can install it using `pip`:

```bash
pip install apify
```

## Logging into Apify platform from Crawlee

To access your [Apify account](https://console.apify.com/sign-up) from Crawlee, you must provide credentials - your [API token](https://console.apify.com/account?tab=integrations). You can do that either by utilizing [Apify CLI](https://docs.apify.com/cli/) or with environment variables.

Once you provide credentials to your Apify CLI installation, you will be able to use all the Apify platform features, such as calling Actors, saving to cloud storages, using Apify proxies, setting up webhooks and so on.

### Log in with CLI

Apify CLI allows you to log in to your Apify account on your computer. If you then run your crawler using the CLI, your credentials will automatically be added.

```bash
npm install -g apify-cli
apify login -t YOUR_API_TOKEN
```

### Log in with environment variables

Alternatively, you can always provide credentials to your Actor by setting the [`APIFY_TOKEN`](#apify_token) environment variable to your API token.

> There's also the [`APIFY_PROXY_PASSWORD`](#apify_proxy_password)
> environment variable. Actor automatically infers that from your token, but it can be useful
> when you need to access proxies from a different account than your token represents.

### Log in with Configuration

Another option is to use the [`Configuration`](https://docs.apify.com/sdk/python/reference/class/Configuration) instance and set your API token there.

<CodeBlock className="language-python">
    {LogWithConfigExample}
</CodeBlock>

## What is an Actor

When you deploy your script to the Apify platform, it becomes an [Actor](https://apify.com/actors). An Actor is a serverless microservice that accepts an input and produces an output. It can run for a few seconds, hours or even infinitely. An Actor can perform anything from a simple action such as filling out a web form or sending an email, to complex operations such as crawling an entire website and removing duplicates from a large dataset.

Actors can be shared in the [Apify Store](https://apify.com/store) so that other people can use them. But don't worry, if you share your Actor in the store and somebody uses it, it runs under their account, not yours.

**Related links**

- [Store of existing Actors](https://apify.com/store)
- [Documentation](https://docs.apify.com/actors)
- [View Actors in Apify Console](https://console.apify.com/actors)
- [API reference](https://apify.com/docs/api/v2#/reference/actors)

## Running an Actor locally

First let's create a boilerplate of the new Actor. You could use Apify CLI and just run:

```bash
apify create my-hello-world
```

The CLI will prompt you to select a project boilerplate template - let's pick "Crawlee + BeautifulSoup". The tool will create a directory called `my-hello-world` with Python project files. You can run the Actor as follows:

```bash
cd my-hello-world
apify run
```

## Running Crawlee code as an Actor

For running Crawlee code as an Actor on [Apify platform](https://apify.com/actors) you need to wrap the body of the main function of your crawler with `async with Actor`.

:::info NOTE
Adding `async with Actor` is the only important thing needed to run it on Apify platform as an Actor. It is needed to initialize your Actor (e.g. to set the correct storage implementation) and to correctly handle exiting the process.
:::

Let's look at the `BeautifulSoupCrawler` example from the [Quick start](../quick-start) guide:

<CodeBlock className="language-python">
    {CrawlerAsActorExample}
</CodeBlock>

Note that you could also run your Actor (that is using Crawlee) locally with Apify CLI. You could start it via the following command in your project folder:

```bash
apify run
```

## Deploying an Actor to Apify platform

Now (assuming you are already logged in to your Apify account) you can easily deploy your code to the Apify platform by running:

```bash
apify push
```

Your script will be uploaded to and built on the Apify platform so that it can be run there. For more information, view the
[Apify Actor](https://docs.apify.com/cli) documentation.

## Usage on Apify platform

You can also develop your Actor in an online code editor directly on the platform (you'll need an Apify Account). Let's go to the [Actors](https://console.apify.com/actors) page in the app, click *Create new* and then go to the *Source* tab and start writing the code or paste one of the examples from the [Examples](../examples) section.

## Storages

There are several things worth mentioning here.

### Helper functions for default Key-Value Store and Dataset

To simplify access to the _default_ storages, instead of using the helper functions of respective storage classes, you could use:
- [`Actor.set_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#set_value), [`Actor.get_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_value), [`Actor.get_input()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_input) for [`Key-Value Store`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore)
- [`Actor.push_data()`](https://docs.apify.com/sdk/python/reference/class/Actor#push_data) for [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)

### Using platform storage in a local Actor

When you plan to use the platform storage while developing and running your Actor locally, you should use [`Actor.open_key_value_store()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_key_value_store), [`Actor.open_dataset()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_dataset) and [`Actor.open_request_queue()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_request_queue) to open the respective storage.

Each of these methods accepts the `force_cloud` keyword argument. If set to `True`, cloud storage will be used instead of the folder on the local disk.

:::note
If you don't plan to force usage of the platform storages when running the Actor locally, there is no need to use the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) class for it. The Crawlee variants <ApiLink to="class/KeyValueStore#open">`KeyValueStore.open()`</ApiLink>, <ApiLink to="class/Dataset#open">`Dataset.open()`</ApiLink> and <ApiLink to="class/RequestQueue#open">`RequestQueue.open()`</ApiLink> will work the same.
:::

{/*
### Getting public url of an item in the platform storage

If you need to share a link to some file stored in a [Key-Value](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) Store on Apify platform, you can use [`get_public_url()`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore#get_public_url) method. It accepts only one parameter: `key` - the key of the item you want to share.

<CodeBlock language="python">
    {GetPublicUrlSource}
</CodeBlock>

*/}

### Exporting dataset data

When the <ApiLink to="class/Dataset">`Dataset`</ApiLink> is stored on the [Apify platform](https://apify.com/actors), you can export its data to the following formats: HTML, JSON, CSV, Excel, XML and RSS. The datasets are displayed on the Actor run details page and in the [Storage](https://console.apify.com/storage) section in the Apify Console. The actual data is exported using the [Get dataset items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint. This way you can easily share the crawling results.

**Related links**

- [Apify platform storage documentation](https://docs.apify.com/storage)
- [View storage in Apify Console](https://console.apify.com/storage)
- [Key-value stores API reference](https://apify.com/docs/api/v2#/reference/key-value-stores)
- [Datasets API reference](https://docs.apify.com/api/v2#/reference/datasets)
- [Request queues API reference](https://docs.apify.com/api/v2#/reference/request-queues)

## Environment variables

The following describes select environment variables set by the Apify platform. For a complete list, see the [Environment variables](https://docs.apify.com/platform/actors/development/programming-interface/environment-variables) section in the Apify platform documentation.

:::note

It's important to notice that `CRAWLEE_` environment variables don't need to be replaced with equivalent `APIFY_` ones. Likewise, Crawlee understands `APIFY_` environment variables.

:::

### `APIFY_TOKEN`

The API token for your Apify account. It is used to access the Apify API, e.g. to access cloud storage
or to run an Actor on the Apify platform. You can find your API token on the
[Account Settings / Integrations](https://console.apify.com/account?tab=integrations) page.

### Combinations of `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`

By combining the env vars in various ways, you can greatly influence the Actor's behavior.

| Env Vars                                | API | Storages         |
| --------------------------------------- | --- | ---------------- |
|  none OR `CRAWLEE_STORAGE_DIR`          | no  | local            |
| `APIFY_TOKEN`                           | yes | Apify platform   |
| `APIFY_TOKEN` AND `CRAWLEE_STORAGE_DIR` | yes | local + platform |

When using both `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`, you can use all the Apify platform
features and your data will be stored locally by default. If you want to access platform storages,
you can use the `force_cloud=True` option in their respective functions.

### `APIFY_PROXY_PASSWORD`

Optional password to [Apify Proxy](https://docs.apify.com/proxy) for IP address rotation.
Assuming Apify Account was already created, you can find the password on the [Proxy page](https://console.apify.com/proxy)
in the Apify Console. The password is automatically inferred using the `APIFY_TOKEN` env var,
so in most cases, you don't need to touch it. You should use it when, for some reason,
you need access to Apify Proxy, but not access to Apify API, or when you need access to
proxy from a different account than your token represents.

## Proxy management

In addition to your own proxy servers and proxy servers acquired from
third-party providers used together with Crawlee, you can also rely on [Apify Proxy](https://apify.com/proxy)
for your scraping needs.

### Apify proxy

If you are already subscribed to Apify Proxy, you can start using it immediately in only a few lines of code (for local usage you first should be [logged in](#logging-into-apify-platform-from-crawlee) to your Apify account).

<CodeBlock className="language-python">
    {ProxyExample}
</CodeBlock>

Note that unlike using your own proxies in Crawlee, you shouldn't use the constructor to create <ApiLink to="class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> instances. For using the Apify Proxy you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function instead.

### Advanced Apify proxy configuration

With Apify Proxy, you can select specific proxy groups to use, or countries to connect from.
This allows you to get better proxy performance after some initial research.

<CodeBlock className="language-python">
    {ProxyAdvancedExample}
</CodeBlock>

Now your crawlers will use only Residential proxies from the US. Note that you must first get access
to a proxy group before you are able to use it. You can check proxy groups available to you
in the [proxy dashboard](https://console.apify.com/proxy).

### Apify proxy vs. own proxies

The [`ProxyConfiguration`](https://docs.apify.com/sdk/python/reference/class/ProxyConfiguration) class covers both Apify Proxy and custom proxy URLs so that you can easily switch between proxy providers. However, some features of the class are available only to Apify Proxy users, mainly because Apify Proxy is what one would call a super-proxy. It's not a single proxy server, but an API endpoint that allows connection through millions of different IP addresses. So the class essentially has two modes: Apify Proxy or Own (third party) proxy.

The difference is easy to remember.
- If you're using your own proxies - you should create a <ApiLink to="class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> instance directly.
- If you are planning to use Apify Proxy - you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function. The `new_url_function` parameter enables the use of your custom proxy URLs, whereas all the other options are there to configure Apify Proxy.

**Related links**

- [Apify Proxy docs](https://docs.apify.com/proxy)


================================================
FILE: docs/deployment/aws_lambda.mdx
================================================
---
id: aws-lambda
title: Deploy on AWS Lambda
description: Prepare your crawler to run on AWS Lambda.
---

import ApiLink from '@site/src/components/ApiLink';

import CodeBlock from '@theme/CodeBlock';

import BeautifulSoupCrawlerLambda from '!!raw-loader!./code_examples/aws/beautifulsoup_crawler_lambda.py';
import PlaywrightCrawlerLambda from '!!raw-loader!./code_examples/aws/playwright_crawler_lambda.py';
import PlaywrightCrawlerDockerfile from '!!raw-loader!./code_examples/aws/playwright_dockerfile';

[AWS Lambda](https://docs.aws.amazon.com/lambda/latest/dg/welcome.html) is a serverless compute service that lets you run code without provisioning or managing servers. This guide covers deploying <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> and <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.

The code examples are based on the [BeautifulSoupCrawler example](../examples/beautifulsoup-crawler).

## BeautifulSoupCrawler on AWS Lambda

For simple crawlers that don't require browser rendering, you can deploy using a ZIP archive.

### Updating the code

When instantiating a crawler, use <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink>. By default, Crawlee uses file-based storage, but the Lambda filesystem is read-only (except for `/tmp`). Using `MemoryStorageClient` tells Crawlee to use in-memory storage instead.

Wrap the crawler logic in a `lambda_handler` function. This is the entry point that AWS will execute.

:::important

Make sure to always instantiate a new crawler for every Lambda invocation. AWS keeps the environment running for some time after the first execution (to reduce cold-start times), so subsequent calls may access an already-used crawler instance.

**TL;DR: Keep your Lambda stateless.**

:::

Finally, return the scraped data from the Lambda when the crawler run ends.

<CodeBlock language="python" title="lambda_function.py">
    {BeautifulSoupCrawlerLambda}
</CodeBlock>

### Preparing the environment

Lambda requires all dependencies to be included in the deployment package. Create a virtual environment and install dependencies:

```bash
python3.14 -m venv .venv
source .venv/bin/activate
pip install 'crawlee[beautifulsoup]' 'boto3' 'aws-lambda-powertools'
```

[`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) is the AWS SDK for Python. Including it in your dependencies is recommended to avoid version misalignment issues with the Lambda runtime.

### Creating the ZIP archive

Create a ZIP archive from your project, including dependencies from the virtual environment:

```bash
cd .venv/lib/python3.14/site-packages
zip -r ../../../../package.zip .
cd ../../../../
zip package.zip lambda_function.py
```

:::note Large dependencies?

AWS has a limit of 50 MB for direct upload and 250 MB for unzipped deployment package size.

A better way to manage dependencies is by using Lambda Layers. With Layers, you can share files between multiple Lambda functions and keep the actual code as slim as possible.

To create a Lambda Layer:

1. Create a `python/` folder and copy dependencies from `site-packages` into it
2. Create a zip archive: `zip -r layer.zip python/`
3. Create a new Lambda Layer from the archive (you may need to upload it to S3 first)
4. Attach the Layer to your Lambda function

:::

### Creating the Lambda function

Create the Lambda function in the AWS Lambda Console:

1. Navigate to `Lambda` in [AWS Management Console](https://aws.amazon.com/console/).
2. Click **Create function**.
3. Select **Author from scratch**.
4. Enter a **Function name**, for example `BeautifulSoupTest`.
5. Choose a **Python runtime** that matches the version used in your virtual environment (for example, Python 3.14).
6. Click **Create function** to finish.

Once created, upload `package.zip` as the code source in the AWS Lambda Console using the "Upload from" button.

In Lambda Runtime Settings, set the handler. Since the file is named `lambda_function.py` and the function is `lambda_handler`, you can use the default value `lambda_function.lambda_handler`.

:::tip Configuration

In the Configuration tab, you can adjust:

- **Memory**: Memory size can greatly affect execution speed. A minimum of 256-512 MB is recommended.
- **Timeout**: Set according to the size of the website you are scraping (1 minute for the example code).
- **Ephemeral storage**: Size of the `/tmp` directory.

See the [official documentation](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) to learn how performance and cost scale with memory.

:::

After the Lambda deploys, you can test it by clicking the "Test" button. The event contents don't matter for a basic test, but you can parameterize your crawler by parsing the event object that AWS passes as the first argument to the handler.

## PlaywrightCrawler on AWS Lambda

For crawlers that require browser rendering, you need to deploy using Docker container images because Playwright and browser binaries exceed Lambda's ZIP deployment size limits.

### Updating the code

As with <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, use <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> and wrap the logic in a `lambda_handler` function. Additionally, configure `browser_launch_options` with flags optimized for serverless environments. These flags disable sandboxing and GPU features that aren't available in Lambda's containerized runtime.

<CodeBlock language="python" title="main.py">
    {PlaywrightCrawlerLambda}
</CodeBlock>

### Installing and configuring AWS CLI

Install AWS CLI following the [official documentation](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) according to your operating system.

Authenticate by running:

```bash
aws login
```

### Preparing the project

Initialize the project by running `uvx 'crawlee[cli]' create`.

Or use a single command if you don't need interactive mode:

```bash
uvx 'crawlee[cli]' create aws_playwright --crawler-type playwright --http-client impit --package-manager uv --no-apify --start-url 'https://crawlee.dev' --install
```

Add the following dependencies:

```bash
uv add awslambdaric aws-lambda-powertools boto3
```

[`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) is the AWS SDK for Python. Use it if your function integrates with any other AWS services.

The project is created with a Dockerfile that needs to be modified for AWS Lambda by adding `ENTRYPOINT` and updating `CMD`:

<CodeBlock language="dockerfile" title="Dockerfile">
    {PlaywrightCrawlerDockerfile}
</CodeBlock>

### Building and pushing the Docker image

Create a repository `lambda/aws-playwright` in [Amazon Elastic Container Registry](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html) in the same region where your Lambda functions will run. To learn more, refer to the [official documentation](https://docs.aws.amazon.com/AmazonECR/latest/userguide/getting-started-cli.html).

Navigate to the created repository and click the "View push commands" button. This will open a window with console commands for uploading the Docker image to your repository. Execute them.

Example:
```bash
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin {user-specific-data}
docker build --platform linux/amd64 --provenance=false -t lambda/aws-playwright .
docker tag lambda/aws-playwright:latest {user-specific-data}/lambda/aws-playwright:latest
docker push {user-specific-data}/lambda/aws-playwright:latest
```

### Creating the Lambda function

1. Navigate to `Lambda` in [AWS Management Console](https://aws.amazon.com/console/).
2. Click **Create function**.
3. Select **Container image**.
4. Browse and select your ECR image.
5. Click **Create function** to finish.

:::tip Configuration

In the Configuration tab, you can adjust resources. Playwright crawlers require more resources than BeautifulSoup crawlers:

- **Memory**: Minimum 1024 MB recommended. Browser operations are memory-intensive, so 2048 MB or more may be needed for complex pages.
- **Timeout**: Set according to crawl size. Browser startup adds overhead, so allow at least 5 minutes even for simple crawls.
- **Ephemeral storage**: Default 512 MB is usually sufficient unless downloading large files.

See the [official documentation](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) to learn how performance and cost scale with memory.

:::

After the Lambda deploys, click the "Test" button to invoke it. The event contents don't matter for a basic test, but you can parameterize your crawler by parsing the event object that AWS passes as the first argument to the handler.


================================================
FILE: docs/deployment/code_examples/apify/crawler_as_actor_example.py
================================================
import asyncio

from apify import Actor

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Run a `BeautifulSoupCrawler` inside the `Actor` context manager."""
    # Everything the crawler does happens inside the Actor context manager.
    async with Actor:
        crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

            # Pages without a <title> tag yield None.
            title_tag = context.soup.title
            page_title = title_tag.string if title_tag else None

            await context.push_data({'url': context.request.url, 'title': page_title})
            await context.enqueue_links()

        start_urls = ['https://crawlee.dev']
        await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/deployment/code_examples/apify/get_public_url.py
================================================
import asyncio

from apify import Actor


async def main() -> None:
    """Store a record in a key-value store and log its public URL.

    The store is the one returned by `Actor.open_key_value_store()` called
    without arguments.
    """
    async with Actor:
        store = await Actor.open_key_value_store()
        await store.set_value('your-file', {'foo': 'bar'})
        # `get_public_url()` is a coroutine function, so it must be awaited;
        # otherwise the log line below would print a coroutine object
        # instead of the URL.
        url = await store.get_public_url('your-file')
        Actor.log.info(f'KVS public URL: {url}')
        # https://api.apify.com/v2/key-value-stores/<your-store-id>/records/your-file


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/deployment/code_examples/apify/log_with_config_example.py
================================================
import asyncio

from apify import Actor, Configuration


async def main() -> None:
    """Log a message from an Actor that is started with an explicit `Configuration`."""
    # The API key is listed at https://console.apify.com/settings/integrations.
    # Pass it as the "token" parameter (as done here), or provide it through
    # the "APIFY_TOKEN" environment variable instead.
    config = Configuration(token='apify_api_YOUR_TOKEN')

    async with Actor(config):
        Actor.log.info('Hello from Apify platform!')


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/deployment/code_examples/apify/proxy_advanced_example.py
================================================
import asyncio

from apify import Actor


async def main() -> None:
    """Create an Apify Proxy configuration with an explicit group and country."""
    async with Actor:
        proxy_configuration = await Actor.create_proxy_configuration(
            # Country code the proxy should use.
            country_code='US',
            # Proxy group the proxy should use.
            groups=['RESIDENTIAL'],
            password='apify_proxy_YOUR_PASSWORD',
        )

        # ...


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/deployment/code_examples/apify/proxy_example.py
================================================
import asyncio

from apify import Actor


async def main() -> None:
    """Create an Apify Proxy configuration and log one proxy URL obtained from it."""
    async with Actor:
        # The proxy password is listed at https://console.apify.com/proxy/http-settings.
        # Provide it either through the "password" parameter (as done here) or
        # through the "APIFY_PROXY_PASSWORD" environment variable.
        proxy_configuration = await Actor.create_proxy_configuration(
            password='apify_proxy_YOUR_PASSWORD',
        )

        if proxy_configuration:
            proxy_url = await proxy_configuration.new_url()
            Actor.log.info(f'Proxy URL: {proxy_url}')
        else:
            # No usable configuration came back, so there is nothing to ask for a URL.
            Actor.log.warning('Failed to create proxy configuration.')


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/deployment/code_examples/aws/beautifulsoup_crawler_lambda.py
================================================
import asyncio
import json
from datetime import timedelta
from typing import Any

from aws_lambda_powertools.utilities.typing import LambdaContext

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset, RequestQueue


async def main() -> str:
    """Crawl with memory-backed storages and return the scraped items as a JSON string."""
    # highlight-start
    # Disable writing storage data to the file system
    storage_client = MemoryStorageClient()
    # highlight-end

    # Open the storages up front so they can be dropped once the crawl is done
    dataset = await Dataset.open(storage_client=storage_client)
    request_queue = await RequestQueue.open(storage_client=storage_client)

    crawler = BeautifulSoupCrawler(
        storage_client=storage_client,
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def handle_page(context: BeautifulSoupCrawlingContext) -> None:
        """Push the URL, title and heading texts of the page, then call `enqueue_links()`."""
        context.log.info(f'Processing {context.request.url} ...')

        soup = context.soup
        title_tag = soup.title

        def heading_texts(tag_name: str) -> list[str]:
            # Text of every element with the given tag name.
            return [heading.text for heading in soup.find_all(tag_name)]

        await context.push_data(
            {
                'url': context.request.url,
                'title': title_tag.string if title_tag else None,
                'h1s': heading_texts('h1'),
                'h2s': heading_texts('h2'),
                'h3s': heading_texts('h3'),
            }
        )
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])

    # Read back what the handler pushed during the crawl
    crawl_results = await crawler.get_data()

    # Drop both storages now that the results are held in `crawl_results`
    await dataset.drop()
    await request_queue.drop()

    # The list of scraped items is returned as a JSON string
    return json.dumps(crawl_results.items)


def lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> dict[str, Any]:
    """Run the crawl to completion and wrap its JSON output in a status-200 response dict.

    Neither `_event` nor `_context` is read by this handler.
    """
    body = asyncio.run(main())
    response = {'statusCode': 200, 'body': body}
    return response


================================================
FILE: docs/deployment/code_examples/aws/playwright_crawler_lambda.py
================================================
import asyncio
import json
from datetime import timedelta
from typing import Any

from aws_lambda_powertools.utilities.typing import LambdaContext

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset, RequestQueue


async def main() -> str:
    """Crawl with Playwright on memory-backed storages and return the items as a JSON string."""
    # highlight-start
    # Disable writing storage data to the file system
    storage_client = MemoryStorageClient()
    # highlight-end

    # Open the storages up front so they can be dropped once the crawl is done
    dataset = await Dataset.open(storage_client=storage_client)
    request_queue = await RequestQueue.open(storage_client=storage_client)

    crawler = PlaywrightCrawler(
        storage_client=storage_client,
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=10,
        # highlight-start
        # Configure Playwright to run in AWS Lambda environment
        browser_launch_options={
            'args': [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--single-process',
            ]
        },
        # highlight-end
    )

    @crawler.router.default_handler
    async def handle_page(context: PlaywrightCrawlingContext) -> None:
        """Push the URL, title and heading texts of the page, then call `enqueue_links()`."""
        context.log.info(f'Processing {context.request.url} ...')

        page = context.page

        async def heading_texts(selector: str) -> list[str]:
            # Text contents of every element matched by the selector.
            return await page.locator(selector).all_text_contents()

        await context.push_data(
            {
                'url': context.request.url,
                'title': await page.title(),
                'h1s': await heading_texts('h1'),
                'h2s': await heading_texts('h2'),
                'h3s': await heading_texts('h3'),
            }
        )
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])

    # Read back what the handler pushed during the crawl
    crawl_results = await crawler.get_data()

    # Drop both storages now that the results are held in `crawl_results`
    await dataset.drop()
    await request_queue.drop()

    # The list of scraped items is returned as a JSON string
    return json.dumps(crawl_results.items)


def lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> dict[str, Any]:
    """Run the crawl to completion and wrap its JSON output in a status-200 response dict.

    Neither `_event` nor `_context` is read by this handler.
    """
    body = asyncio.run(main())
    response = {'statusCode': 200, 'body': body}
    return response


================================================
FILE: docs/deployment/code_examples/aws/playwright_dockerfile
================================================
# Base image: apify/actor-python-playwright, tag 3.14
FROM apify/actor-python-playwright:3.14

# Install git and clear the apt package lists afterwards
RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/*

# Upgrade pip and setuptools, then install uv (any version below 1)
RUN pip install -U pip setuptools \
    && pip install 'uv<1'

# Point uv's project environment at /usr/local
ENV UV_PROJECT_ENVIRONMENT="/usr/local"

# Copy only the dependency manifests before installing dependencies
COPY pyproject.toml uv.lock ./

# Print the Python version, then install the locked dependencies with `uv sync`.
# When `pip freeze` already lists playwright, it is excluded from the sync via
# `--no-install-package playwright`; otherwise everything is installed.
# The resulting package list is printed at the end.
RUN echo "Python version:" \
    && python --version \
    && echo "Installing dependencies:" \
    && PLAYWRIGHT_INSTALLED=$(pip freeze | grep -q playwright && echo "true" || echo "false") \
    && if [ "$PLAYWRIGHT_INSTALLED" = "true" ]; then \
        echo "Playwright already installed, excluding from uv sync" \
        && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact --no-install-package playwright; \
    else \
        echo "Playwright not found, installing all dependencies" \
        && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact; \
    fi \
    && echo "All installed Python packages:" \
    && pip freeze

# Copy the rest of the build context into the image
COPY . ./

# Byte-compile the Python sources quietly
RUN python -m compileall -q .

# highlight-start
# AWS Lambda entrypoint
ENTRYPOINT [ "/usr/local/bin/python3", "-m", "awslambdaric" ]

# Lambda handler function
CMD [ "aws_playwright.main.lambda_handler" ]
# highlight-end
# highlight-end


================================================
FILE: docs/deployment/code_examples/google/cloud_run_example.py
================================================
import json
import os

import uvicorn
from litestar import Litestar, get

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storage_clients import MemoryStorageClient


@get('/')
async def main() -> str:
    """Route handler for `/`: run the crawl and return the scraped items as a JSON string."""
    # highlight-start
    # Disable writing storage data to the file system
    storage_client = MemoryStorageClient()
    # highlight-end

    crawler = PlaywrightCrawler(
        headless=True,
        max_requests_per_crawl=10,
        browser_type='firefox',
        storage_client=storage_client,
    )

    @crawler.router.default_handler
    async def default_handler(context: PlaywrightCrawlingContext) -> None:
        """Push the loaded URL and the <title> text of the page, then call `enqueue_links()`."""
        context.log.info(f'Processing {context.request.url} ...')

        title_element = await context.page.query_selector('title')
        loaded_url = context.request.loaded_url
        # The title is `None` when no <title> element was found on the page.
        title_text = await title_element.inner_text() if title_element else None

        await context.push_data({'url': loaded_url, 'title': title_text})
        await context.enqueue_links()

    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)

    crawl_results = await crawler.get_data()

    # The client receives the scraped items serialized as JSON
    return json.dumps(crawl_results.items)


# Build the Litestar application around the single route handler defined above
app = Litestar(route_handlers=[main])

# GCP provides the port to listen on through the `PORT` environment variable
# (8080 is used when it is not set). Cloud Run expects the app to listen on
# exactly this port, so the Uvicorn server is started on it.
uvicorn.run(app, host='0.0.0.0', port=int(os.getenv('PORT', '8080')))  # noqa: S104 # Use all interfaces in a container, safely


================================================
FILE: docs/deployment/code_examples/google/google_example.py
================================================
import asyncio
import json
from datetime import timedelta

import functions_framework
from flask import Request, Response

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storage_clients import MemoryStorageClient


async def main() -> str:
    """Crawl with a memory-backed storage client and return the items as a JSON string."""
    # highlight-start
    # Disable writing storage data to the file system
    storage_client = MemoryStorageClient()
    # highlight-end

    crawler = BeautifulSoupCrawler(
        storage_client=storage_client,
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def handle_page(context: BeautifulSoupCrawlingContext) -> None:
        """Push the URL, title and heading texts of the page, then call `enqueue_links()`."""
        context.log.info(f'Processing {context.request.url} ...')

        soup = context.soup
        title_tag = soup.title

        def heading_texts(tag_name: str) -> list[str]:
            # Text of every element with the given tag name.
            return [heading.text for heading in soup.find_all(tag_name)]

        await context.push_data(
            {
                'url': context.request.url,
                'title': title_tag.string if title_tag else None,
                'h1s': heading_texts('h1'),
                'h2s': heading_texts('h2'),
                'h3s': heading_texts('h3'),
            }
        )
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])

    # highlight-start
    # Extract data saved in `Dataset`
    crawl_results = await crawler.get_data()
    # Serialize to json string and return
    return json.dumps(crawl_results.items)
    # highlight-end
    # highlight-end


@functions_framework.http
def crawlee_run(request: Request) -> Response:
    """HTTP entry point: run the crawl and return its JSON output with status 200.

    Args:
        request: The incoming Flask request.

    Returns:
        A response whose body is the JSON string produced by `main()`.
    """
    # You can pass data to your crawler using `request`.
    # The header is read with `.get()` so that a request which does not carry it
    # yields `None` here instead of raising before the crawl starts.
    function_id = request.headers.get('Function-Execution-Id')
    response_str = asyncio.run(main())

    # Return a response with the crawling results
    return Response(response=response_str, status=200)


================================================
FILE: docs/deployment/google_cloud.mdx
================================================
---
id: gcp-cloud-run-functions
title: Cloud Run functions
description: Prepare your crawler to run in Cloud Run functions on Google Cloud Platform.
---

import ApiLink from '@site/src/components/ApiLink';

import CodeBlock from '@theme/CodeBlock';

import GoogleFunctions from '!!raw-loader!./code_examples/google/google_example.py';

[Google Cloud Run Functions](https://cloud.google.com/functions) is a serverless execution environment for running simple HTTP-based web scrapers. This service is best suited for lightweight crawlers that don't require browser rendering capabilities and can be executed via HTTP requests.

## Updating the project

For the project foundation, use <ApiLink to="class/BeautifulSoupCrawler">BeautifulSoupCrawler</ApiLink> as described in this [example](../examples/beautifulsoup-crawler).

Add [`functions-framework`](https://pypi.org/project/functions-framework/) to your dependencies file `requirements.txt`. If you're using a project manager like `poetry` or `uv`, export your dependencies to `requirements.txt`.

Update the project code to make it compatible with Cloud Functions and return data in JSON format. Also add an entry point that Cloud Functions will use to run the project.

<CodeBlock className="language-python">
    {GoogleFunctions}
</CodeBlock>

You can test your project locally. Start the server by running:

```bash
functions-framework --target=crawlee_run
```

Then make a GET request to `http://127.0.0.1:8080/`, for example in your browser.

## Deploying to Google Cloud Platform

In the Google Cloud dashboard, create a new function, allocate memory and CPUs to it, set region and function timeout.

When deploying, select **"Use an inline editor to create a function"**. This allows you to configure the project using only the Google Cloud Console dashboard.

Using the `inline editor`, update the function files according to your project. **Make sure** to update the `requirements.txt` file to match your project's dependencies.

Also, make sure to set the **Function entry point** to the name of the function decorated with `@functions_framework.http`, which in our case is `crawlee_run`.

After the Function deploys, you can test it by clicking the "Test" button. This button opens a popup with a `curl` script that calls your new Cloud Function. To avoid having to install the `gcloud` CLI application locally, you can also run this script in the Cloud Shell by clicking the link above the code block.


================================================
FILE: docs/deployment/google_cloud_run.mdx
================================================
---
id: gcp-cloud-run
title: Cloud Run
description: Prepare your crawler to run in Cloud Run on Google Cloud Platform.
---

import ApiLink from '@site/src/components/ApiLink';

import CodeBlock from '@theme/CodeBlock';

import GoogleCloudRun from '!!raw-loader!./code_examples/google/cloud_run_example.py';


[Google Cloud Run](https://cloud.google.com/run) is a container-based serverless platform that allows you to run web crawlers with headless browsers. This service is recommended when your Crawlee applications need browser rendering capabilities, require more granular control, or have complex dependencies that aren't supported by [Cloud Functions](./gcp-cloud-run-functions).

GCP Cloud Run allows you to deploy using Docker containers, giving you full control over your environment and the flexibility to use any web server framework of your choice, unlike Cloud Functions which are limited to [Flask](https://flask.palletsprojects.com/en/stable/).

## Preparing the project

We'll prepare our project using [Litestar](https://litestar.dev/) and the [Uvicorn](https://www.uvicorn.org/) web server. The HTTP server handler will wrap the crawler to communicate with clients. Because the Cloud Run platform sees only an opaque Docker container, we have to take care of this bit ourselves.

:::info

GCP passes you an environment variable called `PORT` - your HTTP server is expected to be listening on this port (GCP exposes this one to the outer world).

:::

<CodeBlock className="language-python">
    {GoogleCloudRun}
</CodeBlock>


:::tip

Always make sure to keep all the logic in the request handler - as with other FaaS services, your request handlers have to be **stateless.**

:::

## Deploying to Google Cloud Platform

Now, we’re ready to deploy! If you have initialized your project using `uvx crawlee create`, the initialization script has prepared a Dockerfile for you.

All you have to do now is run `gcloud run deploy` in your project folder (the one with your Dockerfile in it). The gcloud CLI application will ask you a few questions, such as what region you want to deploy your application in, or whether you want to make your application public or private.

After answering those questions, you should be able to see your application in the GCP dashboard and run it using the link you find there.

:::tip

In case your first execution of your newly created Cloud Run fails, try editing the Run configuration - mainly setting the available memory to 1GiB or more and updating the request timeout according to the size of the website you are scraping.

:::


================================================
FILE: docs/examples/add_data_to_dataset.mdx
================================================
---
id: add-data-to-dataset
title: Add data to dataset
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_bs.py';
import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_pw.py';
import DatasetExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_dataset.py';

This example demonstrates how to store extracted data into datasets using the <ApiLink to="class/PushDataFunction#open">`context.push_data`</ApiLink> helper function. If the specified dataset does not already exist, it will be created automatically. Additionally, you can save data to custom datasets by providing `dataset_id` or `dataset_name` parameters to the <ApiLink to="class/PushDataFunction#open">`push_data`</ApiLink> function.

<Tabs groupId="main">
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {BeautifulSoupExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {PlaywrightExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

Each item in the dataset will be stored in its own file within the following directory:

```text
{PROJECT_FOLDER}/storage/datasets/default/
```

For more control, you can also open a dataset manually using the asynchronous constructor <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink>:

<RunnableCodeBlock className="language-python" language="python">
    {DatasetExample}
</RunnableCodeBlock>


================================================
FILE: docs/examples/beautifulsoup_crawler.mdx
================================================
---
id: beautifulsoup-crawler
title: BeautifulSoup crawler
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler.py';

This example demonstrates how to use <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `<h1>`, `<h2>` and `<h3>` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code. It also shows how you can add an optional pre-navigation hook to the crawler. Pre-navigation hooks are user-defined functions that execute before sending the request.

<RunnableCodeBlock className="language-python" language="python">
    {BeautifulSoupExample}
</RunnableCodeBlock>


================================================
FILE: docs/examples/capture_screenshot_using_playwright.mdx
================================================
---
id: capture-screenshots-using-playwright
title: Capture screenshots using Playwright
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import CaptureScreenshotExample from '!!raw-loader!roa-loader!./code_examples/capture_screenshot_using_playwright.py';

This example demonstrates how to capture screenshots of web pages using <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and store them in the key-value store.

The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is configured to automate the browsing and interaction with web pages. It uses headless Chromium as the browser type to perform these tasks. Each web page specified in the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method.

The captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page.

<RunnableCodeBlock className="language-python" language="python">
    {CaptureScreenshotExample}
</RunnableCodeBlock>


================================================
FILE: docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx
================================================
---
id: capturing-page-snapshots-with-error-snapshotter
title: Capturing page snapshots with ErrorSnapshotter
description: How to capture page snapshots on errors.
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import ApiLink from '@site/src/components/ApiLink';
import ParselCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_with_error_snapshotter.py';
import PlaywrightCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_error_snapshotter.py';


This example demonstrates how to capture page snapshots on the first occurrence of each unique error. The capturing happens automatically if you set `save_error_snapshots=True` in the crawler's <ApiLink to="class/Statistics">`Statistics`</ApiLink>. The error snapshot can contain an `html` file and a `jpeg` file that are created from the page where the unhandled exception was raised. Captured error snapshot files are saved to the default key-value store. Both <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and [HTTP crawlers](../guides/http-crawlers) are capable of capturing the html file, but only <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is able to capture a page screenshot as well.

<Tabs>
    <TabItem value="ParselCrawler" label="ParselCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            { ParselCrawlerWithErrorSnapshotter }
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            { PlaywrightCrawlerWithErrorSnapshotter }
        </RunnableCodeBlock>
    </TabItem>
</Tabs>


================================================
FILE: docs/examples/code_examples/adaptive_playwright_crawler.py
================================================
import asyncio
from datetime import timedelta

from playwright.async_api import Route

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)


async def main() -> None:
    """Run an `AdaptivePlaywrightCrawler` with a default handler and two pre-navigation hooks."""
    # This factory method yields a crawler that parses static content
    # with `beautifulsoup`.
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        # Limit the max requests per crawl.
        max_requests_per_crawl=10,
        playwright_crawler_specific_kwargs={'headless': False},
    )

    @crawler.router.default_handler
    async def default_request_handler(
        context: AdaptivePlaywrightCrawlingContext,
    ) -> None:
        """Log the parsed title and the first `h2`, enqueue links and push the URL."""
        # `parsed_content` is available for processing; here its title is logged.
        context.log.info(context.parsed_content.title)

        # Give the `h2` element up to 5 seconds to be located, then log it.
        wait_limit = timedelta(seconds=5)
        first_h2 = await context.query_selector_one('h2', wait_limit)
        context.log.info(first_h2)

        # Discover further links and add them to the crawl.
        await context.enqueue_links()
        # Store a record for the visited page.
        await context.push_data({'Visited url': context.request.url})

    @crawler.pre_navigation_hook
    async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Hook executed in both the static sub crawler and the playwright sub crawler.

        Accessing `context.page` here would raise `AdaptiveContextError` for pages
        that are crawled without playwright.
        """
        context.log.info(f'pre navigation hook for: {context.request.url} ...')

    @crawler.pre_navigation_hook(playwright_only=True)
    async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Hook executed in the playwright sub crawler only.

        The `page` object can be accessed safely here.
        """

        async def continue_route(route: Route) -> None:
            # Let the routed request continue.
            await route.continue_()

        await context.page.route('*/**', continue_route)
        context.log.info(
            f'Playwright only pre navigation hook for: {context.request.url} ...'
        )

    # Start crawling from the initial list of URLs.
    start_urls = ['https://warehouse-theme-metal.myshopify.com/']
    await crawler.run(start_urls)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/add_data_to_dataset_bs.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl three start URLs and push URL, title and an HTML excerpt for each page."""
    crawler = BeautifulSoupCrawler()

    # The default request handler is invoked for every request.
    @crawler.router.default_handler
    async def handle_page(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        soup = context.soup
        title_tag = soup.title
        # Only the first 1000 characters of the serialized document are kept.
        html_excerpt = str(soup)[:1000]

        # The record goes to the default dataset.
        await context.push_data(
            {
                'url': context.request.url,
                'title': title_tag.string if title_tag else None,
                'html': html_excerpt,
            }
        )

    # Initial list of requests the crawler starts with.
    start_urls = [
        'https://crawlee.dev',
        'https://apify.com',
        'https://example.com',
    ]
    await crawler.run(start_urls)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/add_data_to_dataset_dataset.py
================================================
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    """Open a dataset by hand and push a single record into it."""
    # `Dataset.open()` is the asynchronous constructor used to open a dataset manually.
    dataset = await Dataset.open()

    # Work with the dataset object directly.
    record = {'key': 'value'}
    await dataset.push_data(record)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/add_data_to_dataset_pw.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl three start URLs with Playwright and push URL, title and an HTML excerpt."""
    crawler = PlaywrightCrawler()

    # The default request handler is invoked for every request.
    @crawler.router.default_handler
    async def handle_page(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        page = context.page
        # Gather the fields in the order url -> title -> html.
        url = context.request.url
        title = await page.title()
        # Only the first 1000 characters of the page content are kept.
        html_excerpt = str(await page.content())[:1000]

        # The record goes to the default dataset.
        await context.push_data({'url': url, 'title': title, 'html': html_excerpt})

    # Initial list of requests the crawler starts with.
    start_urls = [
        'https://crawlee.dev',
        'https://apify.com',
        'https://example.com',
    ]
    await crawler.run(start_urls)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/beautifulsoup_crawler.py
================================================
import asyncio
from datetime import timedelta

from crawlee.crawlers import (
    BasicCrawlingContext,
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    """Crawl https://crawlee.dev with `BeautifulSoupCrawler` and push title and headings."""
    # BeautifulSoupCrawler loads the URLs automatically and parses their HTML
    # with the BeautifulSoup library.
    crawler = BeautifulSoupCrawler(
        # A failed page is retried at most once.
        max_request_retries=1,
        # Each page gets up to 30 seconds of handler time.
        request_handler_timeout=timedelta(seconds=30),
        # Cap on the number of requests; remove or raise it to crawl all links.
        max_requests_per_crawl=10,
    )

    # The default request handler runs for every request. Its context parameter
    # offers various properties and helper methods; this example relies on:
    # - request: the Request instance with details such as the URL being crawled
    #   and the HTTP method used.
    # - soup: the BeautifulSoup object holding the parsed HTML of the response.
    @crawler.router.default_handler
    async def handle_page(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        soup = context.soup
        title_tag = soup.title

        def heading_texts(tag_name: str) -> list[str]:
            # Text of every element with the given tag name.
            return [heading.text for heading in soup.find_all(tag_name)]

        # The record goes to the default dataset. In local configuration,
        # the data will be stored as JSON files in ./storage/datasets/default.
        await context.push_data(
            {
                'url': context.request.url,
                'title': title_tag.string if title_tag else None,
                'h1s': heading_texts('h1'),
                'h2s': heading_texts('h2'),
                'h3s': heading_texts('h3'),
            }
        )

    # A pre navigation hook is called before each request.
    # Registering one is optional; it can be left out entirely.
    @crawler.pre_navigation_hook
    async def some_hook(context: BasicCrawlingContext) -> None:
        """Placeholder hook that intentionally does nothing."""

    # Start crawling from the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/beautifulsoup_crawler_keep_alive.py
================================================
import asyncio

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import BeautifulSoupCrawler


async def main() -> None:
    """Run a crawler in `keep_alive` mode and feed it requests once it is running.

    The crawler is started without any initial requests. Two background tasks add
    URLs with a delay, and the crawler is stopped from the request handler once the
    guard URL has been visited.
    """
    crawler = BeautifulSoupCrawler(
        # Keep the crawler alive even when there are no requests to be processed now.
        keep_alive=True,
    )

    def stop_crawler_if_url_visited(context: BasicCrawlingContext) -> None:
        """Stop the crawler once a specific URL is visited.

        Example of a guard condition used to stop the crawler.
        """
        if context.request.url == 'https://crawlee.dev/docs/examples':
            crawler.stop(
                'Stop crawler that was in keep_alive state '
                'after specific url was visited'
            )
        else:
            context.log.info('keep_alive=True, waiting for more requests to come.')

    async def add_request_later(url: str, after_s: int) -> None:
        """Add a request to the queue after `after_s` seconds.

        Stands in for external code that adds requests while the crawler is
        waiting due to `keep_alive=True`.
        """
        await asyncio.sleep(after_s)
        await crawler.add_requests([url])

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Stop crawler if some guard condition has been met.
        stop_crawler_if_url_visited(context)

    # Start some tasks that will add some requests later to simulate real situation,
    # where requests are added later by external code.
    add_request_later_task1 = asyncio.create_task(
        add_request_later(url='https://crawlee.dev', after_s=1)
    )
    add_request_later_task2 = asyncio.create_task(
        add_request_later(url='https://crawlee.dev/docs/examples', after_s=5)
    )

    # Run the crawler without the initial list of requests.
    # Wait for more requests to be added to the queue later due to `keep_alive=True`.
    await crawler.run()

    # Make sure both background tasks have finished before leaving `main`.
    await asyncio.gather(add_request_later_task1, add_request_later_task2)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/beautifulsoup_crawler_stop.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl with BeautifulSoup and stop the crawler manually from the handler."""
    # BeautifulSoupCrawler loads the URLs on its own and parses their HTML
    # with the BeautifulSoup library.
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Default request handler, called for every request.

        The context provides various properties and helper methods, e.g.:
        - request: the Request instance with details such as the URL being
          crawled and the HTTP method used.
        - soup: the BeautifulSoup object with the parsed HTML of the response.
        """
        url = context.request.url
        context.log.info(f'Processing {url} ...')

        # Custom stop condition: the crawler found what it was looking for.
        found_target = 'crawlee' in url
        if found_target:
            crawler.stop(
                reason='Manual stop of crawler after finding `crawlee` in the url.'
            )

        # Push the extracted data to the default dataset. In local configuration,
        # it is stored as JSON files in ./storage/datasets/default.
        await context.push_data({'url': url})

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/capture_screenshot_using_playwright.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import KeyValueStore


async def main() -> None:
    """Capture a screenshot of every crawled page and save it to the key-value store.

    Each screenshot is stored under the key `screenshot-<last URL segment>` with the
    `image/png` content type.
    """
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
        # Headless mode, set to False to see the browser in action.
        headless=False,
        # Browser types supported by Playwright.
        browser_type='chromium',
    )

    # Open the default key-value store.
    kvs = await KeyValueStore.open()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Capture the screenshot of the page using Playwright's API.
        screenshot = await context.page.screenshot()

        # Use the last URL segment as the name. Trailing slashes are stripped first,
        # otherwise every URL ending with `/` would produce an empty name and all
        # such screenshots would overwrite each other under the key `screenshot-`.
        name = context.request.url.rstrip('/').split('/')[-1]

        # Store the screenshot in the key-value store.
        await kvs.set_value(
            key=f'screenshot-{name}',
            value=screenshot,
            content_type='image/png',
        )

    # Run the crawler with the initial list of URLs.
    await crawler.run(
        [
            'https://crawlee.dev',
            'https://apify.com',
            'https://example.com',
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/configure_json_logging.py
================================================
from __future__ import annotations

import asyncio
import inspect
import logging
import sys
from typing import TYPE_CHECKING

from loguru import logger

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

if TYPE_CHECKING:
    from loguru import Record


# Configure loguru interceptor to capture standard logging output
class InterceptHandler(logging.Handler):
    """Logging handler that forwards standard `logging` records to loguru.

    Every record is re-emitted through the loguru `logger` with its message,
    level, exception info and any non-standard ("extra") record attributes.
    """

    # Attribute names that are part of a plain `LogRecord`, computed once instead
    # of on every `emit` call. `message` and `asctime` are added to a record by
    # `logging.Formatter.format`, so they are listed explicitly to keep them out
    # of the extras when another handler has formatted the record already.
    _STANDARD_ATTRS = frozenset(
        logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None).__dict__
    ) | {'message', 'asctime'}

    def emit(self, record: logging.LogRecord) -> None:
        """Re-emit `record` through loguru.

        Args:
            record: The standard logging record to forward.
        """
        # Get corresponding Loguru level if it exists
        try:
            level: str | int = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno

        # Find caller from where originated the logged message: walk up the stack
        # past this method and past frames of `logging` / frozen importlib.
        frame, depth = inspect.currentframe(), 0
        while frame:
            filename = frame.f_code.co_filename
            is_logging = filename == logging.__file__
            is_frozen = 'importlib' in filename and '_bootstrap' in filename
            if depth > 0 and not (is_logging or is_frozen):
                break
            frame = frame.f_back
            depth += 1

        # Everything that is not a standard record attribute was passed by the
        # caller via `extra=...` and is bound to the loguru record.
        extra_dict = {
            key: value
            for key, value in record.__dict__.items()
            if key not in self._STANDARD_ATTRS
        }

        (
            logger.bind(**extra_dict)
            .opt(depth=depth, exception=record.exc_info)
            # Report the original logger name instead of the one loguru derives.
            .patch(lambda loguru_record: loguru_record.update({'name': record.name}))
            .log(level, record.getMessage())
        )


# Configure loguru formatter
def formatter(record: Record) -> str:
    """Return the loguru format template for a single record.

    Args:
        record: The loguru record about to be logged; only `record['extra']` is
            inspected.

    Returns:
        The format template. ` {extra}` is included only when the record carries
        extra values.
    """
    basic_format = '[{name}] | <level>{level: ^8}</level> | - {message}'
    if record['extra']:
        basic_format = basic_format + ' {extra}'
    # For a callable format loguru does not append the line ending and the
    # exception field itself, so both are added here. Without `{exception}`
    # tracebacks of logged exceptions would be dropped.
    return f'{basic_format}\n{{exception}}'


# Remove the default loguru handler so only the sinks added below are used.
logger.remove()

# Set up loguru with JSONL serialization in file `crawler.log`
# (`serialize=True`), for records of level INFO and above.
logger.add('crawler.log', format=formatter, serialize=True, level='INFO')

# Set up loguru logger for console: colorized output on stderr, INFO and above.
logger.add(sys.stderr, format=formatter, colorize=True, level='INFO')

# Configure standard logging to use our interceptor as the only root handler.
# `force=True` makes `basicConfig` replace any handlers already attached to the
# root logger.
logging.basicConfig(handlers=[InterceptHandler()], level=logging.INFO, force=True)


async def main() -> None:
    """Run an `HttpCrawler` that leaves the logging configuration to this module."""
    crawler = HttpCrawler(
        # Use inline formatting for statistics logs instead of the table.
        statistics_log_format='inline',
        # Do not let the crawler apply its default logging configuration.
        configure_logging=False,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        """Default request handler, called for every request."""
        message = f'Processing {context.request.url} ...'
        context.log.info(message)

    # Start the crawl.
    start_urls = ['https://www.crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/crawl_all_links_on_website_bs.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with BeautifulSoup, enqueueing the links found on each page."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Default request handler, called for every request."""
        current_url = context.request.url
        context.log.info(f'Processing {current_url} ...')

        # Add all links found on the page to the queue.
        await context.enqueue_links()

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/crawl_all_links_on_website_pw.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with Playwright, enqueueing the links found on each page."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = PlaywrightCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Default request handler, called for every request."""
        current_url = context.request.url
        context.log.info(f'Processing {current_url} ...')

        # Add all links found on the page to the queue.
        await context.enqueue_links()

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/crawl_multiple_urls_bs.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl a fixed list of URLs with the BeautifulSoup crawler."""
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Default request handler, called for every request."""
        current_url = context.request.url
        context.log.info(f'Processing {current_url} ...')

    # The initial list of requests the crawler is started with.
    start_urls = [
        'https://crawlee.dev',
        'https://apify.com',
        'https://example.com',
    ]
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/crawl_multiple_urls_pw.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl a fixed list of URLs with the Playwright crawler."""
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Default request handler, called for every request."""
        current_url = context.request.url
        context.log.info(f'Processing {current_url} ...')

    # The initial list of requests the crawler is started with.
    start_urls = [
        'https://crawlee.dev',
        'https://apify.com',
        'https://example.com',
    ]
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/crawl_specific_links_on_website_bs.py
================================================
import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with BeautifulSoup, enqueueing only selected links."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Default request handler, called for every request."""
        current_url = context.request.url
        context.log.info(f'Processing {current_url} ...')

        # Enqueue the documentation links found on the page, except for the examples.
        docs_pages = [Glob('https://crawlee.dev/docs/**')]
        examples_page = [Glob('https://crawlee.dev/docs/examples')]
        await context.enqueue_links(include=docs_pages, exclude=examples_page)

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/crawl_specific_links_on_website_pw.py
================================================
import asyncio

from crawlee import Glob
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with Playwright, enqueueing only selected links."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = PlaywrightCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Default request handler, called for every request."""
        current_url = context.request.url
        context.log.info(f'Processing {current_url} ...')

        # Enqueue the documentation links found on the page, except for the examples.
        docs_pages = [Glob('https://crawlee.dev/docs/**')]
        examples_page = [Glob('https://crawlee.dev/docs/examples')]
        await context.enqueue_links(include=docs_pages, exclude=examples_page)

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/crawl_website_with_relative_links_all_links.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev and enqueue links using the `all` strategy."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Default request handler, called for every request."""
        current_url = context.request.url
        context.log.info(f'Processing {current_url} ...')

        # With this strategy every URL found on the page is matched and enqueued,
        # even if it leads off the site that is currently being crawled.
        await context.enqueue_links(strategy='all')

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/crawl_website_with_relative_links_same_domain.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev and enqueue links using the `same-domain` strategy."""
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Setting the strategy to same domain will enqueue all links found that
        # share the domain name of request.loaded_url or request.url, including
        # any of its subdomains.
        await context.enqueue_links(strategy='same-domain')

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/crawl_website_with_relative_links_same_hostname.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev and enqueue links using the `same-hostname` strategy."""
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Setting the strategy to same hostname will enqueue only the links whose
        # hostname exactly matches the hostname of request.loaded_url or
        # request.url (other subdomains of the same domain are not matched).
        await context.enqueue_links(strategy='same-hostname')

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/crawl_website_with_relative_links_same_origin.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev and enqueue links using the `same-origin` strategy."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Default request handler, called for every request."""
        current_url = context.request.url
        context.log.info(f'Processing {current_url} ...')

        # With the same origin strategy, all links found that are on the same
        # origin as request.loaded_url or request.url are enqueued.
        await context.enqueue_links(strategy='same-origin')

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/export_entire_dataset_to_file_csv.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev, collect page titles and export the dataset to a CSV file."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Default request handler, called for every request."""
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page; the title is None when the page has no <title>.
        title_tag = context.soup.title
        page_title = title_tag.string if title_tag else None
        data = {'url': context.request.url, 'title': page_title}

        # Enqueue all links found on the page.
        await context.enqueue_links()

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)

    # Export the entire dataset to a CSV file, with a semicolon as the delimiter
    # and all values quoted.
    await crawler.export_data(path='results.csv', delimiter=';', quoting='all')


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/export_entire_dataset_to_file_json.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev, collect page titles and export the dataset to a JSON file."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Default request handler, called for every request."""
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page; the title is None when the page has no <title>.
        title_tag = context.soup.title
        page_title = title_tag.string if title_tag else None
        data = {'url': context.request.url, 'title': page_title}

        # Enqueue all links found on the page.
        await context.enqueue_links()

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)

    # Export the entire dataset to a JSON file. ensure_ascii=False allows
    # Unicode characters in the output.
    await crawler.export_data(path='results.json', ensure_ascii=False)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/extract_and_add_specific_links_on_website_bs.py
================================================
import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Extract links with BeautifulSoup, filter them manually and enqueue the rest."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Default request handler, called for every request."""
        context.log.info(f'Processing {context.request.url} ...')

        # Extract the documentation links found on the page, except for the examples.
        docs_pages = [Glob('https://crawlee.dev/docs/**')]
        examples_page = [Glob('https://crawlee.dev/docs/examples')]
        extracted_links = await context.extract_links(
            include=docs_pages,
            exclude=examples_page,
        )

        # Custom filtering that the `extract_links` arguments cannot express:
        # keep only the links whose URL is shorter than the limit.
        max_link_length = 30
        filtered_links = []
        for link in extracted_links:
            if len(link.url) < max_link_length:
                filtered_links.append(link)

        # Add the filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/extract_and_add_specific_links_on_website_pw.py
================================================
import asyncio

from crawlee import Glob
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Extract links with Playwright, filter them manually and enqueue the rest."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = PlaywrightCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Default request handler, called for every request."""
        context.log.info(f'Processing {context.request.url} ...')

        # Extract the documentation links found on the page, except for the examples.
        docs_pages = [Glob('https://crawlee.dev/docs/**')]
        examples_page = [Glob('https://crawlee.dev/docs/examples')]
        extracted_links = await context.extract_links(
            include=docs_pages,
            exclude=examples_page,
        )

        # Custom filtering that the `extract_links` arguments cannot express:
        # keep only the links whose URL is shorter than the limit.
        max_link_length = 30
        filtered_links = []
        for link in extracted_links:
            if len(link.url) < max_link_length:
                filtered_links.append(link)

        # Add the filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/fill_and_submit_web_form_crawler.py
================================================
import asyncio
from urllib.parse import urlencode

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Submit a URL-encoded form with a POST request and log the response."""
    crawler = HttpCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        response = (await context.http_response.read()).decode('utf-8')
        context.log.info(f'Response: {response}')  # To see the response in the logs.

    # Prepare a POST request to the form endpoint.
    request = Request.from_url(
        url='https://httpbin.org/post',
        method='POST',
        headers={'content-type': 'application/x-www-form-urlencoded'},
        payload=urlencode(
            {
                'custname': 'John Doe',
                'custtel': '1234567890',
                'custemail': 'johndoe@example.com',
                'size': 'large',
                'topping': ['bacon', 'cheese', 'mushroom'],
                'delivery': '13:00',
                'comments': 'Please ring the doorbell upon arrival.',
            },
            # Encode the `topping` list as repeated `topping=...` fields. Without
            # `doseq=True` the string representation of the list would be sent
            # as a single value.
            doseq=True,
        ).encode(),
    )

    # Run the crawler with the initial list of requests.
    await crawler.run([request])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/fill_and_submit_web_form_request.py
================================================
import asyncio
from urllib.parse import urlencode

from crawlee import Request


async def main() -> None:
    """Build a POST request that submits a URL-encoded form."""
    # Prepare a POST request to the form endpoint.
    request = Request.from_url(
        url='https://httpbin.org/post',
        method='POST',
        headers={'content-type': 'application/x-www-form-urlencoded'},
        payload=urlencode(
            {
                'custname': 'John Doe',
                'custtel': '1234567890',
                'custemail': 'johndoe@example.com',
                'size': 'large',
                'topping': ['bacon', 'cheese', 'mushroom'],
                'delivery': '13:00',
                'comments': 'Please ring the doorbell upon arrival.',
            },
            # Encode the `topping` list as repeated `topping=...` fields. Without
            # `doseq=True` the string representation of the list would be sent
            # as a single value.
            doseq=True,
        ).encode(),
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/parsel_crawler.py
================================================
import asyncio

from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext

# Pattern used to find email addresses in the text of a webpage.
EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'


async def main() -> None:
    """Crawl github.com with Parsel, collect titles and emails, export them to JSON."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = ParselCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        """Default request handler, called for every request."""
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        page_title = context.selector.xpath('//title/text()').get()
        found_emails = context.selector.re(EMAIL_REGEX)
        data = {
            'url': context.request.url,
            'title': page_title,
            'email_address_list': found_emails,
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

        # Enqueue all links found on the page.
        await context.enqueue_links()

    @crawler.pre_navigation_hook
    async def some_hook(context: BasicCrawlingContext) -> None:
        """Pre navigation hook, called before each request.

        The hook is optional and does not need to be defined at all; this one
        intentionally does nothing.
        """

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://github.com']
    await crawler.run(start_urls)

    # Export the entire dataset to a JSON file.
    await crawler.export_data(path='results.json')


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py
================================================
import asyncio
from random import choice

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.statistics import Statistics


async def main() -> None:
    """Crawl crawlee.dev while raising random errors, with error snapshots enabled."""
    statistics = Statistics.with_default_state(save_error_snapshots=True)
    crawler = ParselCrawler(statistics=statistics)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        """Enqueue the page links, then fail randomly with one of three errors."""
        context.log.info(f'Processing {context.request.url} ...')
        await context.enqueue_links()

        # Simulate various errors to demonstrate `ErrorSnapshotter`
        # saving only the first occurrence of unique error.
        simulated_errors = {
            1: KeyError('Some KeyError'),
            2: ValueError('Some ValueError'),
            3: RuntimeError('Some RuntimeError'),
        }
        random_number = choice(range(10))
        error = simulated_errors.get(random_number)
        if error is not None:
            raise error

    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/playwright_block_requests.py
================================================
import asyncio

from crawlee.crawlers import (
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
    PlaywrightPreNavCrawlingContext,
)


async def main() -> None:
    """Crawl crawlee.dev with Playwright while blocking selected browser requests."""
    # Cap on the number of requests per crawl; remove or increase it to crawl all links.
    crawler = PlaywrightCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Default request handler, called for every request."""
        current_url = context.request.url
        context.log.info(f'Processing {current_url} ...')

        await context.enqueue_links()

    @crawler.pre_navigation_hook
    async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        """Pre navigation hook, called before every request."""
        target_url = context.request.url
        context.log.info(f'Navigating to {target_url} ...')

        # Block all requests to URLs that include `adsbygoogle.js`, in addition
        # to all the default patterns.
        extra_patterns = ['adsbygoogle.js']
        await context.block_requests(extra_url_patterns=extra_patterns)

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/playwright_crawler.py
================================================
import asyncio

from crawlee.crawlers import (
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
    PlaywrightPreNavCrawlingContext,
)


async def main() -> None:
    """Scrape Hacker News posts with `PlaywrightCrawler`, following next-page links."""
    crawler = PlaywrightCrawler(
        # Any browser type supported by Playwright.
        browser_type='chromium',
        # Show the browser window; set to True to run in headless mode.
        headless=False,
        # Cap on processed requests. Raise or remove it to crawl all links.
        max_requests_per_crawl=10,
    )

    # Pre-navigation hook: runs each time before navigating to a new URL. Its
    # context gives access to the request and the browser page, among other
    # things. Here it only logs the URL being navigated to.
    @crawler.pre_navigation_hook
    async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None:
        context.log.info(f'Navigating to {context.request.url} ...')

    # Default request handler: runs for every request. Its context offers many
    # properties and helper methods; the ones used in this example are:
    # - request: a Request instance with details such as the crawled URL and the
    #   HTTP method used.
    # - page: Playwright's Page object for interacting with the web page
    #   (see https://playwright.dev/python/docs/api/class-page for more details).
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Walk over all posts on the page using Playwright's API.
        data = []
        for post in await context.page.query_selector_all('.athing'):
            # Locate the title link and the rank element inside the post.
            title_element = await post.query_selector('.title a')
            rank_element = await post.query_selector('.rank')

            # Read the values; anything whose element is missing stays None.
            title = rank = href = None
            if title_element:
                title = await title_element.inner_text()
            if rank_element:
                rank = await rank_element.inner_text()
            if title_element:
                href = await title_element.get_attribute('href')

            data.append({'title': title, 'rank': rank, 'href': href})

        # Store the results in the default dataset. In local configuration,
        # the data ends up as JSON files in ./storage/datasets/default.
        await context.push_data(data)

        # Enqueue the link to the next page, if there is one.
        await context.enqueue_links(selector='.morelink')

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://news.ycombinator.com/']
    await crawler.run(start_urls)


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/playwright_crawler_with_camoufox.py
================================================
import asyncio

# Camoufox is an external package and needs to be installed. It is not included in crawlee.
from camoufox import AsyncNewBrowser
from typing_extensions import override

from crawlee.browsers import (
    BrowserPool,
    PlaywrightBrowserController,
    PlaywrightBrowserPlugin,
)
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


class CamoufoxPlugin(PlaywrightBrowserPlugin):
    """Browser plugin that launches the Camoufox browser.

    Apart from the browser it starts, it keeps the functionality of
    `PlaywrightBrowserPlugin`.
    """

    @override
    async def new_browser(self) -> PlaywrightBrowserController:
        """Launch a Camoufox browser and wrap it in a browser controller.

        Raises:
            RuntimeError: If the plugin's Playwright instance is not set.
        """
        playwright = self._playwright
        if not playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        launch_options = self._browser_launch_options
        camoufox_browser = await AsyncNewBrowser(playwright, **launch_options)

        return PlaywrightBrowserController(
            browser=camoufox_browser,
            # Increase, if camoufox can handle it in your use case.
            max_open_pages_per_browser=1,
            # Camoufox generates its own headers, so the crawlee header
            # generation is switched off.
            header_generator=None,
        )


async def main() -> None:
    """Crawl Hacker News with a `PlaywrightCrawler` backed by the Camoufox browser."""
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
        # Custom browser pool. Gives users full control over browsers used by the crawler.
        browser_pool=BrowserPool(plugins=[CamoufoxPlugin()]),
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract some data from the page using Playwright's API.
        posts = await context.page.query_selector_all('.athing')
        data = []

        for post in posts:
            # Get the HTML element for the title within each post.
            title_element = await post.query_selector('.title a')

            # Extract the data we want from the element.
            title = await title_element.inner_text() if title_element else None

            # Collect one item per post. Pushing a single `title` after the loop
            # would store only the last post and would fail with an unbound
            # variable on pages without any posts.
            data.append({'title': title})

        # Push the extracted data to the default dataset.
        await context.push_data(data)

        # Find a link to the next page and enqueue it if it exists.
        await context.enqueue_links(selector='.morelink')

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://news.ycombinator.com/'])


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py
================================================
import asyncio
from random import choice

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.statistics import Statistics


async def main() -> None:
    """Run a Playwright crawl that fails at random to exercise error snapshots."""
    # Statistics configured to save error snapshots.
    statistics = Statistics.with_default_state(save_error_snapshots=True)
    crawler = PlaywrightCrawler(statistics=statistics)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        await context.enqueue_links()

        # Raise one of several error types at random. This demonstrates that
        # `ErrorSnapshotter` saves only the first occurrence of each unique error.
        random_number = choice(range(10))
        if random_number == 1:
            raise KeyError('Some KeyError')
        elif random_number == 2:
            raise ValueError('Some ValueError')
        elif random_number == 3:
            raise RuntimeError('Some RuntimeError')

    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.fingerprint_suite import (
    DefaultFingerprintGenerator,
    HeaderGeneratorOptions,
    ScreenOptions,
)


async def main() -> None:
    """Crawl Hacker News with Playwright using a configured fingerprint generator."""
    # Describe the desired fingerprint. The default generator produces a
    # real-looking browser fingerprint from these options and automatically
    # selects every option that is left unspecified.
    header_options = HeaderGeneratorOptions(browsers=['chrome'])
    screen_options = ScreenOptions(min_width=400)
    fingerprint_generator = DefaultFingerprintGenerator(
        header_options=header_options,
        screen_options=screen_options,
    )

    crawler = PlaywrightCrawler(
        # Fingerprint generator the crawler should use.
        fingerprint_generator=fingerprint_generator,
        # Any browser type supported by Playwright.
        browser_type='chromium',
        # Show the browser window; set to True to run in headless mode.
        headless=False,
        # Cap on processed requests. Raise or remove it to crawl all links.
        max_requests_per_crawl=10,
    )

    # Default request handler: invoked for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue the link to the next page, if there is one.
        await context.enqueue_links(selector='.morelink')

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://news.ycombinator.com/']
    await crawler.run(start_urls)


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/respect_robots_on_skipped_request.py
================================================
import asyncio

from crawlee import SkippedReason
from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    """Crawl with robots.txt compliance and log every URL skipped because of it."""
    # Create the crawler with robots.txt compliance turned on
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # highlight-start
    # Callback invoked whenever a request is skipped
    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        # Only requests skipped due to robots.txt rules are reported
        if reason != 'robots_txt':
            return
        crawler.log.info(f'Skipped {url} due to robots.txt rules.')

    # highlight-end

    # Run the crawler on the given URLs. The login URL will be skipped and
    # passed to the skipped_request_handler
    start_urls = [
        'https://news.ycombinator.com/',
        'https://news.ycombinator.com/login',
    ]
    await crawler.run(start_urls)


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/respect_robots_txt_file.py
================================================
import asyncio

from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    """Crawl Hacker News while honoring the site's robots.txt file."""
    # Create the crawler with robots.txt compliance turned on
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # The crawler consults the robots.txt file before making requests.
    # In this example, 'https://news.ycombinator.com/login' will be skipped
    # because the site's robots.txt file disallows it
    start_urls = [
        'https://news.ycombinator.com/',
        'https://news.ycombinator.com/login',
    ]
    await crawler.run(start_urls)


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/resuming_paused_crawl.py
================================================
import asyncio

from crawlee import ConcurrencySettings, service_locator
from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)

# Disable clearing the `RequestQueue`, `KeyValueStore` and `Dataset` on each run,
# so that the scraper continues from where it left off in the previous run.
# The recommended way to achieve this behavior is to set the environment variable
# `CRAWLEE_PURGE_ON_START=0`; this example changes the configuration in code instead.
configuration = service_locator.get_configuration()
configuration.purge_on_start = False


async def main() -> None:
    """Crawl a fixed list of crawlee.dev pages at a deliberately slow pace."""
    # Throttle the crawler for demonstration purposes
    slow_pace = ConcurrencySettings(max_tasks_per_minute=20)
    crawler = BeautifulSoupCrawler(concurrency_settings=slow_pace)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Links to crawl
    start_urls = [
        'https://crawlee.dev',
        'https://crawlee.dev/python/docs',
        'https://crawlee.dev/python/docs/examples',
        'https://crawlee.dev/python/docs/guides',
        'https://crawlee.dev/python/docs/quick-start',
    ]
    await crawler.run(start_urls)


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/run_parallel_crawlers.py
================================================
import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import (
    ParselCrawler,
    ParselCrawlingContext,
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
)
from crawlee.sessions import SessionPool
from crawlee.storages import RequestQueue


async def main() -> None:
    """Run a Parsel crawler and a Playwright crawler side by side.

    The Parsel crawler starts from the blog page and routes links containing
    `/blog/` to the Playwright crawler's request queue, while all other links
    stay in its own queue. The Playwright crawler runs in the background with
    `keep_alive=True` and is stopped once its queue is reported empty.
    """
    # Open request queues for both crawlers with different aliases
    playwright_rq = await RequestQueue.open(alias='playwright-requests')
    parsel_rq = await RequestQueue.open(alias='parsel-requests')

    # Use a shared session pool between both crawlers
    async with SessionPool() as session_pool:
        playwright_crawler = PlaywrightCrawler(
            # Set the request queue for Playwright crawler
            request_manager=playwright_rq,
            session_pool=session_pool,
            # Configure concurrency settings for Playwright crawler
            concurrency_settings=ConcurrencySettings(
                max_concurrency=5, desired_concurrency=5
            ),
            # Set `keep_alive` so that the crawler does not stop working when there are
            # no requests in the queue.
            keep_alive=True,
        )

        parsel_crawler = ParselCrawler(
            # Set the request queue for Parsel crawler
            request_manager=parsel_rq,
            session_pool=session_pool,
            # Configure concurrency settings for Parsel crawler
            concurrency_settings=ConcurrencySettings(
                max_concurrency=10, desired_concurrency=10
            ),
            # Set maximum requests per crawl for Parsel crawler
            max_requests_per_crawl=50,
        )

        @playwright_crawler.router.default_handler
        async def handle_playwright(context: PlaywrightCrawlingContext) -> None:
            context.log.info(f'Playwright Processing {context.request.url}...')

            title = await context.page.title()
            # Push the extracted data to the dataset for Playwright crawler
            await context.push_data(
                {'title': title, 'url': context.request.url, 'source': 'playwright'},
                dataset_name='playwright-data',
            )

        @parsel_crawler.router.default_handler
        async def handle_parsel(context: ParselCrawlingContext) -> None:
            context.log.info(f'Parsel Processing {context.request.url}...')

            title = context.parsed_content.css('title::text').get()
            # Push the extracted data to the dataset for Parsel crawler
            await context.push_data(
                {'title': title, 'url': context.request.url, 'source': 'parsel'},
                dataset_name='parsel-data',
            )

            # Enqueue links to the Playwright request queue for blog pages
            await context.enqueue_links(
                selector='a[href*="/blog/"]', rq_alias='playwright-requests'
            )
            # Enqueue other links to the Parsel request queue
            await context.enqueue_links(selector='a:not([href*="/blog/"])')

        # Start the Playwright crawler in the background. It gets no start URLs;
        # its requests arrive through the 'playwright-requests' queue.
        background_crawler_task = asyncio.create_task(playwright_crawler.run([]))

        # Run the Parsel crawler with the initial URL and wait for it to finish
        await parsel_crawler.run(['https://crawlee.dev/blog'])

        # Wait for the Playwright crawler to finish processing all requests
        # NOTE(review): this polls `is_empty()`; confirm that it also accounts for
        # requests that are still being processed before the crawler is stopped.
        while not await playwright_rq.is_empty():
            playwright_crawler.log.info('Waiting for Playwright crawler to finish...')
            await asyncio.sleep(5)

        # Stop the Playwright crawler after all requests are processed
        playwright_crawler.stop()

        # Wait for the background Playwright crawler task to complete
        await background_crawler_task


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/using_browser_profiles_chrome.py
================================================
import asyncio
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

# Name of the profile to use (usually 'Default' for single profile setups)
PROFILE_NAME = 'Default'

# Location of the Chrome profiles on your system (this example is for Windows).
# Open `chrome://version/` to find your profile path.
PROFILE_PATH = Path.home() / 'AppData' / 'Local' / 'Google' / 'Chrome' / 'User Data'


async def main() -> None:
    """Crawl with the installed Chrome using a temporary copy of a local profile."""
    # The profile is copied into a temporary folder and the crawler works on
    # that copy.
    with TemporaryDirectory(prefix='crawlee-') as tmpdirname:
        tmp_profile_dir = Path(tmpdirname)

        # Copy the selected profile into the temporary folder
        source_profile = PROFILE_PATH / PROFILE_NAME
        target_profile = tmp_profile_dir / PROFILE_NAME
        shutil.copytree(source_profile, target_profile, dirs_exist_ok=True)

        launch_options = {
            # Slow down actions to mimic human behavior
            'slow_mo': 200,
            # Tell the browser which profile to use
            'args': [f'--profile-directory={PROFILE_NAME}'],
        }

        crawler = PlaywrightCrawler(
            # Use the installed Chrome browser
            browser_type='chrome',
            headless=False,
            # No fingerprint generation, to preserve the profile identity
            fingerprint_generator=None,
            # The temporary folder serves as the user data directory
            user_data_dir=tmp_profile_dir,
            browser_launch_options=launch_options,
        )

        @crawler.router.default_handler
        async def default_handler(context: PlaywrightCrawlingContext) -> None:
            context.log.info(f'Visiting {context.request.url}')

        start_urls = ['https://crawlee.dev/']
        await crawler.run(start_urls)


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/using_browser_profiles_firefox.py
================================================
import asyncio
from pathlib import Path

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

# Placeholder: put your actual Firefox profile name here.
# You can look it up at about:profiles in Firefox.
PROFILE_NAME = 'your-profile-name-here'

# Location of the Firefox profile on your system (this example is for Windows).
# Open `about:profiles` to find your profile path.
_FIREFOX_PROFILES_DIR = ('AppData', 'Roaming', 'Mozilla', 'Firefox', 'Profiles')
PROFILE_PATH = Path.home().joinpath(*_FIREFOX_PROFILES_DIR, PROFILE_NAME)


async def main() -> None:
    """Crawl crawlee.dev with Firefox running on an existing local profile."""
    launch_options = {
        # Required to avoid version conflicts
        'args': ['--allow-downgrade'],
    }

    crawler = PlaywrightCrawler(
        # Firefox is the browser type to launch
        browser_type='firefox',
        headless=False,
        # No fingerprint generation, so the profile is used as is
        fingerprint_generator=None,
        # Your Firefox profile serves as the user data directory
        user_data_dir=PROFILE_PATH,
        browser_launch_options=launch_options,
    )

    @crawler.router.default_handler
    async def default_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visiting {context.request.url}')

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/code_examples/using_sitemap_request_loader.py
================================================
import asyncio
from collections.abc import Callable

from yarl import URL

from crawlee import RequestOptions, RequestTransformAction
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


# Create a transform_request_function that maps request options based on the host in
# the URL
def create_transform_request(
    data_mapper: dict[str, dict],
) -> Callable[[RequestOptions], RequestOptions | RequestTransformAction]:
    """Build a transform function that fills request options per URL host.

    Args:
        data_mapper: Maps a host name to the values to apply. The optional
            keys `label` and `user_data` are copied into the request options.

    Returns:
        A function that returns the updated request options when the request
        host has a non-empty mapping, and `'unchanged'` otherwise.
    """

    def transform_request(
        request_options: RequestOptions,
    ) -> RequestOptions | RequestTransformAction:
        # The Sitemap protocol requires all URLs in a Sitemap to be from a
        # single host, so the host is enough to pick the mapping.
        host = URL(request_options['url']).host
        mapping_data = data_mapper.get(host) if host else None

        # No host, no entry for it, or an empty entry: leave the request as is.
        if not mapping_data:
            return 'unchanged'

        # Copy over only the properties present in the mapping data.
        if 'label' in mapping_data:
            request_options['label'] = mapping_data['label']
        if 'user_data' in mapping_data:
            request_options['user_data'] = mapping_data['user_data']

        return request_options

    return transform_request


async def main() -> None:
    """Crawl URLs from two sitemaps, labeling each request by the host it is from."""
    sitemap_urls = ['https://crawlee.dev/sitemap.xml', 'https://apify.com/sitemap.xml']

    # Extract the hosts that serve as keys of the data mapping
    apify_host = URL('https://apify.com/sitemap.xml').host
    crawlee_host = URL('https://crawlee.dev/sitemap.xml').host

    if not (apify_host and crawlee_host):
        raise ValueError('Unable to extract host from URLs')

    # Per-host label and user data applied by the transform function
    data_map = {
        apify_host: {'label': 'apify', 'user_data': {'source': 'apify'}},
        crawlee_host: {'label': 'crawlee', 'user_data': {'source': 'crawlee'}},
    }

    # Set up the SitemapRequestLoader with the sitemap URLs, the HTTP client
    # and the transform function
    async with SitemapRequestLoader(
        sitemap_urls=sitemap_urls,
        http_client=ImpitHttpClient(),
        transform_request_function=create_transform_request(data_map),
    ) as sitemap_loader:
        # Turn the sitemap loader into a request manager
        request_manager = await sitemap_loader.to_tandem()

        # Build the crawler on top of that request manager
        crawler = BeautifulSoupCrawler(
            max_requests_per_crawl=10,
            request_manager=request_manager,
        )

        # Handler for requests labeled 'apify'
        @crawler.router.handler('apify')
        async def apify_handler(context: BeautifulSoupCrawlingContext) -> None:
            source = context.request.user_data.get('source', 'unknown')
            context.log.info(
                f'Apify handler processing: {context.request.url} from source: {source}'
            )

        # Handler for requests labeled 'crawlee'
        @crawler.router.handler('crawlee')
        async def crawlee_handler(context: BeautifulSoupCrawlingContext) -> None:
            source = context.request.user_data.get('source', 'unknown')
            context.log.info(
                f'Crawlee handler processing: {context.request.url} from source: {source}'
            )

        # Default handler for requests without a specific label
        @crawler.router.default_handler
        async def handler(context: BeautifulSoupCrawlingContext) -> None:
            source = context.request.user_data.get('source', 'unknown')
            context.log.info(
                f'Processing request: {context.request.url} from source: {source}'
            )

        await crawler.run()


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/examples/crawl_all_links_on_website.mdx
================================================
---
id: crawl-all-links-on-website
title: Crawl all links on website
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_all_links_on_website_bs.py';
import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_all_links_on_website_pw.py';

This example uses the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> helper to add new links to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> as the crawler navigates from page to page. By automatically discovering and enqueuing all links on a given page, the crawler can systematically scrape an entire website. This approach is ideal for web scraping tasks where you need to collect data from multiple interconnected pages.

:::tip

If no options are given, the method will by default only add links that point to the same hostname as the current page. This behavior can be controlled with the `strategy` option, which accepts one of the values of the `EnqueueStrategy` type alias. You can find more info about this option in the [Crawl website with relative links](./crawl-website-with-relative-links) example.

:::

<Tabs groupId="main">
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {BeautifulSoupExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {PlaywrightExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>


================================================
FILE: docs/examples/crawl_multiple_urls.mdx
================================================
---
id: crawl-multiple-urls
title: Crawl multiple URLs
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_bs.py';
import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_pw.py';

This example demonstrates how to crawl a specified list of URLs using different crawlers. You'll learn how to set up the crawler, define a request handler, and run the crawler with multiple URLs. This setup is useful for scraping data from multiple pages or websites concurrently.

<Tabs groupId="main">
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {BeautifulSoupExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {PlaywrightExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>


================================================
FILE: docs/examples/crawl_specific_links_on_website.mdx
================================================
---
id: crawl-specific-links-on-website
title: Crawl specific links on website
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_bs.py';
import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_pw.py';

import BeautifulSoupExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_bs.py';
import PlaywrightExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_pw.py';

This example demonstrates how to crawl a website while targeting specific patterns of links. By utilizing the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> helper, you can pass `include` or `exclude` parameters to improve your crawling strategy. This approach ensures that only the links matching the specified patterns are added to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. Both `include` and `exclude` support lists of globs or regular expressions. This functionality is great for focusing on relevant sections of a website and avoiding scraping unnecessary or irrelevant content.

<Tabs groupId="first-example">
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {BeautifulSoupExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {PlaywrightExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

## Even more control over the enqueued links

<ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> is a convenience helper; internally it calls <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> to find the links and <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> to add them to the queue. If you need additional custom filtering of the extracted links before enqueuing them, consider using <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> and <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> instead of <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>.

<Tabs groupId="second-example">
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {BeautifulSoupExampleExtractAndAdd}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {PlaywrightExampleExtractAndAdd}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>


================================================
FILE: docs/examples/crawl_website_with_relative_links.mdx
================================================
---
id: crawl-website-with-relative-links
title: Crawl website with relative links
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import AllLinksExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_all_links.py';
import SameDomainExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_domain.py';
import SameHostnameExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_hostname.py';
import SameOriginExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_origin.py';

When crawling a website, you may encounter various types of links that you wish to include in your crawl. To facilitate this, we provide the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> method on the crawler context, which will automatically find and add these links to the crawler's <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. This method simplifies the process of handling different types of links, including relative links, by automatically resolving them based on the page's context.

:::note

For these examples, we are using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. However, the same method is available for other crawlers as well. You can use it in exactly the same way.

:::

`EnqueueStrategy` type alias provides four distinct strategies for crawling relative links:

- `all` - Enqueues all links found, regardless of the domain they point to. This strategy is useful when you want to follow every link, including those that navigate to external websites.
- `same-domain` - Enqueues all links found that share the same domain name, including any possible subdomains. This strategy ensures that all links within the same top-level and base domain are included.
- `same-hostname` - Enqueues all links found for the exact same hostname. This is the **default** strategy, and it restricts the crawl to links that have the same hostname as the current page, excluding subdomains.
- `same-origin` - Enqueues all links found that share the same origin. The same origin refers to URLs that share the same protocol, domain, and port, ensuring a strict scope for the crawl.

<Tabs groupId="main">
    <TabItem value="all_links" label="All links">
        <RunnableCodeBlock className="language-python" language="python">
            {AllLinksExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="same-domain" label="Same domain">
        <RunnableCodeBlock className="language-python" language="python">
            {SameDomainExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="same-hostname" label="Same hostname">
        <RunnableCodeBlock className="language-python" language="python">
            {SameHostnameExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="same-origin" label="Same origin">
        <RunnableCodeBlock className="language-python" language="python">
            {SameOriginExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>


================================================
FILE: docs/examples/crawler_keep_alive.mdx
================================================
---
id: crawler-keep-alive
title: Keep a Crawler alive waiting for more requests
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_keep_alive.py';

This example demonstrates how to keep a crawler alive even when there are no requests at the moment by using the `keep_alive=True` argument of <ApiLink to="class/BasicCrawler#__init__">`BasicCrawler.__init__`</ApiLink>. This is available to all crawlers that inherit from <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink>; in the example below, it is shown on <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. To stop a crawler that was started with `keep_alive=True`, call `crawler.stop()`.

<RunnableCodeBlock className="language-python" language="python">
    {BeautifulSoupExample}
</RunnableCodeBlock>


================================================
FILE: docs/examples/crawler_stop.mdx
================================================
---
id: crawler-stop
title: Stopping a Crawler with stop method
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_stop.py';

This example demonstrates how to use the `stop` method of <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink> to stop the crawler once it finds what it is looking for. This method is available to all crawlers that inherit from <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink>; in the example below, it is shown on <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. Simply call `crawler.stop()` to stop the crawler. It will not continue to crawl through new requests, but requests that are already being processed concurrently will be finished. The `stop` method also accepts an optional `reason` argument, a string that will be used in logs; it can improve log readability, especially if you have multiple different conditions for triggering `stop`.

<RunnableCodeBlock className="language-python" language="python">
    {BeautifulSoupExample}
</RunnableCodeBlock>


================================================
FILE: docs/examples/export_entire_dataset_to_file.mdx
================================================
---
id: export-entire-dataset-to-file
title: Export entire dataset to file
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import JsonExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_json.py';
import CsvExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_csv.py';

This example demonstrates how to use the <ApiLink to="class/BasicCrawler#export_data">`BasicCrawler.export_data`</ApiLink> method of the crawler to export the entire default dataset to a single file. This method supports exporting data in either CSV or JSON format and also accepts additional keyword arguments so you can fine-tune the underlying `json.dump` or `csv.writer` behavior.

:::note

For these examples, we are using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. However, the same method is available for other crawlers as well. You can use it in exactly the same way.

:::

<Tabs groupId="main">
    <TabItem value="json" label="JSON">
        <RunnableCodeBlock className="language-python" language="python">
            {JsonExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="csv" label="CSV">
        <RunnableCodeBlock className="language-python" language="python">
            {CsvExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>


================================================
FILE: docs/examples/fill_and_submit_web_form.mdx
================================================
---
id: fill-and-submit-web-form
title: Fill and submit web form
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RequestExample from '!!raw-loader!roa-loader!./code_examples/fill_and_submit_web_form_request.py';
import CrawlerExample from '!!raw-loader!roa-loader!./code_examples/fill_and_submit_web_form_crawler.py';

This example demonstrates how to fill and submit a web form using the <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> crawler. The same approach applies to any crawler that inherits from it, such as the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> or <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>.

We are going to use the [httpbin.org](https://httpbin.org) website to demonstrate how it works.

## Investigate the form fields

First, we need to examine the form fields and the form's action URL. You can do this by opening the [httpbin.org/forms/post](https://httpbin.org/forms/post) page in a browser and inspecting the form fields.

In Chrome, right-click on the page and select "Inspect" or press `Ctrl+Shift+I`.
Use the element selector (`Ctrl+Shift+C`) to click on the form element you want to inspect.

![HTML input element name](/img/fill-and-submit-web-form/00.jpg 'HTML input element name.')

Identify the field names. For example, the customer name field is `custname`, the email field is `custemail`, and the phone field is `custtel`.

Now navigate to the "Network" tab in developer tools and submit the form by clicking the "Submit order" button.

![Submitting the form](/img/fill-and-submit-web-form/01.jpg 'Submitting the form.')

Find the form submission request and examine its details. The "Headers" tab will show the submission URL, in this case, it is `https://httpbin.org/post`.

![Network request investigation](/img/fill-and-submit-web-form/02.jpg 'Network request investigation.')

The "Payload" tab will display the form fields and their submitted values. This method could be an alternative to inspecting the HTML source code directly.

![Network payload investigation](/img/fill-and-submit-web-form/03.jpg 'Network payload investigation.')

## Preparing a POST request

Now, let's create a POST request with the form fields and their values using the <ApiLink to="class/Request">`Request`</ApiLink> class, specifically its <ApiLink to="class/Request#from_url">`Request.from_url`</ApiLink> constructor:

<RunnableCodeBlock className="language-python" language="python">
    {RequestExample}
</RunnableCodeBlock>

Alternatively, you can send form data as URL parameters using the `url` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` is generally a better approach.

## Implementing the crawler

Finally, let's implement the crawler and run it with the prepared request. Although we are using the <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>, the process is the same for any crawler that inherits from it.

<RunnableCodeBlock className="language-python" language="python">
    {CrawlerExample}
</RunnableCodeBlock>

## Running the crawler

Finally, run your crawler. Your logs should show something like this:

```plaintext
...
[crawlee.http_crawler._http_crawler] INFO  Processing https://httpbin.org/post ...
[crawlee.http_crawler._http_crawler] INFO  Response: {
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "comments": "Please ring the doorbell upon arrival.",
    "custemail": "johndoe@example.com",
    "custname": "John Doe",
    "custtel": "1234567890",
    "delivery": "13:00",
    "size": "large",
    "topping": [
      "bacon",
      "cheese",
      "mushroom"
    ]
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Content-Length": "190",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "httpbin.org",
    "User-Agent": "python-httpx/0.27.0",
    "X-Amzn-Trace-Id": "Root=1-66c849d6-1ae432fb7b4156e6149ff37f"
  },
  "json": null,
  "origin": "78.80.81.196",
  "url": "https://httpbin.org/post"
}

[crawlee._autoscaling.autoscaled_pool] INFO  Waiting for remaining tasks to finish
[crawlee.http_crawler._http_crawler] INFO  Final request statistics:
┌───────────────────────────────┬──────────┐
│ requests_finished             │ 1        │
│ requests_failed               │ 0        │
│ retry_histogram               │ [1]      │
│ request_avg_failed_duration   │ None     │
│ request_avg_finished_duration │ 0.678442 │
│ requests_finished_per_minute  │ 85       │
│ requests_failed_per_minute    │ 0        │
│ request_total_duration        │ 0.678442 │
│ requests_total                │ 1        │
│ crawler_runtime               │ 0.707666 │
└───────────────────────────────┴──────────┘
```

This log output confirms that the crawler successfully submitted the form and processed the response. Congratulations! You have successfully filled and submitted a web form using the <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>.


================================================
FILE: docs/examples/json_logging.mdx
================================================
---
id: configure-json-logging
title: Configure JSON logging
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import JsonLoggingExample from '!!raw-loader!roa-loader!./code_examples/configure_json_logging.py';

This example demonstrates how to configure JSON line (JSONL) logging with Crawlee. By using the `use_table_logs=False` parameter, you can disable table-formatted statistics logs, which makes it easier to parse logs with external tools or to serialize them as JSON.

The example shows how to integrate with the popular [`loguru`](https://github.com/delgan/loguru) library to capture Crawlee logs and format them as JSONL (one JSON object per line). This approach works well when you need to collect logs for analysis, monitoring, or when integrating with logging platforms like ELK Stack, Grafana Loki, or similar systems.

<RunnableCodeBlock className="language-python" language="python">
    {JsonLoggingExample}
</RunnableCodeBlock>

Here's an example of what a crawler statistics log entry looks like in JSONL format:

```json
{
    "text": "[HttpCrawler] |   INFO   | - Final request statistics: {'requests_finished': 1, 'requests_failed': 0, 'retry_histogram': [1], 'request_avg_failed_duration': None, 'request_avg_finished_duration': 3.57098, 'requests_finished_per_minute': 17, 'requests_failed_per_minute': 0, 'request_total_duration': 3.57098, 'requests_total': 1, 'crawler_runtime': 3.59165}\n",
    "record": {
        "elapsed": { "repr": "0:00:05.604568", "seconds": 5.604568 },
        "exception": null,
        "extra": {
            "requests_finished": 1,
            "requests_failed": 0,
            "retry_histogram": [1],
            "request_avg_failed_duration": null,
            "request_avg_finished_duration": 3.57098,
            "requests_finished_per_minute": 17,
            "requests_failed_per_minute": 0,
            "request_total_duration": 3.57098,
            "requests_total": 1,
            "crawler_runtime": 3.59165
        },
        "file": {
            "name": "_basic_crawler.py",
            "path": "/crawlers/_basic/_basic_crawler.py"
        },
        "function": "run",
        "level": { "icon": "ℹ️", "name": "INFO", "no": 20 },
        "line": 583,
        "message": "Final request statistics:",
        "module": "_basic_crawler",
        "name": "HttpCrawler",
        "process": { "id": 198383, "name": "MainProcess" },
        "thread": { "id": 135312814966592, "name": "MainThread" },
        "time": {
            "repr": "2025-03-17 17:14:45.339150+00:00",
            "timestamp": 1742231685.33915
        }
    }
}
```


================================================
FILE: docs/examples/parsel_crawler.mdx
================================================
---
id: parsel-crawler
title: Parsel crawler
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler.py';

This example shows how to use <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> to crawl a website or a list of URLs. Each URL is loaded using a plain HTTP request and the response is parsed using the [Parsel](https://pypi.org/project/parsel/) library, which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract the page title, URL and emails found in the webpage. The default handler will scrape data from the current webpage and enqueue all the links found in the webpage for continuous scraping. It also shows how you can add an optional pre-navigation hook to the crawler. Pre-navigation hooks are user-defined functions that execute before sending the request.

<RunnableCodeBlock className="language-python" language="python">
    {ParselCrawlerExample}
</RunnableCodeBlock>


================================================
FILE: docs/examples/playwright_crawler.mdx
================================================
---
id: playwright-crawler
title: Playwright crawler
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler.py';

This example demonstrates how to use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> to recursively scrape the Hacker News website using headless Chromium and Playwright.

The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> manages the browser and page instances, simplifying the process of interacting with web pages. In the request handler, Playwright's API is used to extract data from each post on the page. Specifically, it retrieves the title, rank, and URL of each post. Additionally, the handler enqueues links to the next pages to ensure continuous scraping. This setup is ideal for scraping dynamic web pages where JavaScript execution is required to render the content.

A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation.

<RunnableCodeBlock className="language-python" language="python">
    {PlaywrightCrawlerExample}
</RunnableCodeBlock>


================================================
FILE: docs/examples/playwright_crawler_adaptive.mdx
================================================
---
id: adaptive-playwright-crawler
title: Adaptive Playwright crawler
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import AdaptivePlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/adaptive_playwright_crawler.py';

This example demonstrates how to use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink>. An <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> is a combination of <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and some implementation of HTTP-based crawler such as <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>.
It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit.

A [pre-navigation hook](/python/docs/guides/adaptive-playwright-crawler#page-configuration-with-pre-navigation-hooks) can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation. Hooks will be executed both for the pages crawled by the HTTP-based sub crawler and for those crawled by the Playwright-based sub crawler. Use `playwright_only=True` to mark hooks that should be executed only for the Playwright sub crawler.

For a more detailed description, please see the [Adaptive Playwright crawler guide](/python/docs/guides/adaptive-playwright-crawler).

<RunnableCodeBlock className="language-python" language="python">
    {AdaptivePlaywrightCrawlerExample}
</RunnableCodeBlock>


================================================
FILE: docs/examples/playwright_crawler_with_block_requests.mdx
================================================
---
id: playwright-crawler-with-block-requests
title: Playwright crawler with block requests
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import PlaywrightBlockRequests from '!!raw-loader!roa-loader!./code_examples/playwright_block_requests.py';

This example demonstrates how to optimize your <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> performance by blocking unnecessary network requests.

The primary use case is when you need to scrape or interact with web pages without loading non-essential resources like images, styles, or analytics scripts. This can significantly reduce bandwidth usage and improve crawling speed.

The <ApiLink to="class/BlockRequestsFunction">`block_requests`</ApiLink> helper provides the most efficient way to block requests as it operates directly in the browser.

By default, <ApiLink to="class/BlockRequestsFunction">`block_requests`</ApiLink> will block all URLs including the following patterns:

```python
['.css', '.webp', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip']
```

You can also replace the default patterns list with your own by providing `url_patterns`, or extend it by passing additional patterns in `extra_url_patterns`.

<RunnableCodeBlock className="language-python" language="python">
    {PlaywrightBlockRequests}
</RunnableCodeBlock>


================================================
FILE: docs/examples/playwright_crawler_with_camoufox.mdx
================================================
---
id: playwright-crawler-with-camoufox
title: Playwright crawler with Camoufox
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import PlaywrightCrawlerExampleWithCamoufox from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_camoufox.py';

This example demonstrates how to integrate Camoufox into <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> with custom <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>.

Camoufox is a stealthy minimalistic build of Firefox. For details please visit its homepage https://camoufox.com/ .
To run this example, you will need to install Camoufox, as it is an external tool and is not part of Crawlee. For installation instructions, please see https://pypi.org/project/camoufox/.

**Warning!** Camoufox uses a custom build of Firefox. This build can be hundreds of MB in size.
You can either pre-download this file using the following command `python3 -m camoufox fetch`, or Camoufox will download it automatically the first time you run it if it does not find an existing binary.
For more details please refer to: https://github.com/daijro/camoufox/tree/main/pythonlib#camoufox-python-interface

**Project template -** You can generate a Python project that includes the Camoufox integration with Crawlee through the Crawlee CLI. Call `crawlee create` and pick `Playwright-camoufox` when asked for Crawler type.

The example code after the `PlaywrightCrawler` instantiation is similar to the example describing the use of the Playwright crawler. The main difference is that in this example Camoufox will be used as the browser through `BrowserPool`.

<RunnableCodeBlock className="language-python" language="python">
    {PlaywrightCrawlerExampleWithCamoufox}
</RunnableCodeBlock>


================================================
FILE: docs/examples/playwright_crawler_with_fingerprint_generator.mdx
================================================
---
id: playwright-crawler-with-fingerprint-generator
title: Playwright crawler with fingerprint generator
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_fingerprint_generator.py';

This example demonstrates how to use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> together with <ApiLink to="class/FingerprintGenerator">`FingerprintGenerator`</ApiLink> that will populate several browser attributes to mimic real browser fingerprint. To read more about fingerprints please see: https://docs.apify.com/academy/anti-scraping/techniques/fingerprinting.

You can implement your own fingerprint generator or use <ApiLink to="class/BrowserforgeFingerprintGenerator">`DefaultFingerprintGenerator`</ApiLink>. To use the generator initialize it with the desired fingerprint options. The generator will try to create fingerprint based on those options. Unspecified options will be automatically selected by the generator from the set of reasonable values. If some option is important for you, do not rely on the default and explicitly define it.

<RunnableCodeBlock className="language-python" language="python">
    {PlaywrightCrawlerExample}
</RunnableCodeBlock>


================================================
FILE: docs/examples/respect_robots_txt_file.mdx
================================================
---
id: respect-robots-txt-file
title: Respect robots.txt file
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py';
import OnSkippedRequest from '!!raw-loader!roa-loader!./code_examples/respect_robots_on_skipped_request.py';

This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file.

To configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in <ApiLink to="class/BasicCrawlerOptions">`BasicCrawlerOptions`</ApiLink>. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file.

As an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped.

The code below demonstrates this behavior using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>:

<RunnableCodeBlock className="language-python" language="python">
    {RespectRobotsTxt}
</RunnableCodeBlock>

## Handle with `on_skipped_request`

If you want to process URLs skipped according to the `robots.txt` rules, for example for further analysis, you should use the `on_skipped_request` handler from <ApiLink to="class/BasicCrawler#on_skipped_request">`BasicCrawler`</ApiLink>.

Let's update the code by adding the `on_skipped_request` handler:

<RunnableCodeBlock className="language-python" language="python">
    {OnSkippedRequest}
</RunnableCodeBlock>


================================================
FILE: docs/examples/resuming_paused_crawl.mdx
================================================
---
id: resuming-paused-crawl
title: Resuming a paused crawl
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import ResumeCrawl from '!!raw-loader!roa-loader!./code_examples/resuming_paused_crawl.py';

This example demonstrates how to resume crawling from its last state when running locally, if for some reason it was unexpectedly terminated.

If each run should continue crawling from the previous state, you can configure this using `purge_on_start` in <ApiLink to="class/Configuration">`Configuration`</ApiLink>.

Use the code below and perform 2 sequential runs. During the 1st run, stop the crawler by pressing `CTRL+C`, and the 2nd run will resume crawling from where it stopped.

<RunnableCodeBlock className="language-python" language="python">
    {ResumeCrawl}
</RunnableCodeBlock>

Perform the 1st run, interrupting the crawler with `CTRL+C` after 2 links have been processed.

![Run with interruption](/img/resuming-paused-crawl/00.webp 'Run with interruption.')

Now resume crawling after the pause to process the remaining 3 links.

![Resuming crawling](/img/resuming-paused-crawl/01.webp 'Resuming crawling.')

Alternatively, use the environment variable `CRAWLEE_PURGE_ON_START=0` instead of using `configuration.purge_on_start = False`.

For example, when running code:

```bash
CRAWLEE_PURGE_ON_START=0 python -m best_crawler
```


================================================
FILE: docs/examples/run_parallel_crawlers.mdx
================================================
---
id: run-parallel-crawlers
title: Run parallel crawlers
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RunParallelCrawlersExample from '!!raw-loader!roa-loader!./code_examples/run_parallel_crawlers.py';

This example demonstrates how to run two parallel crawlers where one crawler processes links discovered by another crawler.

In some situations, you may need different approaches for scraping data from a website. For example, you might use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> for navigating JavaScript-heavy pages and a faster, more lightweight <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> for processing static pages. One way to solve this is to use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink>, see the [Adaptive Playwright crawler example](./adaptive-playwright-crawler) to learn more.

The code below demonstrates an alternative approach using two separate crawlers. Links are passed between crawlers via <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> aliases. The `keep_alive` option allows the Playwright crawler to run in the background and wait for incoming links without stopping when its queue is empty. You can also use different storage clients for each crawler without losing the ability to pass links between queues. Learn more about available storage clients in this [guide](/python/docs/guides/storage-clients).

<RunnableCodeBlock className="language-python" language="python">
    {RunParallelCrawlersExample}
</RunnableCodeBlock>


================================================
FILE: docs/examples/using_browser_profile.mdx
================================================
---
id: using_browser_profile
title: Using browser profile
---

import ApiLink from '@site/src/components/ApiLink';

import CodeBlock from '@theme/CodeBlock';

import ChromeProfileExample from '!!raw-loader!./code_examples/using_browser_profiles_chrome.py';
import FirefoxProfileExample from '!!raw-loader!./code_examples/using_browser_profiles_firefox.py';

This example demonstrates how to run <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> using your local browser profile from [Chrome](https://www.google.com/intl/us/chrome/) or [Firefox](https://www.firefox.com/).

Using browser profiles allows you to leverage existing login sessions, saved passwords, bookmarks, and other personalized browser data during crawling. This can be particularly useful for testing scenarios or when you need to access content that requires authentication.

## Chrome browser

To run <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> with your Chrome profile, you need to know the path to your profile files. You can find this information by entering `chrome://version/` as a URL in your Chrome browser. If you have multiple profiles, pay attention to the profile name - if you only have one profile, it's always `Default`.

:::warning Profile access limitation
Due to [Chrome's security policies](https://developer.chrome.com/blog/remote-debugging-port), automation cannot use your main browsing profile directly. The example copies your profile to a temporary location as a workaround.
:::

Make sure you don't have any running Chrome browser processes before running this code:

<CodeBlock className="language-python" language="python">
    {ChromeProfileExample}
</CodeBlock>

## Firefox browser

To find the path to your Firefox profile, enter `about:profiles` as a URL in your Firefox browser. Unlike Chrome, you can use your standard profile path directly without copying it first.

Make sure you don't have any running Firefox browser processes before running this code:

<CodeBlock className="language-python" language="python">
    {FirefoxProfileExample}
</CodeBlock>


================================================
FILE: docs/examples/using_sitemap_request_loader.mdx
================================================
---
id: using-sitemap-request-loader
title: Using sitemap request loader
---

import ApiLink from '@site/src/components/ApiLink';

import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import SitemapRequestLoaderExample from '!!raw-loader!roa-loader!./code_examples/using_sitemap_request_loader.py';

This example demonstrates how to use <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> to crawl websites that provide `sitemap.xml` files following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> processes sitemaps in a streaming fashion without loading them entirely into memory, making it suitable for large sitemaps.

The example shows how to use the `transform_request_function` parameter to configure request options based on URL patterns. This allows you to modify request properties such as labels and user data based on the source URL, enabling different handling logic for different websites or sections.

The following code example implements processing of sitemaps from two different domains (Apify and Crawlee), with different labels assigned to requests based on their host. The `create_transform_request` function maps each host to the corresponding request configuration, while the crawler uses different handlers based on the assigned labels.

<RunnableCodeBlock className="language-python" language="python">
    {SitemapRequestLoaderExample}
</RunnableCodeBlock>

For more information about request loaders, see the [Request loaders guide](../guides/request-loaders).


================================================
FILE: docs/guides/architecture_overview.mdx
================================================
---
id: architecture-overview
title: Architecture overview
description: An overview of the core components of the Crawlee library and its architecture.
---

import ApiLink from '@site/src/components/ApiLink';

Crawlee is a modern and modular web scraping framework. It is designed for both HTTP-only and browser-based scraping. In this guide, we will provide a high-level overview of its architecture and the main components that make up the system.

## Crawler

The main user-facing component of Crawlee is the crawler, which orchestrates the crawling process and takes care of all other components. It manages storages, executes user-defined request handlers, handles retries, manages concurrency, and coordinates all other components. All crawlers inherit from the <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink> class, which provides the basic functionality. There are two main groups of specialized crawlers: HTTP crawlers and browser crawlers.

:::info

You will learn more about the request handlers in the request router section.

:::

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Abstract classes
%% ========================

class BasicCrawler {
    <<abstract>>
}

class AbstractHttpCrawler {
    <<abstract>>
}

%% ========================
%% Specific classes
%% ========================

class HttpCrawler

class ParselCrawler

class BeautifulSoupCrawler

class PlaywrightCrawler

class AdaptivePlaywrightCrawler

%% ========================
%% Inheritance arrows
%% ========================

BasicCrawler <|-- AbstractHttpCrawler
BasicCrawler <|-- PlaywrightCrawler
BasicCrawler <|-- AdaptivePlaywrightCrawler
AbstractHttpCrawler <|-- HttpCrawler
AbstractHttpCrawler <|-- ParselCrawler
AbstractHttpCrawler <|-- BeautifulSoupCrawler
```

### HTTP crawlers

HTTP crawlers use HTTP clients to fetch pages and parse them with HTML parsing libraries. They are fast and efficient for sites that do not require JavaScript rendering. HTTP clients are Crawlee components that wrap around HTTP libraries like [httpx](https://www.python-httpx.org/), [curl-impersonate](https://github.com/lwthiker/curl-impersonate) or [impit](https://apify.github.io/impit) and handle HTTP communication for requests and responses. You can learn more about them in the [HTTP clients guide](./http-clients).

HTTP crawlers inherit from <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> and there are three crawlers that belong to this category:

- <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> utilizes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) HTML parser.
- <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> utilizes [Parsel](https://github.com/scrapy/parsel) for parsing HTML.
- <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> does not parse HTTP responses at all and is used when no content parsing is required.

You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawlers).

### Browser crawlers

Browser crawlers use a real browser to render pages, enabling scraping of sites that require JavaScript. They manage browser instances, pages, and context lifecycles. Currently, the only browser crawler is <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, which utilizes the [Playwright](https://playwright.dev/) library. Playwright provides a high-level API for controlling and navigating browsers. You can learn more about <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, its features, and how it internally manages browser instances in the [Playwright crawler guide](./playwright-crawler).

### Adaptive crawler

The <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> sits between HTTP and browser crawlers. It can automatically decide whether to use HTTP or browser crawling for each request based on heuristics or user configuration. This allows for optimal performance and compatibility. It also provides a uniform interface for both crawling types (modes). You can learn more about adaptive crawling in the [Adaptive Playwright crawler guide](./adaptive-playwright-crawler).

## Crawling contexts

Crawling contexts are objects that encapsulate the state and data for each request being processed by the crawler. They provide access to the request, response, session, and helper methods for handling the request. Crawling contexts are used to pass data between different parts of the crawler and to manage the lifecycle of each request. These contexts are provided to user-defined request handlers, which can then use them to access request data, response data, or use helper methods to interact with storages, and extract and enqueue new requests.

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Classes
%% ========================

class BasicCrawlingContext

class HttpCrawlingContext

class HttpCrawlingResult

class ParsedHttpCrawlingContext

class ParselCrawlingContext

class BeautifulSoupCrawlingContext

class PlaywrightPreNavCrawlingContext

class PlaywrightCrawlingContext

class AdaptivePlaywrightPreNavCrawlingContext

class AdaptivePlaywrightCrawlingContext

%% ========================
%% Inheritance arrows
%% ========================

BasicCrawlingContext <|-- HttpCrawlingContext

HttpCrawlingResult <|-- HttpCrawlingContext

HttpCrawlingContext <|-- ParsedHttpCrawlingContext

ParsedHttpCrawlingContext <|-- ParselCrawlingContext

ParsedHttpCrawlingContext <|-- BeautifulSoupCrawlingContext

BasicCrawlingContext <|-- PlaywrightPreNavCrawlingContext

PlaywrightPreNavCrawlingContext <|-- PlaywrightCrawlingContext

BasicCrawlingContext <|-- AdaptivePlaywrightPreNavCrawlingContext

ParsedHttpCrawlingContext <|-- AdaptivePlaywrightCrawlingContext
```

Their inheritance structure is similar to that of the crawlers, with <ApiLink to="class/BasicCrawlingContext">`BasicCrawlingContext`</ApiLink> as the base class. The specific crawling contexts are:
- <ApiLink to="class/HttpCrawlingContext">`HttpCrawlingContext`</ApiLink> for HTTP crawlers.
- <ApiLink to="class/ParsedHttpCrawlingContext">`ParsedHttpCrawlingContext`</ApiLink> for HTTP crawlers with parsed responses.
- <ApiLink to="class/ParselCrawlingContext">`ParselCrawlingContext`</ApiLink> for HTTP crawlers that use [Parsel](https://github.com/scrapy/parsel) for parsing.
- <ApiLink to="class/BeautifulSoupCrawlingContext">`BeautifulSoupCrawlingContext`</ApiLink> for HTTP crawlers that use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing.
- <ApiLink to="class/PlaywrightPreNavCrawlingContext">`PlaywrightPreNavCrawlingContext`</ApiLink> for Playwright crawlers before the page is navigated.
- <ApiLink to="class/PlaywrightCrawlingContext">`PlaywrightCrawlingContext`</ApiLink> for Playwright crawlers.
- <ApiLink to="class/AdaptivePlaywrightPreNavCrawlingContext">`AdaptivePlaywrightPreNavCrawlingContext`</ApiLink> for Adaptive Playwright crawlers before the page is navigated.
- <ApiLink to="class/AdaptivePlaywrightCrawlingContext">`AdaptivePlaywrightCrawlingContext`</ApiLink> for Adaptive Playwright crawlers.

## Storages

Storages are the components that manage data in Crawlee. They provide a way to store and retrieve data during the crawling process. Crawlee's storage system consists of two main layers:

- **Storages**: High-level interfaces for interacting with different storage types
- **Storage clients**: Backend implementations that handle the actual data persistence and management (you will learn more about them in the next section)

Crawlee provides three built-in storage types for managing data:

- <ApiLink to="class/Dataset">`Dataset`</ApiLink> - Append-only, tabular storage for structured data. It is ideal for storing scraping results.
- <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink> - Storage for arbitrary data like JSON documents, images or configs. It supports get and set operations with key-value pairs; updates are only possible by replacement.
- <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> - A managed queue for pending and completed requests, with automatic deduplication and dynamic addition of new items. It is used to track URLs for crawling.

See the [Storages guide](./storages) for more details.

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Abstract classes
%% ========================

class Storage {
    <<abstract>>
}

%% ========================
%% Specific classes
%% ========================

class Dataset

class KeyValueStore

class RequestQueue

%% ========================
%% Inheritance arrows
%% ========================

Storage <|-- Dataset
Storage <|-- KeyValueStore
Storage <|-- RequestQueue
```

## Storage clients

Storage clients are the backend implementations for storages that handle interactions with different storage systems. They provide a unified interface for <ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, and <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, regardless of the underlying storage implementation.

Crawlee provides several built-in storage client implementations:

- <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> - Stores data in memory with no persistence (ideal for testing and fast operations).
- <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> - Provides persistent file system storage with caching (default client).
- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com/) (cloud-based). It is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). You can find more information about it in the [Apify SDK documentation](https://docs.apify.com/sdk/python/docs/overview/introduction).

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Abstract classes
%% ========================

class StorageClient {
    <<abstract>>
}

%% ========================
%% Specific classes
%% ========================

class MemoryStorageClient

class FileSystemStorageClient

class ApifyStorageClient

%% ========================
%% Inheritance arrows
%% ========================

StorageClient <|-- MemoryStorageClient
StorageClient <|-- FileSystemStorageClient
StorageClient <|-- ApifyStorageClient
```

Storage clients can be registered globally with the <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> (you will learn more about the <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> in the Service locator section below), passed directly to crawlers, or specified when opening individual storage instances. You can also create custom storage clients by implementing the <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> interface.

See the [Storage clients guide](./storage-clients) for more details.

## Request router

The request <ApiLink to="class/Router">`Router`</ApiLink> is a central component that manages the flow of requests and responses in Crawlee. It is responsible for routing requests to the appropriate request handlers, managing the crawling context, and coordinating the execution of user-defined logic.

### Request handlers

Request handlers are user-defined functions that process requests and responses in Crawlee. They are the core of the crawling logic and are responsible for handling data extraction, processing, and storage. Each request handler receives a crawling context as an argument, which provides access to request data, response data, and other information related to the request. Request handlers can be registered with the <ApiLink to="class/Router">`Router`</ApiLink>.

The request routing in Crawlee supports:
- Default handlers - Fallback handlers for requests without specific labels.
- Label-based routing - Handlers for specific request types based on labels.
- Error handlers - Handle errors during request processing.
- Failed request handlers - Handle requests that exceed retry limits.
- Pre-navigation hooks - Execute logic before navigating to URLs.

See the [Request router guide](./request-router) for detailed information and examples.

## Service locator

The <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> is a central registry for global services in Crawlee. It manages and provides access to core services throughout the framework, ensuring consistent configuration across all components. The service locator coordinates these three services:

- <ApiLink to="class/Configuration">`Configuration`</ApiLink> - Application-wide settings and parameters that control various aspects of Crawlee behavior.
- <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> - Backend implementation for data storage across datasets, key-value stores, and request queues.
- <ApiLink to="class/EventManager">`EventManager`</ApiLink> - Event coordination system for internal framework events and custom user hooks.

Services can be registered globally through the `service_locator` singleton instance, passed to crawler constructors, or provided when opening individual storage instances. The service locator includes conflict prevention mechanisms to ensure configuration consistency and prevent accidental service conflicts during runtime.

See the [Service locator guide](./service-locator) for detailed information about service registration and configuration options.

## Request loaders

Request loaders provide a subset of <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> functionality, focusing specifically on reading and accessing streams of requests from various sources. They define how requests are fetched and processed, enabling use cases such as reading URLs from files, external APIs, sitemaps, or combining multiple sources together. Unlike request queues, they do not handle storage or persistence—they only provide request reading capabilities.

- <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> - Base interface for read-only access to a stream of requests, with capabilities like fetching the next request, marking as handled, and status checking.
- <ApiLink to="class/RequestList">`RequestList`</ApiLink> - Lightweight in-memory implementation of `RequestLoader` for managing static lists of URLs.
- <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> - A specialized loader that reads URLs from XML and plain-text sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html) with filtering capabilities.

### Request managers

<ApiLink to="class/RequestManager">`RequestManager`</ApiLink> extends <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> with write capabilities for adding and reclaiming requests, providing full request management functionality. <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> is the primary concrete implementation of <ApiLink to="class/RequestManager">`RequestManager`</ApiLink>.

<ApiLink to="class/RequestManagerTandem">`RequestManagerTandem`</ApiLink> combines a read-only `RequestLoader` with a writable <ApiLink to="class/RequestManager">`RequestManager`</ApiLink>, transferring requests from the loader to the manager for hybrid scenarios. This is useful when you want to start with a predefined set of URLs (from a file or sitemap) but also need to add new requests dynamically during crawling. The tandem first processes all requests from the loader, then handles any additional requests added to the manager.

Request loaders are useful when you need to start with a predefined set of URLs. The tandem approach allows processing requests from static sources (like files or sitemaps) while maintaining the ability to add new requests dynamically.

See the [Request loaders guide](./request-loaders) for detailed information.

## Event manager

The <ApiLink to="class/EventManager">`EventManager`</ApiLink> is responsible for coordinating internal events throughout Crawlee and enabling custom hooks. It provides a system for registering event listeners, emitting events, and managing their execution lifecycle.

Crawlee provides several implementations of the event manager:

- <ApiLink to="class/EventManager">`EventManager`</ApiLink> is the base class for event management in Crawlee.
- <ApiLink to="class/LocalEventManager">`LocalEventManager`</ApiLink> extends the base event manager for local environments by automatically emitting `SYSTEM_INFO` events at regular intervals. This provides real-time system metrics including CPU usage and memory consumption, which are essential for internal components like the <ApiLink to="class/Snapshotter">`Snapshotter`</ApiLink> and <ApiLink to="class/AutoscaledPool">`AutoscaledPool`</ApiLink>.
- [`ApifyEventManager`](https://docs.apify.com/sdk/python/reference/class/PlatformEventManager) - Manages events on the [Apify platform](https://apify.com/) (cloud-based). It is implemented in the [Apify SDK](https://docs.apify.com/sdk/python/).

:::info

You can learn more about <ApiLink to="class/Snapshotter">`Snapshotter`</ApiLink> and <ApiLink to="class/AutoscaledPool">`AutoscaledPool`</ApiLink> and their configuration in the [Scaling crawlers guide](./scaling-crawlers).

:::

Crawlee defines several built-in event types:

- `PERSIST_STATE` - Emitted periodically to trigger state persistence.
- `SYSTEM_INFO` - Contains CPU and memory usage information.
- `MIGRATING` - Signals that the crawler is migrating to a different environment.
- `ABORTING` - Indicates the crawler is aborting execution.
- `EXIT` - Emitted when the crawler is exiting.
- `CRAWLER_STATUS` - Provides status updates from crawlers.

Additional specialized events for browser and session management are also available.

The event manager operates as an async context manager, automatically starting periodic tasks when entered and ensuring all listeners complete before exiting. Event listeners can be either synchronous or asynchronous functions and are executed safely without blocking the main event loop.

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Abstract classes
%% ========================

class EventManager {
    <<abstract>>
}

%% ========================
%% Specific classes
%% ========================

class LocalEventManager

class ApifyEventManager

%% ========================
%% Inheritance arrows
%% ========================

EventManager <|-- LocalEventManager
EventManager <|-- ApifyEventManager
```

## Session management

The core component of session management in Crawlee is <ApiLink to="class/SessionPool">`SessionPool`</ApiLink>. It manages a collection of sessions that simulate individual users with unique attributes like cookies, IP addresses (via proxies), and browser fingerprints. Sessions help avoid blocking by rotating user identities and maintaining realistic browsing patterns.

:::info

You can learn more about fingerprints and how to avoid getting blocked in the [Avoid blocking guide](./avoid-blocking).

:::

### Session

A session is represented as a <ApiLink to="class/Session">`Session`</ApiLink> object, which contains components like cookies, error tracking, usage limits, and expiration handling. Sessions can be marked as good (<ApiLink to="class/Session#mark_good">`Session.mark_good`</ApiLink>), bad (<ApiLink to="class/Session#mark_bad">`Session.mark_bad`</ApiLink>), or retired (<ApiLink to="class/Session#retire">`Session.retire`</ApiLink>) based on their performance, and they automatically become unusable when they exceed error thresholds or usage limits.

### Session pool

The session pool provides automated session lifecycle management:

- Automatic rotation - Retrieves random sessions from the pool and creates new ones as needed.
- Pool maintenance - Removes retired sessions and maintains the pool at maximum capacity.
- State persistence - Persists session state to enable recovery across restarts.
- Configurable limits - Supports custom pool sizes, session settings, and creation functions.

The pool operates as an async context manager, automatically initializing with sessions and cleaning up on exit. It ensures proper session management by rotating sessions based on usage count, expiration time, and custom rules while maintaining optimal pool size.

See the [Session management guide](./session-management) for more information.

## Statistics

The <ApiLink to="class/Statistics">`Statistics`</ApiLink> class provides runtime monitoring for crawler operations, tracking performance metrics like request counts, processing times, retry attempts, and error patterns. It operates as an async context manager, automatically persisting data across crawler restarts and migrations using <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>.

The system includes error tracking through the <ApiLink to="class/ErrorTracker">`ErrorTracker`</ApiLink> class, which groups similar errors by type and message patterns using wildcard matching. It can capture HTML snapshots and screenshots for debugging and separately track retry-specific errors.

Statistics are logged at configurable intervals in both table and inline formats, with final summary data returned from the `crawler.run` method available through <ApiLink to="class/FinalStatistics">`FinalStatistics`</ApiLink>.

## Conclusion

In this guide, we provided a high-level overview of the core components of the Crawlee library and its architecture. We covered the main components like crawlers, crawling contexts, storages, request routers, service locator, request loaders, event manager, session management, and statistics. Check out other guides, the [API reference](https://crawlee.dev/python/api), and [Examples](../examples) for more details on how to use these components in your own projects.

If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!


================================================
FILE: docs/guides/avoid_blocking.mdx
================================================
---
id: avoid-blocking
title: Avoid getting blocked
description: How to avoid getting blocked when scraping
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import PlaywrightDefaultFingerprintGenerator from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_fingerprint_generator.py';
import PlaywrightWithCamoufox from '!!raw-loader!roa-loader!../examples/code_examples/playwright_crawler_with_camoufox.py';

import PlaywrightDefaultFingerprintGeneratorWithArgs from '!!raw-loader!./code_examples/avoid_blocking/default_fingerprint_generator_with_args.py';

A scraper might get blocked for numerous reasons. Let's narrow it down to the two main ones. The first is a bad or blocked IP address. You can learn about this topic in the [proxy management guide](./proxy-management). The second reason is [browser fingerprints](https://pixelprivacy.com/resources/browser-fingerprinting/) (or signatures), which we will explore more in this guide. Check the [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) to gain a deeper theoretical understanding of blocking and learn a few tips and tricks.

A browser fingerprint is a collection of browser attributes and significant features that can reveal whether our browser is a bot or a real user. Moreover, most browsers have unique features that allow a website to track the browser even across different IP addresses. This is the main reason why scrapers should change browser fingerprints while doing browser-based scraping. In return, this should significantly reduce blocking.

## Using browser fingerprints

Changing browser fingerprints can be a tedious job. Luckily, Crawlee provides this feature with minimal configuration necessary - the usage of fingerprints in <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is enabled by default. You can customize the fingerprints by using the `fingerprint_generator` argument of <ApiLink to="class/PlaywrightCrawler#__init__">`PlaywrightCrawler.__init__`</ApiLink>: either pass your own implementation of <ApiLink to="class/FingerprintGenerator">`FingerprintGenerator`</ApiLink> or use <ApiLink to="class/BrowserforgeFingerprintGenerator">`DefaultFingerprintGenerator`</ApiLink>.

<RunnableCodeBlock className="language-python" language="python">
    {PlaywrightDefaultFingerprintGenerator}
</RunnableCodeBlock>

In certain cases we want to narrow down the fingerprints used - e.g. specify a certain operating system, locale or browser. This is also possible with Crawlee - the crawler can have the generation algorithm customized to reflect the particular browser version and many more. For a description of the fingerprint generation options, please see <ApiLink to="class/HeaderGeneratorOptions">`HeaderGeneratorOptions`</ApiLink>, <ApiLink to="class/ScreenOptions">`ScreenOptions`</ApiLink> and <ApiLink to="class/BrowserforgeFingerprintGenerator#__init__">`DefaultFingerprintGenerator.__init__`</ApiLink>. See the example below:

<CodeBlock className="language-python">
    {PlaywrightDefaultFingerprintGeneratorWithArgs}
</CodeBlock>

If you do not want to use fingerprints, then pass `fingerprint_generator=None` argument to the <ApiLink to="class/PlaywrightCrawler#__init__">`PlaywrightCrawler.__init__`</ApiLink>.

## Using Camoufox

In some cases even <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> with fingerprints is not enough. You can try using <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> together with [Camoufox](https://camoufox.com/). See the example integration below:

<RunnableCodeBlock className="language-python" language="python">
    {PlaywrightWithCamoufox}
</RunnableCodeBlock>

**Related links**

- [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite)
- [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping)


================================================
FILE: docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py
================================================
import asyncio

from crawlee.fingerprint_suite import (
    DefaultFingerprintGenerator,
    HeaderGeneratorOptions,
    ScreenOptions,
)


async def main() -> None:
    """Create a fingerprint generator narrowed down by header and screen options."""
    # Restrict the generated headers to the `chrome` browser.
    chrome_headers = HeaderGeneratorOptions(browsers=['chrome'])
    # Restrict the generated screens to a minimum width of 400.
    min_width_screens = ScreenOptions(min_width=400)

    fingerprint_generator = DefaultFingerprintGenerator(
        header_options=chrome_headers,
        screen_options=min_width_screens,
    )

    # ...


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Run a `PlaywrightCrawler` created with its default arguments."""
    # No arguments are passed: the fingerprint generator is used by default.
    crawler = PlaywrightCrawler()

    # Register the default request handler; it will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Log the processed URL and enqueue links matching `.morelink`."""
        context.log.info(f'Processing {context.request.url} ...')

        # Look for a link to the next page and enqueue it when there is one.
        await context.enqueue_links(selector='.morelink')

    # Start crawling from the initial list of URLs.
    start_urls = ['https://news.ycombinator.com/']
    await crawler.run(start_urls)


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py
================================================
import asyncio
import io
from pathlib import Path

from warcio.statusandheaders import StatusAndHeaders
from warcio.warcwriter import WARCWriter

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
    """Archive the HTTP response of the current request as a WARC record.

    Args:
        context: Crawling context providing `http_response` and `request.url`.
        writer: WARC writer used to create and write the `response` record.
    """
    http_response = context.http_response

    # Read the whole body first; its length is passed along with the payload.
    body = await http_response.read()
    payload = io.BytesIO(body)

    # Status and headers of the archived response.
    status_and_headers = StatusAndHeaders(
        str(http_response.status_code),
        http_response.headers,
        protocol='HTTP/1.1',
    )

    # Build the `response` record for the request URL and write it out.
    record = writer.create_warc_record(
        context.request.url,
        'response',
        payload=payload,
        length=len(body),
        http_headers=status_and_headers,
    )
    writer.write_record(record)


async def main() -> None:
    """Crawl crawlee.dev and archive every handled response into a WARC file."""
    crawler = ParselCrawler(max_requests_per_crawl=10)

    # Create a WARC archive file and prepare the writer.
    archive_path = Path('example.warc.gz')
    with archive_path.open('wb') as archive_file:
        writer = WARCWriter(archive_file, gzip=True)

        # Create a WARC info record to store metadata about the archive.
        warcinfo_record = writer.create_warcinfo_record(
            archive_path.name,
            {
                'software': 'Crawlee',
                'format': 'WARC/1.1',
                'description': 'Example archive created with ParselCrawler',
            },
        )
        writer.write_record(warcinfo_record)

        # Register the default request handler; it will be called for every request.
        @crawler.router.default_handler
        async def request_handler(context: ParselCrawlingContext) -> None:
            """Archive the response and enqueue links from the same domain."""
            context.log.info(f'Archiving {context.request.url} ...')
            await archive_response(context=context, writer=writer)
            await context.enqueue_links(strategy='same-domain')

        # Run the crawl inside the `with` block, while the archive file is open.
        await crawler.run(['https://crawlee.dev/'])


# Run the example only when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/creating_web_archive/manual_archiving_playwright_crawler.py
================================================
import asyncio
import io
import logging
from functools import partial
from pathlib import Path

from playwright.async_api import Request
from warcio.statusandheaders import StatusAndHeaders
from warcio.warcwriter import WARCWriter

from crawlee.crawlers import (
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
    PlaywrightPreNavCrawlingContext,
)


async def archive_response(
    request: Request, writer: WARCWriter, logger: logging.Logger
) -> None:
    """Store the response of a finished Playwright request as a WARC record.

    Nothing is written (a warning is logged instead) when the request has no
    response or when reading the response body raises.
    """
    response = await request.response()
    if not response:
        logger.warning(f'Could not get response {request.url}')
        return

    try:
        body = await response.body()
    except Exception as exc:
        logger.warning(f'Could not get response body for {response.url}: {exc}')
        return

    logger.info(f'Archiving resource {response.url}')

    # Status code and headers are stored alongside the payload.
    status_and_headers = StatusAndHeaders(
        str(response.status), response.headers, protocol='HTTP/1.1'
    )
    record = writer.create_warc_record(
        response.url,
        'response',
        payload=io.BytesIO(body),
        length=len(body),
        http_headers=status_and_headers,
    )
    writer.write_record(record)


async def main() -> None:
    """Run `PlaywrightCrawler` and archive every finished page request as WARC."""
    crawler = PlaywrightCrawler(max_requests_per_crawl=1, headless=False)

    # Create a WARC archive file and prepare the writer.
    archive_path = Path('example.warc.gz')
    with archive_path.open('wb') as archive_file:
        warc_writer = WARCWriter(archive_file, gzip=True)

        # Start the archive with a warcinfo record holding metadata about it.
        archive_metadata = {
            'software': 'Crawlee',
            'format': 'WARC/1.1',
            'description': 'Example archive created with PlaywrightCrawler',
        }
        info_record = warc_writer.create_warcinfo_record(
            archive_path.name, archive_metadata
        )
        warc_writer.write_record(info_record)

        @crawler.pre_navigation_hook
        async def archiving_hook(context: PlaywrightPreNavCrawlingContext) -> None:
            # Archive every finished request of the page, so that responses
            # with additional resources are stored as well.
            on_request_finished = partial(
                archive_response, logger=context.log, writer=warc_writer
            )
            context.page.on('requestfinished', on_request_finished)

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            # Pages that load content dynamically need to be scrolled to load
            # everything. This slows the crawl down, but ensures that all
            # content is loaded.
            await context.infinite_scroll()
            await context.enqueue_links(strategy='same-domain')

        # Run inside the `with` block so the archive file is open while handlers run.
        await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    """Crawl crawlee.dev with Playwright through a local wayback server proxy."""
    # Use the local wayback server as a proxy
    wayback_proxy = ProxyConfiguration(proxy_urls=['http://localhost:8080/'])

    crawler = PlaywrightCrawler(
        proxy_configuration=wayback_proxy,
        # Ignore the HTTPS errors if you have not followed pywb CA setup instructions
        browser_launch_options={'ignore_https_errors': True},
        max_requests_per_crawl=10,
        headless=False,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Archiving {context.request.url} ...')
        # Pages that load content dynamically need to be scrolled to load
        # everything. This slows the crawl down, but ensures that all
        # content is loaded.
        await context.infinite_scroll()
        await context.enqueue_links(strategy='same-domain')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/error_handling/change_handle_error_status.py
================================================
import asyncio
import json

from crawlee import HttpHeaders
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.errors import HttpStatusCodeError
from crawlee.sessions import SessionPool

# Using a placeholder refresh token for this example
REFRESH_TOKEN = 'PLACEHOLDER'
UNAUTHORIZED_CODE = 401


async def main() -> None:
    """Show how a request handler can process 401 responses and trigger a retry."""
    # Only treat 403 as a blocking status code, not 401
    session_pool = SessionPool(create_session_settings={'blocked_status_codes': [403]})

    crawler = HttpCrawler(
        max_request_retries=2,
        session_pool=session_pool,
        # Don't treat 401 responses as errors
        ignore_http_error_status_codes=[UNAUTHORIZED_CODE],
    )

    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Responses other than 401 need no special treatment here.
        if context.http_response.status_code != UNAUTHORIZED_CODE:
            return

        # Now we can handle 401 responses ourselves: get a fresh access token
        refresh_response = await context.send_request(
            'https://placeholder.org/refresh',
            headers={'authorization': f'Bearer {REFRESH_TOKEN}'},
        )
        token_payload = json.loads(await refresh_response.read())
        access_token = token_payload['access_token']

        # Add the new token to our `Request` headers
        context.request.headers |= HttpHeaders(
            {'authorization': f'Bearer {access_token}'},
        )

        # Trigger a retry with our updated headers
        raise HttpStatusCodeError('Unauthorized', status_code=UNAUTHORIZED_CODE)

    await crawler.run(['http://httpbingo.org/status/401'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/error_handling/disable_retry.py
================================================
import asyncio

from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
from crawlee.errors import HttpStatusCodeError, SessionError


async def main() -> None:
    """Show how an error handler can disable retries for selected errors."""
    crawler = HttpCrawler(max_request_retries=5)

    # Create a parsing error for demonstration
    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        raise ValueError('Simulated parsing error')

    # This handler runs before any retry attempts
    @crawler.error_handler
    async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None:
        context.log.error(f'Failed request {context.request.url}')

        # Only allow retries for network-related errors
        network_errors = (SessionError, HttpStatusCodeError)
        if isinstance(error, network_errors):
            return

        context.log.error('Non-network error detected')
        # Stop further retry attempts for this `Request`
        context.request.no_retry = True

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/error_handling/handle_proxy_error.py
================================================
import asyncio

from crawlee import Request
from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
from crawlee.errors import ProxyError


async def main() -> None:
    """Show how to re-enqueue a request once after it failed with `ProxyError`."""
    # Set how many session rotations will happen before calling the error handler
    # when ProxyError occurs
    crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6)

    # For this example, we'll create a proxy error in our handler
    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        raise ProxyError('Simulated proxy error')

    # This handler runs after all retry attempts are exhausted
    @crawler.failed_request_handler
    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
        failed_request = context.request
        context.log.error(f'Failed request {failed_request.url}, after 5 rotations')

        # Only proxy errors are retried ...
        if not isinstance(error, ProxyError):
            return
        # ... and only requests that are not already a retry themselves.
        if failed_request.unique_key.startswith('retry'):
            return

        context.log.info(f'Retrying {failed_request.url} ...')
        # Create a new `Request` with a modified key to avoid deduplication
        retry_key = f'retry{failed_request.unique_key}'
        retry_request = Request.from_url(failed_request.url, unique_key=retry_key)

        # Add the new `Request` to the `Queue`
        request_manager = await crawler.get_request_manager()
        await request_manager.add_request(retry_request)

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_clients/parsel_curl_impersonate_example.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import CurlImpersonateHttpClient


async def main() -> None:
    """Run `ParselCrawler` with `CurlImpersonateHttpClient` as its HTTP client."""
    crawler = ParselCrawler(
        http_client=CurlImpersonateHttpClient(
            # Optional additional keyword arguments for
            # `curl_cffi.requests.AsyncSession`.
            timeout=10,
            impersonate='chrome131',
        ),
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Default request handler, called for every request.
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        page_url = context.request.url
        context.log.info(f'Processing {page_url} ...')

        # Enqueue all links from the page.
        await context.enqueue_links()

        # Extract data from the page and push it to the default dataset.
        page_title = context.selector.css('title::text').get()
        await context.push_data({'url': page_url, 'title': page_title})

    # Run the crawler with the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_clients/parsel_httpx_example.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    """Run `ParselCrawler` with `HttpxHttpClient` as its HTTP client."""
    # Optional additional keyword arguments for `httpx.AsyncClient`.
    client_options = {'timeout': 10, 'follow_redirects': True}

    crawler = ParselCrawler(
        http_client=HttpxHttpClient(**client_options),
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Default request handler, called for every request.
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        page_url = context.request.url
        context.log.info(f'Processing {page_url} ...')

        # Enqueue all links from the page.
        await context.enqueue_links()

        # Extract data from the page and push it to the default dataset.
        page_title = context.selector.css('title::text').get()
        await context.push_data({'url': page_url, 'title': page_title})

    # Run the crawler with the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_clients/parsel_impit_example.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    """Run `ParselCrawler` with `ImpitHttpClient` as its HTTP client."""
    # Optional additional keyword arguments for `impit.AsyncClient`.
    client_options = {'http3': True, 'browser': 'firefox', 'verify': True}

    crawler = ParselCrawler(
        http_client=ImpitHttpClient(**client_options),
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Default request handler, called for every request.
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        page_url = context.request.url
        context.log.info(f'Processing {page_url} ...')

        # Enqueue all links from the page.
        await context.enqueue_links()

        # Extract data from the page and push it to the default dataset.
        page_title = context.selector.css('title::text').get()
        await context.push_data({'url': page_url, 'title': page_title})

    # Run the crawler with the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/__init__.py
================================================


================================================
FILE: docs/guides/code_examples/http_crawlers/beautifulsoup_example.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with `BeautifulSoupCrawler` and store page titles."""
    # Create a BeautifulSoupCrawler instance, limited to 10 requests
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    # Define the default request handler
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # Extract data using BeautifulSoup; a page may have no <title> element
        if context.soup.title:
            page_title = context.soup.title.string
        else:
            page_title = None

        # Push extracted data to the dataset
        await context.push_data({'url': context.request.url, 'title': page_title})

        # Enqueue links found on the page for further crawling
        await context.enqueue_links()

    # Run the crawler
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/custom_crawler_example.py
================================================


================================================
FILE: docs/guides/code_examples/http_crawlers/http_example.py
================================================
import asyncio
import re

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with `HttpCrawler`, extracting data with regexes."""
    # Create an HttpCrawler instance - no automatic parsing.
    # The crawl is limited to 10 requests.
    crawler = HttpCrawler(max_requests_per_crawl=10)

    # Define the default request handler
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # Get the raw response content and decode it to text
        raw_body = await context.http_response.read()
        page_text = raw_body.decode('utf-8')

        # Extract title manually using regex (since we don't have a parser)
        found_title = re.search(
            r'<title[^>]*>([^<]+)</title>', page_text, re.IGNORECASE
        )
        if found_title:
            page_title = found_title.group(1).strip()
        else:
            page_title = None

        # Push the basic information to the dataset
        await context.push_data({'url': context.request.url, 'title': page_title})

        # Simple link extraction for further crawling; only the first few
        # links found are considered (limit to avoid too many requests)
        all_hrefs = re.findall(r'href=["\']([^"\']+)["\']', page_text, re.IGNORECASE)
        for link in all_hrefs[:3]:
            if not link.startswith('http'):
                continue
            if 'crawlee.dev' not in link:
                continue
            await context.add_requests([link])

    # Run the crawler
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/lexbor_parser.py
================================================
import asyncio

from pydantic import ValidationError
from selectolax.lexbor import LexborHTMLParser
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with `HttpCrawler`, parsing pages with Selectolax (Lexbor).

    For every page the handler stores the URL, the title and all h1-h3 headings,
    and enqueues the links found on the page using the same-domain strategy.
    """
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using Selectolax with Lexbor backend.
        parsed_html = LexborHTMLParser(await context.http_response.read())

        # `css_first` returns `None` when nothing matches, so a page without
        # a <title> element must not fail the whole request.
        title_node = parsed_html.css_first('title')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': title_node.text() if title_node is not None else None,
            'h1s': [h1.text() for h1 in parsed_html.css('h1')],
            'h2s': [h2.text() for h2 in parsed_html.css('h2')],
            'h3s': [h3.text() for h3 in parsed_html.css('h3')],
        }
        await context.push_data(data)

        # Css selector to extract valid href attributes.
        links_selector = (
            'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
        )
        base_url = URL(context.request.url)
        extracted_requests = []

        # Extract links.
        for item in parsed_html.css(links_selector):
            href = item.attributes.get('href')
            if not href:
                continue

            # Convert relative URLs to absolute if needed.
            url = str(base_url.join(URL(href)))
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/lxml_parser.py
================================================
import asyncio

from lxml import html
from pydantic import ValidationError

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with `HttpCrawler`, parsing pages with lxml."""
    crawler = HttpCrawler(max_request_retries=1, max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using lxml.
        document = html.fromstring(await context.http_response.read())

        def heading_texts(tag: str) -> list:
            # Text content of every element with the given tag name.
            return [node.text_content() for node in document.findall(f'.//{tag}')]

        # Extract data from the page and push it to the dataset.
        await context.push_data(
            {
                'url': context.request.url,
                'title': document.findtext('.//title'),
                'h1s': heading_texts('h1'),
                'h2s': heading_texts('h2'),
                'h3s': heading_texts('h3'),
            }
        )

        # Convert relative URLs to absolute before extracting links.
        document.make_links_absolute(context.request.url, resolve_base_href=True)

        # Xpath 1.0 selector for extracting valid href attributes.
        links_xpath = (
            '//a/@href[not(starts-with(., "#")) '
            'and not(starts-with(., "javascript:")) '
            'and not(starts-with(., "mailto:"))]'
        )

        # Extract links, skipping URLs that do not pass `Request` validation.
        link_requests = []
        for link_url in document.xpath(links_xpath):
            try:
                link_request = Request.from_url(link_url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{link_url}": {exc}')
            else:
                link_requests.append(link_request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(link_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py
================================================
import asyncio

from lxml import html
from pydantic import ValidationError
from saxonche import PySaxonProcessor

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with `HttpCrawler`, querying pages with Saxon XPath.

    Pages are parsed with lxml, serialized to XML and evaluated with a Saxon
    XPath processor that is shared by all request handler invocations.
    """
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    # Create Saxon processor once and reuse across requests.
    saxon_proc = PySaxonProcessor(license=False)
    xpath_proc = saxon_proc.new_xpath_processor()

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse HTML with lxml.
        parsed_html = html.fromstring(await context.http_response.read())
        # Convert relative URLs to absolute before extracting links.
        parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)
        # Convert parsed HTML to XML for Saxon processing.
        xml = html.tostring(parsed_html, encoding='unicode', method='xml')
        # Parse XML with Saxon.
        parsed_xml = saxon_proc.parse_xml(xml_text=xml)

        # XPath 2.0 with distinct-values() to get unique links and remove fragments.
        links_xpath = """
            distinct-values(
                for $href in //a/@href[
                    not(starts-with(., "#"))
                    and not(starts-with(., "javascript:"))
                    and not(starts-with(., "mailto:"))
                ]
                return replace($href, "#.*$", "")
            )
        """

        # The XPath processor is shared by all handler invocations and keeps the
        # current document as state. Everything from `set_context` to the last
        # `evaluate` therefore runs without an `await` in between, so another
        # handler cannot replace the context document halfway through.
        xpath_proc.set_context(xdm_item=parsed_xml)

        # Extract data using XPath 2.0 string() function.
        title = xpath_proc.evaluate_single('.//title/string()')
        data = {
            'url': context.request.url,
            # Stored as a plain string, the same way as the headings below.
            'title': str(title) if title is not None else None,
            'h1s': [str(h) for h in (xpath_proc.evaluate('//h1/string()') or [])],
            'h2s': [str(h) for h in (xpath_proc.evaluate('//h2/string()') or [])],
            'h3s': [str(h) for h in (xpath_proc.evaluate('//h3/string()') or [])],
        }
        link_urls = [
            item.string_value for item in xpath_proc.evaluate(links_xpath) or []
        ]

        # All XPath work is done; from here on awaiting is safe.
        await context.push_data(data)

        extracted_requests = []

        # Build requests from the extracted links.
        for url in link_urls:
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/parsel_example.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with `ParselCrawler` and store page titles."""
    # Create a ParselCrawler instance, limited to 10 requests
    crawler = ParselCrawler(max_requests_per_crawl=10)

    # Define the default request handler
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # Extract data using Parsel's XPath and CSS selectors
        page_title = context.selector.xpath('//title/text()').get()

        # Push extracted data to the dataset
        await context.push_data({'url': context.request.url, 'title': page_title})

        # Enqueue links found on the page for further crawling
        await context.enqueue_links()

    # Run the crawler
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/pyquery_parser.py
================================================
import asyncio

from pydantic import ValidationError
from pyquery import PyQuery
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with `HttpCrawler`, parsing pages with PyQuery."""
    crawler = HttpCrawler(max_request_retries=1, max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using PyQuery.
        document = PyQuery(await context.http_response.read())

        def heading_texts(tag: str) -> list:
            # Text of every element matched by the given tag selector.
            return [heading.text() for heading in document(tag).items()]

        # Extract data using jQuery-style selectors and push it to the dataset.
        await context.push_data(
            {
                'url': context.request.url,
                'title': document('title').text(),
                'h1s': heading_texts('h1'),
                'h2s': heading_texts('h2'),
                'h3s': heading_texts('h3'),
            }
        )

        # Css selector to extract valid href attributes.
        links_selector = (
            'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
        )
        page_url = URL(context.request.url)

        # Extract links, skipping URLs that do not pass `Request` validation.
        link_requests = []
        for anchor in document(links_selector).items():
            href = anchor.attr('href')
            if not href:
                continue

            # Convert relative URLs to absolute if needed.
            link_url = str(page_url.join(URL(str(href))))
            try:
                link_request = Request.from_url(link_url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{link_url}": {exc}')
            else:
                link_requests.append(link_request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(link_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/scrapling_parser.py
================================================
import asyncio

from pydantic import ValidationError
from scrapling.parser import Selector
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Crawl crawlee.dev with `HttpCrawler`, parsing pages with Scrapling."""
    crawler = HttpCrawler(max_request_retries=1, max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using Scrapling.
        page = Selector(await context.http_response.read(), url=context.request.url)

        def full_text(node):  # noqa: ANN001, ANN202
            # Full text content for `Selector` results; anything else is kept as is.
            if isinstance(node, Selector):
                return node.get_all_text()
            return node

        # Extract data using Xpath selectors with .get_all_text method for full text
        # content.
        title_el = page.xpath_first('//title')
        page_title = title_el.text if isinstance(title_el, Selector) else title_el
        await context.push_data(
            {
                'url': context.request.url,
                'title': page_title,
                'h1s': [full_text(h1) for h1 in page.xpath('//h1')],
                'h2s': [full_text(h2) for h2 in page.xpath('//h2')],
                'h3s': [full_text(h3) for h3 in page.xpath('//h3')],
            }
        )

        # Css selector to extract valid href attributes.
        links_selector = (
            'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
        )
        page_url = URL(context.request.url)

        # Extract links, skipping URLs that do not pass `Request` validation.
        link_requests = []
        for anchor in page.css(links_selector):
            if not isinstance(anchor, Selector):
                continue
            href = anchor.attrib.get('href')
            if not href:
                continue

            # Convert relative URLs to absolute if needed.
            link_url = str(page_url.join(URL(href)))
            try:
                link_request = Request.from_url(link_url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{link_url}": {exc}')
            else:
                link_requests.append(link_request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(link_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py
================================================
import asyncio

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
)

from .selectolax_parser import SelectolaxLexborParser


async def main() -> None:
    """Run `AdaptivePlaywrightCrawler` with the custom Selectolax static parser."""
    # Use custom Selectolax parser for static content parsing.
    selectolax_parser = SelectolaxLexborParser()

    crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler(
        max_requests_per_crawl=10,
        static_parser=selectolax_parser,
    )

    @crawler.router.default_handler
    async def handle_request(context: AdaptivePlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Store the page URL together with the result of the `title` query.
        page_title = await context.query_selector_one('title')
        await context.push_data({'url': context.request.url, 'title': page_title})

        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/selectolax_context.py
================================================
from dataclasses import dataclass, fields

from selectolax.lexbor import LexborHTMLParser
from typing_extensions import Self

from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext


# Custom context for the Selectolax parser. You can add your own methods here
# to facilitate working with the parsed document.
@dataclass(frozen=True)
class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]):
    """Crawling context providing access to the parsed page.

    This context is passed to request handlers and includes all standard
    context methods (push_data, enqueue_links, etc.) plus custom helpers.
    """

    @property
    def parser(self) -> LexborHTMLParser:
        """Convenient alias for accessing the parsed document."""
        return self.parsed_content

    @classmethod
    def from_parsed_http_crawling_context(
        cls, context: ParsedHttpCrawlingContext[LexborHTMLParser]
    ) -> Self:
        """Create custom context from the base context.

        Copies all fields from the base context to preserve framework
        functionality while adding custom interface.
        """
        return cls(
            **{field.name: getattr(context, field.name) for field in fields(context)}
        )


================================================
FILE: docs/guides/code_examples/http_crawlers/selectolax_crawler.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from selectolax.lexbor import LexborHTMLParser, LexborNode

from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

from .selectolax_context import SelectolaxLexborContext
from .selectolax_parser import SelectolaxLexborParser

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from typing_extensions import Unpack

    from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext


# Crawler built on top of the custom context. Defining it is optional: if you
# don't need any custom context methods, you can use AbstractHttpCrawler
# directly with SelectolaxLexborParser.
class SelectolaxLexborCrawler(
    AbstractHttpCrawler[SelectolaxLexborContext, LexborHTMLParser, LexborNode]
):
    """HTTP crawler that parses responses with Selectolax Lexbor."""

    def __init__(
        self,
        **kwargs: Unpack[HttpCrawlerOptions[SelectolaxLexborContext]],
    ) -> None:
        """Set up the context pipeline and initialize the underlying HTTP crawler.

        Args:
            kwargs: Crawler options forwarded to `AbstractHttpCrawler`.
        """

        # Last pipeline step: turn the base context into the custom context type.
        async def final_step(
            context: ParsedHttpCrawlingContext[LexborHTMLParser],
        ) -> AsyncGenerator[SelectolaxLexborContext, None]:
            custom_context = SelectolaxLexborContext.from_parsed_http_crawling_context(
                context
            )
            # Hand the wrapped context over to the request handler.
            yield custom_context

        # Pipeline order: HTTP request -> parsing -> custom context.
        static_pipeline = self._create_static_content_crawler_pipeline()
        kwargs['_context_pipeline'] = static_pipeline.compose(final_step)

        super().__init__(parser=SelectolaxLexborParser(), **kwargs)


================================================
FILE: docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py
================================================
import asyncio

from .selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler


async def main() -> None:
    """Run the custom Selectolax crawler and store the URL and title of each page."""
    crawler = SelectolaxLexborCrawler(
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def handle_request(context: SelectolaxLexborContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # `css_first` returns `None` when nothing matches, so guard against pages
        # without a <title> element instead of failing with an AttributeError.
        title_node = context.parser.css_first('title')

        data = {
            'url': context.request.url,
            'title': title_node.text() if title_node is not None else None,
        }

        await context.push_data(data)
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/http_crawlers/selectolax_parser.py
================================================
from __future__ import annotations

import asyncio
from typing import TYPE_CHECKING

from selectolax.lexbor import LexborHTMLParser, LexborNode
from typing_extensions import override

from crawlee.crawlers._abstract_http import AbstractHttpParser

if TYPE_CHECKING:
    from collections.abc import Iterable, Sequence

    from crawlee.http_clients import HttpResponse


class SelectolaxLexborParser(AbstractHttpParser[LexborHTMLParser, LexborNode]):
    """Parser for parsing HTTP response using Selectolax Lexbor."""

    @override
    async def parse(self, response: HttpResponse) -> LexborHTMLParser:
        """Parse HTTP response body into a document object.

        Args:
            response: HTTP response whose body should be parsed.

        Returns:
            The parsed document.
        """
        response_body = await response.read()
        # Run parsing in a thread to avoid blocking the event loop.
        return await asyncio.to_thread(LexborHTMLParser, response_body)

    @override
    async def parse_text(self, text: str) -> LexborHTMLParser:
        """Parse raw HTML string into a document object.

        Args:
            text: HTML markup to parse.

        Returns:
            The parsed document.
        """
        return LexborHTMLParser(text)

    @override
    async def select(
        self, parsed_content: LexborHTMLParser, selector: str
    ) -> Sequence[LexborNode]:
        """Select elements matching a CSS selector.

        Args:
            parsed_content: Document to search in.
            selector: CSS selector to match.

        Returns:
            Tuple of all matching nodes; empty when nothing matches.
        """
        return tuple(parsed_content.css(selector))

    @override
    def is_matching_selector(
        self, parsed_content: LexborHTMLParser, selector: str
    ) -> bool:
        """Check if any element matches the selector."""
        return parsed_content.css_first(selector) is not None

    @override
    def find_links(
        self, parsed_content: LexborHTMLParser, selector: str, attribute: str
    ) -> Iterable[str]:
        """Extract values of `attribute` from elements matching the selector.

        Used by `enqueue_links` helper to discover URLs.

        Args:
            parsed_content: Document to search in.
            selector: CSS selector of the link elements.
            attribute: Name of the attribute holding the URL (e.g. `href`).

        Returns:
            Stripped, non-empty attribute values in document order.
        """
        urls: list[str] = []
        for link in parsed_content.css(selector):
            # The attribute may be missing or valueless (`None`). Strip before
            # the emptiness check so whitespace-only values are skipped too,
            # instead of being reported as an empty URL.
            url = (link.attributes.get(attribute) or '').strip()
            if url:
                urls.append(url)
        return urls


================================================
FILE: docs/guides/code_examples/login_crawler/http_login.py
================================================
import asyncio
import json
from datetime import datetime, timedelta

from crawlee import ConcurrencySettings, Request
from crawlee.crawlers import (
    HttpCrawler,
    HttpCrawlingContext,
)
from crawlee.sessions import SessionPool


async def main() -> None:
    """Authenticate against the demoqa API, then crawl with the logged-in session."""
    # One long-lived session shared by every request, so the authentication
    # cookies set during login stay available for the whole crawl.
    single_session_pool = SessionPool(
        max_pool_size=1,
        create_session_settings={
            # Very high usage limit, so the session is not replaced mid-crawl
            'max_usage_count': 999_999,
            # Very long lifetime, so the session does not expire mid-crawl
            'max_age': timedelta(hours=999_999),
            # Tolerate more errors before the session is considered blocked;
            # make sure your own code handles errors properly
            'max_error_score': 100,
        },
    )

    crawler = HttpCrawler(
        max_requests_per_crawl=10,
        # Never rotate away from the single persistent session
        max_session_rotations=0,
        # Keep the request rate low to avoid triggering anti-scraping measures
        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=30),
        session_pool=single_session_pool,
    )

    # Handler for regular pages
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Handler dedicated to the login API request
    @crawler.router.handler('login')
    async def login_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing login at {context.request.url} ...')

        # The cookies are stored on the session, so one has to be available
        session = context.session
        if not session:
            raise RuntimeError('Session not found')

        # The API answers with JSON holding the token and the user details
        raw_body = await context.http_response.read()
        auth = json.loads(raw_body)

        # Read everything we need from the response before touching the cookies
        token = auth['token']
        expires_iso = auth['expires'].replace('Z', '+00:00')
        expires_at = int(datetime.fromisoformat(expires_iso).timestamp())
        user_id = auth['userId']
        username = auth['username']

        # Store the authentication cookies in the session used by
        # the subsequent requests
        cookies = session.cookies
        cookies.set(name='token', value=token, expires=expires_at)
        cookies.set(name='userID', value=user_id)
        cookies.set(name='userName', value=username)

        # Authenticated - carry on crawling with the same session
        await context.add_requests(['https://demoqa.com/BookStore/v1/Books'])

    # POST request to the authentication endpoint; its label routes the
    # response to `login_handler`
    credentials = {'userName': 'crawlee_test', 'password': 'Test1234!'}
    login_request = Request.from_url(
        'https://demoqa.com/Account/v1/Login',
        label='login',
        method='POST',
        payload=json.dumps(credentials).encode(),
        headers={'Content-Type': 'application/json'},
    )

    # The crawl starts with the login request
    await crawler.run([login_request])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/login_crawler/playwright_login.py
================================================
import asyncio
from datetime import timedelta

from crawlee import ConcurrencySettings, Request
from crawlee.crawlers import (
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
)
from crawlee.sessions import SessionPool


async def main() -> None:
    """Log in through the demoqa login form, then crawl with the same session."""
    # A pool limited to one long-lived session
    single_session_pool = SessionPool(
        max_pool_size=1,
        create_session_settings={
            # Very high usage limit for the session
            'max_usage_count': 999_999,
            # Very long session lifetime
            'max_age': timedelta(hours=999_999),
            # A high score lets the session survive more errors before
            # crawlee decides it is blocked - be sure you know how to
            # handle such errors
            'max_error_score': 100,
        },
    )

    crawler = PlaywrightCrawler(
        max_requests_per_crawl=10,
        headless=True,
        browser_type='chromium',
        # There is only one session and it must not rotate
        max_session_rotations=0,
        # Keep the crawling intensity low to avoid blocking
        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=30),
        session_pool=single_session_pool,
    )

    # Handler for regular pages
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Handler for the login page
    @crawler.router.handler('login')
    async def login_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing login {context.request.url} ...')

        # A session has to be available
        if not context.session:
            raise RuntimeError('Session not found')

        page = context.page

        # Fill in the form; `delay` simulates human typing, without it
        # the values would be entered instantly
        await page.type('#userName', 'crawlee_test', delay=100)
        await page.type('#password', 'Test1234!', delay=100)
        await page.click('#login', delay=100)

        # This element becoming visible confirms that the login succeeded
        logged_in_marker = page.locator('#userName-value').first
        await logged_in_marker.wait_for(state='visible')
        context.log.info('Login successful!')

        # Continue with the regular crawling flow
        await context.add_requests(['https://demoqa.com/books'])

    # Crawling starts with the login, which is required to access the other pages
    login_request = Request.from_url('https://demoqa.com/login', label='login')
    await crawler.run([login_request])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler/browser_configuration_example.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Run a PlaywrightCrawler with custom browser launch and context options."""
    # Options used when the browser is launched
    launch_options = {
        # The `msedge` channel has to be installed first:
        # `playwright install msedge`
        'channel': 'msedge',
        'slow_mo': 200,
    }

    # Options for the browser context, applied to each page as it is created
    context_options = {
        'color_scheme': 'dark',
        # Extra HTTP headers
        'extra_http_headers': {
            'Custom-Header': 'my-header',
            'Accept-Language': 'en',
        },
        # The User-Agent on its own
        'user_agent': 'My-User-Agent',
    }

    crawler = PlaywrightCrawler(
        headless=False,
        browser_type='chromium',
        browser_launch_options=launch_options,
        browser_new_context_options=context_options,
        # Cap the number of requests. Remove or raise the cap to crawl all links.
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        await context.enqueue_links()

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler/browser_pool_page_hooks_example.py
================================================
from __future__ import annotations

import asyncio
import logging
from typing import TYPE_CHECKING, Any

from crawlee.browsers import BrowserPool
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import KeyValueStore

if TYPE_CHECKING:
    from crawlee.browsers._browser_controller import BrowserController
    from crawlee.browsers._types import CrawleePage
    from crawlee.proxy_configuration import ProxyInfo

logger = logging.getLogger(__name__)


async def main() -> None:
    """Crawl with a BrowserPool that has hooks around page creation and closing."""
    async with BrowserPool() as browser_pool:

        @browser_pool.pre_page_create_hook
        async def log_page_init(
            page_id: str,
            _browser_controller: BrowserController,
            _browser_new_context_options: dict[str, Any],
            _proxy_info: ProxyInfo | None,
        ) -> None:
            """Report that a new page is going to be created."""
            logger.info(f'Creating page {page_id}...')

        @browser_pool.post_page_create_hook
        async def set_viewport(
            crawlee_page: CrawleePage, _browser_controller: BrowserController
        ) -> None:
            """Give every newly created page the same fixed viewport size."""
            viewport = {'width': 1280, 'height': 1024}
            await crawlee_page.page.set_viewport_size(viewport)

        @browser_pool.pre_page_close_hook
        async def save_screenshot(
            crawlee_page: CrawleePage, _browser_controller: BrowserController
        ) -> None:
            """Store a screenshot of the page in KeyValueStore before it closes."""
            store = await KeyValueStore.open()
            png_bytes = await crawlee_page.page.screenshot()

            await store.set_value(
                key=f'screenshot-{crawlee_page.id}',
                value=png_bytes,
                content_type='image/png',
            )
            logger.info(f'Saved screenshot for page {crawlee_page.id}.')

        @browser_pool.post_page_close_hook
        async def log_page_closed(
            page_id: str, _browser_controller: BrowserController
        ) -> None:
            """Report that a page has been closed."""
            logger.info(f'Page {page_id} closed successfully.')

        # The crawler uses the pool configured with the hooks above.
        crawler = PlaywrightCrawler(
            browser_pool=browser_pool,
            max_requests_per_crawl=5,
        )

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

            await context.enqueue_links()

        # Start the crawl from the initial list of URLs.
        start_urls = ['https://crawlee.dev']
        await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler/multiple_launch_example.py
================================================
import asyncio

from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl with a browser pool that launches both Chromium and Firefox."""
    # One plugin per browser that should be used.
    chromium_plugin = PlaywrightBrowserPlugin(
        browser_type='chromium', max_open_pages_per_browser=1
    )
    firefox_plugin = PlaywrightBrowserPlugin(
        browser_type='firefox', max_open_pages_per_browser=1
    )
    multi_browser_pool = BrowserPool(plugins=[chromium_plugin, firefox_plugin])

    crawler = PlaywrightCrawler(
        browser_pool=multi_browser_pool,
        # Cap the number of requests. Remove or raise the cap to crawl all links.
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Fall back to a placeholder when the page context has no browser.
        if context.page.context.browser:
            browser_name = context.page.context.browser.browser_type.name
        else:
            browser_name = 'undefined'

        context.log.info(f'Processing {context.request.url} with {browser_name} ...')

        await context.enqueue_links()

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://crawlee.dev', 'https://apify.com/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler/navigation_hooks_example.py
================================================
import asyncio

from crawlee.crawlers import (
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
    PlaywrightPostNavCrawlingContext,
    PlaywrightPreNavCrawlingContext,
)
from crawlee.errors import SessionError


async def main() -> None:
    """Crawl with hooks that run before and after each page navigation."""
    crawler = PlaywrightCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        await context.enqueue_links()

    @crawler.pre_navigation_hook
    async def configure_page(context: PlaywrightPreNavCrawlingContext) -> None:
        context.log.info(f'Navigating to {context.request.url} ...')

        # Pages load faster when stylesheets, images, fonts and other
        # static assets are blocked.
        await context.block_requests()

    @crawler.post_navigation_hook
    async def custom_captcha_check(context: PlaywrightPostNavCrawlingContext) -> None:
        # Look for a captcha input on the loaded page.
        captcha_input = context.page.locator('input[name="captcha"]').first
        if not await captcha_input.is_visible():
            return

        context.log.warning('Captcha detected! Skipping the page.')
        raise SessionError('Captcha detected')

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py
================================================
import asyncio

from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
from crawlee.crawlers import PlaywrightCrawler


async def main() -> None:
    """Configure the browser through a PlaywrightBrowserPlugin in a BrowserPool."""
    # The plugin carries both the launch options and the context options.
    edge_plugin = PlaywrightBrowserPlugin(
        browser_type='chromium',
        browser_launch_options={
            'headless': False,
            'channel': 'msedge',
            'slow_mo': 200,
        },
        browser_new_context_options={
            'color_scheme': 'dark',
            'extra_http_headers': {
                'Custom-Header': 'my-header',
                'Accept-Language': 'en',
            },
            'user_agent': 'My-User-Agent',
        },
    )

    crawler = PlaywrightCrawler(browser_pool=BrowserPool(plugins=[edge_plugin]))

    # ...


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler_adaptive/handler.py
================================================
import asyncio
from datetime import timedelta

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext


async def main() -> None:
    """Log the first `h2` element of crawlee.dev using the adaptive crawler."""
    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        # Give the selector at most 5 seconds to match an `h2` element
        selector_timeout = timedelta(milliseconds=5000)
        first_h2 = await context.query_selector_one('h2', selector_timeout)

        # Work with the element found by the selector
        context.log.info(first_h2)

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py
================================================
import asyncio

from crawlee.crawlers import AdaptivePlaywrightCrawler


async def main() -> None:
    """Create an adaptive crawler that uses BeautifulSoup for static parsing."""
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        # Options shared by all crawlers
        max_crawl_depth=5,
        # Options that apply to PlaywrightCrawler only
        playwright_crawler_specific_kwargs={
            'browser_type': 'chromium',
            'headless': False,
        },
    )

    # ...


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler_adaptive/init_parsel.py
================================================
import asyncio

from crawlee.crawlers import AdaptivePlaywrightCrawler


async def main() -> None:
    """Create an adaptive crawler that uses Parsel for static parsing."""
    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        # Options shared by all crawlers
        max_crawl_depth=5,
        # Options that apply to PlaywrightCrawler only
        playwright_crawler_specific_kwargs={
            'browser_type': 'chromium',
            'headless': False,
        },
    )

    # ...


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler_adaptive/init_prediction.py
================================================
import asyncio

from crawlee import Request
from crawlee._types import RequestHandlerRunResult
from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    RenderingType,
    RenderingTypePrediction,
    RenderingTypePredictor,
)


class CustomRenderingTypePredictor(RenderingTypePredictor):
    """Example predictor that picks the rendering type from the request URL."""

    def __init__(self) -> None:
        super().__init__()

        # Pairs of (request, correct rendering type) collected via `store_result`.
        self._learning_data: list[tuple[Request, RenderingType]] = []

    def predict(self, request: Request) -> RenderingTypePrediction:
        """Return a rendering type prediction for `request`."""
        # Custom logic deriving the prediction from the `request` input.
        rendering_type: RenderingType
        if 'abc' in request.url:
            rendering_type = 'static'
        else:
            rendering_type = 'client only'

        return RenderingTypePrediction(
            # The recommended rendering type; for `static` the HTTP-based sub crawler
            # will be used.
            rendering_type=rendering_type,
            # Recommends running both sub crawlers with a 20% chance. When both sub
            # crawlers run, the predictor can compare their results and learn.
            # A high number means the predictor is not very confident about the
            # `rendering_type`, a low number means it is very confident.
            detection_probability_recommendation=0.2,
        )

    def store_result(self, request: Request, rendering_type: RenderingType) -> None:
        """Remember the correct `rendering_type` for `request`."""
        # Lets the predictor store new learning data and retrain itself if needed.
        # `request` is the prediction input and `rendering_type` is the correct
        # prediction.
        sample = (request, rendering_type)
        self._learning_data.append(sample)
        # retrain


def result_checker(result: RequestHandlerRunResult) -> bool:
    """Tell whether the produced `result` is considered correct."""
    # Placeholder check: inspect something on `result` here. For now the result
    # counts as correct whenever it is truthy.
    return True if result else False


def result_comparator(
    result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult
) -> bool:
    """Tell whether two results are considered equivalent.

    Used to compare the result produced by the HTTP-based sub crawler with
    the one produced by the playwright based sub crawler.
    """
    # As an example, two results are equivalent when their `push_data` calls match.
    first_calls = result_1.push_data_calls
    second_calls = result_2.push_data_calls
    return first_calls == second_calls


async def main() -> None:
    """Create an adaptive crawler with a custom predictor and result functions."""
    custom_predictor = CustomRenderingTypePredictor()

    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        rendering_type_predictor=custom_predictor,
        result_checker=result_checker,
        result_comparator=result_comparator,
    )

    # ...


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py
================================================
import asyncio

from playwright.async_api import Route

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightPreNavCrawlingContext,
)


async def main() -> None:
    """Register pre-navigation hooks on an adaptive crawler and run it."""
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()

    @crawler.pre_navigation_hook
    async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Hook that runs in both the static and the playwright sub crawler.

        Accessing `context.page` here would raise `AdaptiveContextError`
        for pages that are crawled without playwright.
        """
        context.log.info(f'pre navigation hook for: {context.request.url}')

    @crawler.pre_navigation_hook(playwright_only=True)
    async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Hook that runs in the playwright sub crawler only."""

        async def some_routing_function(route: Route) -> None:
            # Let the routed request continue.
            await route.continue_()

        page = context.page
        await page.route('*/**', some_routing_function)
        context.log.info(
            f'Playwright only pre navigation hook for: {context.request.url}'
        )

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler_stagehand/__init__.py
================================================


================================================
FILE: docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py
================================================
from __future__ import annotations

from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, cast

from stagehand.context import StagehandContext
from typing_extensions import override

from crawlee.browsers import (
    PlaywrightBrowserController,
    PlaywrightBrowserPlugin,
    PlaywrightPersistentBrowser,
)

from .support_classes import CrawleeStagehandPage

if TYPE_CHECKING:
    from collections.abc import Mapping

    from playwright.async_api import Page
    from stagehand import Stagehand

    from crawlee.proxy_configuration import ProxyInfo


class StagehandBrowserController(PlaywrightBrowserController):
    """Browser controller that opens its pages through a Stagehand context.

    Pages are created by `StagehandContext` and returned wrapped in
    `CrawleeStagehandPage`, while the underlying Playwright page is what gets
    tracked in the controller's internal state.
    """

    @override
    def __init__(
        self, browser: PlaywrightPersistentBrowser, stagehand: Stagehand, **kwargs: Any
    ) -> None:
        """Initialize the controller.

        Args:
            browser: Persistent browser passed on to the base controller.
            stagehand: Stagehand instance used to create the Stagehand context.
            kwargs: Additional keyword arguments forwarded to the base controller.
        """
        # Let the base controller set itself up with the persistent browser
        super().__init__(browser, **kwargs)

        self._stagehand = stagehand
        # Created lazily on the first `new_page` call
        self._stagehand_context: StagehandContext | None = None

    @override
    async def new_page(
        self,
        browser_new_context_options: Mapping[str, Any] | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> Page:
        """Open a new page through the Stagehand context.

        Args:
            browser_new_context_options: Options for the browser context. Only used
                when the browser context does not exist yet.
            proxy_info: Proxy for the browser context. Only used when the browser
                context does not exist yet.

        Returns:
            The new page wrapped in `CrawleeStagehandPage`, typed as a Playwright
            `Page`.
        """
        # Initialize browser context if not already done; later calls reuse it
        if not self._browser_context:
            self._browser_context = await self._create_browser_context(
                browser_new_context_options=browser_new_context_options,
                proxy_info=proxy_info,
            )

        # Initialize Stagehand context if not already done; later calls reuse it
        if not self._stagehand_context:
            self._stagehand_context = await StagehandContext.init(
                self._browser_context, self._stagehand
            )

        # Create a new page using Stagehand context
        page = await self._stagehand_context.new_page()

        # Private attribute of the Stagehand page; presumably the underlying
        # Playwright page - verify against the Stagehand version in use
        pw_page = page._page  # noqa: SLF001

        # Handle page close event
        pw_page.on(event='close', f=self._on_page_close)

        # Update internal state: track the page, the opening time and the counter
        self._pages.append(pw_page)
        self._last_page_opened_at = datetime.now(timezone.utc)

        self._total_opened_pages += 1

        # Wrap StagehandPage to provide Playwright Page interface; `cast` only
        # affects static typing, the returned object is the wrapper
        return cast('Page', CrawleeStagehandPage(page))


class StagehandPlugin(PlaywrightBrowserPlugin):
    """Playwright browser plugin whose browsers are controlled through Stagehand."""

    @override
    def __init__(self, stagehand: Stagehand, **kwargs: Any) -> None:
        """Initialize the plugin.

        Args:
            stagehand: Stagehand instance handed over to every created controller.
            kwargs: Keyword arguments forwarded to `PlaywrightBrowserPlugin`.
        """
        super().__init__(**kwargs)

        self._stagehand = stagehand

    @override
    async def new_browser(self) -> StagehandBrowserController:
        """Create a persistent Chromium browser driven by a Stagehand controller.

        Raises:
            RuntimeError: If the plugin has not been initialized yet.
        """
        playwright = self._playwright
        if not playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        # Stagehand works with Chromium-based browsers only.
        persistent_browser = PlaywrightPersistentBrowser(
            playwright.chromium,
            self._user_data_dir,
            self._browser_launch_options,
        )

        # Hand the browser over to the custom Stagehand-aware controller.
        controller = StagehandBrowserController(
            browser=persistent_browser,
            stagehand=self._stagehand,
            header_generator=None,
            fingerprint_generator=None,
        )
        return controller


================================================
FILE: docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py
================================================
from __future__ import annotations

import asyncio
import os
from typing import cast

from stagehand import StagehandConfig, StagehandPage

from crawlee import ConcurrencySettings
from crawlee.browsers import BrowserPool
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

from .browser_classes import StagehandPlugin
from .support_classes import CrawleeStagehand


async def main() -> None:
    """Crawl crawlee.dev with a `PlaywrightCrawler` whose browsers use Stagehand."""
    # Create a Stagehand instance configured to run locally with a Gemini model.
    stagehand = CrawleeStagehand(
        StagehandConfig(
            env='LOCAL',
            model_name='google/gemini-2.5-flash-preview-05-20',
            model_api_key=os.getenv('GEMINI_API_KEY'),
        )
    )

    # Custom browser pool. Gives users full control over browsers used by the crawler.
    stagehand_pool = BrowserPool(
        plugins=[
            StagehandPlugin(stagehand, browser_launch_options={'headless': True})
        ],
    )

    # Create crawler with the custom browser pool using Stagehand.
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
        # Run at most 10 tasks per minute.
        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=10),
        browser_pool=stagehand_pool,
    )

    # Register the default request handler; it is called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # The cast exists only so that the IDE offers StagehandPage type hints.
        page = cast('StagehandPage', context.page)

        # Regular Playwright methods stay available on the page.
        playwright_title = await page.title()
        context.log.info(f'Playwright page title: {playwright_title}')

        # highlight-start
        # Use AI-powered extraction with natural language
        gemini_title = await page.extract('Extract page title')
        context.log.info(f'Gemini page title: {gemini_title}')
        # highlight-end

        await context.enqueue_links()

    # Start the crawl from the initial list of URLs.
    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from stagehand import Stagehand, StagehandPage

if TYPE_CHECKING:
    from types import TracebackType


class CrawleeStagehandPage:
    """StagehandPage wrapper for Crawlee.

    Attribute access that the wrapper does not define itself is delegated to the
    wrapped `StagehandPage`. `goto` is overridden to call the `goto` of the page
    stored inside the `StagehandPage`, and leaving the async context manager closes
    the wrapped page.
    """

    def __init__(self, page: StagehandPage) -> None:
        """Wrap `page`; it is stored as `self._page`."""
        self._page = page

    async def goto(
        self,
        url: str,
        *,
        referer: str | None = None,
        timeout: int | None = None,
        wait_until: str | None = None,
    ) -> Any:
        """Navigate to the specified URL.

        Args:
            url: Address to navigate to.
            referer: Forwarded unchanged to the underlying page's `goto`.
            timeout: Forwarded unchanged to the underlying page's `goto`.
            wait_until: Forwarded unchanged to the underlying page's `goto`.

        Returns:
            Whatever the underlying page's `goto` returns.
        """
        # Override goto to return navigation result that `PlaywrightCrawler` expects
        return await self._page._page.goto(  # noqa: SLF001
            url,
            referer=referer,
            timeout=timeout,
            wait_until=wait_until,
        )

    def __getattr__(self, name: str) -> Any:
        """Delegate all other attributes to the underlying StagehandPage.

        Raises:
            AttributeError: If the wrapped page is not set yet, or if the wrapped
                page does not have the attribute either.
        """
        # `__getattr__` only runs after normal lookup has failed. When `_page` itself
        # is missing (an instance created without `__init__`, as `copy` and `pickle`
        # do), reading `self._page` below would call `__getattr__` again and recurse
        # until `RecursionError`. Fail with a regular `AttributeError` instead so
        # that `hasattr` and friends keep working.
        if name == '_page':
            raise AttributeError(name)
        return getattr(self._page, name)

    async def __aenter__(self) -> CrawleeStagehandPage:
        """Enter the context manager."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Close the wrapped page when the context manager is left."""
        await self._page.close()


class CrawleeStagehand(Stagehand):
    """Stagehand wrapper for Crawlee to disable the launch of Playwright."""

    async def init(self) -> None:
        """Mark the instance as initialized without doing any other work."""
        # Skip Stagehand's own Playwright initialization
        # Let Crawlee's PlaywrightBrowserPlugin manage the browser lifecycle
        self._initialized = True


================================================
FILE: docs/guides/code_examples/proxy_management/inspecting_bs_example.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    """Crawl one page through the configured proxies and log the proxy used."""
    crawler = BeautifulSoupCrawler(
        # Proxy configuration handed to the crawler.
        proxy_configuration=ProxyConfiguration(
            proxy_urls=[
                'http://proxy-1.com/',
                'http://proxy-2.com/',
            ]
        ),
    )

    # Register the default request handler; it is called for every request.
    @crawler.router.default_handler
    async def log_proxy(context: BeautifulSoupCrawlingContext) -> None:
        # `proxy_info` is logged for the request currently being handled.
        context.log.info(f'Proxy for the current request: {context.proxy_info}')

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/proxy_management/inspecting_pw_example.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    """Crawl one page through the configured proxies and log the proxy used."""
    crawler = PlaywrightCrawler(
        # Proxy configuration handed to the crawler.
        proxy_configuration=ProxyConfiguration(
            proxy_urls=[
                'http://proxy-1.com/',
                'http://proxy-2.com/',
            ]
        ),
    )

    # Register the default request handler; it is called for every request.
    @crawler.router.default_handler
    async def log_proxy(context: PlaywrightCrawlingContext) -> None:
        # `proxy_info` is logged for the request currently being handled.
        context.log.info(f'Proxy for the current request: {context.proxy_info}')

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/proxy_management/integration_bs_example.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    """Crawl one page through the configured proxies and log its URL and title."""
    crawler = BeautifulSoupCrawler(
        # Proxy configuration handed to the crawler.
        proxy_configuration=ProxyConfiguration(
            proxy_urls=[
                'http://proxy-1.com/',
                'http://proxy-2.com/',
            ]
        ),
    )

    # Register the default request handler; it is called for every request.
    @crawler.router.default_handler
    async def log_page_data(context: BeautifulSoupCrawlingContext) -> None:
        # The title is `None` when the parsed page has no <title> tag.
        title_tag = context.soup.title
        data = {
            'url': context.request.url,
            'title': title_tag.string if title_tag else None,
        }
        context.log.info(f'Extracted data: {data}')

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/proxy_management/integration_pw_example.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    """Crawl one page through the configured proxies and log its URL and title."""
    crawler = PlaywrightCrawler(
        # Proxy configuration handed to the crawler.
        proxy_configuration=ProxyConfiguration(
            proxy_urls=[
                'http://proxy-1.com/',
                'http://proxy-2.com/',
            ]
        ),
    )

    # Register the default request handler; it is called for every request.
    @crawler.router.default_handler
    async def log_page_data(context: PlaywrightCrawlingContext) -> None:
        # Ask the browser page for its title, then log it next to the URL.
        page_title = await context.page.title()
        data = {'url': context.request.url, 'title': page_title}
        context.log.info(f'Extracted data: {data}')

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/proxy_management/quick_start_example.py
================================================
import asyncio

from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    """Request three proxy URLs in a row from a two-proxy `ProxyConfiguration`."""
    proxy_configuration = ProxyConfiguration(
        proxy_urls=[
            'http://proxy-1.com/',
            'http://proxy-2.com/',
        ]
    )

    # The proxy URLs are rotated in a round-robin, so the three calls give:
    #   http://proxy-1.com/, http://proxy-2.com/, http://proxy-1.com/
    proxy_urls = [await proxy_configuration.new_url() for _ in range(3)]


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/proxy_management/session_bs_example.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    """Create a crawler that is given both proxies and the session pool."""
    crawler = BeautifulSoupCrawler(
        # Proxy configuration handed to the crawler.
        proxy_configuration=ProxyConfiguration(
            proxy_urls=[
                'http://proxy-1.com/',
                'http://proxy-2.com/',
            ]
        ),
        # Turn on the session pool.
        use_session_pool=True,
    )

    # ...


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/proxy_management/session_pw_example.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    """Create a crawler that is given both proxies and the session pool."""
    crawler = PlaywrightCrawler(
        # Proxy configuration handed to the crawler.
        proxy_configuration=ProxyConfiguration(
            proxy_urls=[
                'http://proxy-1.com/',
                'http://proxy-2.com/',
            ]
        ),
        # Turn on the session pool.
        use_session_pool=True,
    )

    # ...


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/proxy_management/tiers_bs_example.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    """Crawl one page with tiered proxies and log the proxy used for the request."""
    # Tiers are listed from the lowest one to the highest one.
    tiered_proxies = ProxyConfiguration(
        tiered_proxy_urls=[
            # No proxy tier.
            # Optional in case you do not want to use any proxy on lowest tier.
            [None],
            # lower tier, cheaper, preferred as long as they work
            [
                'http://cheap-datacenter-proxy-1.com/',
                'http://cheap-datacenter-proxy-2.com/',
            ],
            # higher tier, more expensive, used as a fallback
            [
                'http://expensive-residential-proxy-1.com/',
                'http://expensive-residential-proxy-2.com/',
            ],
        ]
    )
    crawler = BeautifulSoupCrawler(proxy_configuration=tiered_proxies)

    # Register the default request handler; it is called for every request.
    @crawler.router.default_handler
    async def log_proxy(context: BeautifulSoupCrawlingContext) -> None:
        # `proxy_info` is logged for the request currently being handled.
        context.log.info(f'Proxy for the current request: {context.proxy_info}')

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/proxy_management/tiers_pw_example.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    """Crawl one page with tiered proxies and log the proxy used for the request."""
    # Tiers are listed from the lowest one to the highest one.
    tiered_proxies = ProxyConfiguration(
        tiered_proxy_urls=[
            # No proxy tier.
            # Optional in case you do not want to use any proxy on lowest tier.
            [None],
            # lower tier, cheaper, preferred as long as they work
            [
                'http://cheap-datacenter-proxy-1.com/',
                'http://cheap-datacenter-proxy-2.com/',
            ],
            # higher tier, more expensive, used as a fallback
            [
                'http://expensive-residential-proxy-1.com/',
                'http://expensive-residential-proxy-2.com/',
            ],
        ]
    )
    crawler = PlaywrightCrawler(proxy_configuration=tiered_proxies)

    # Register the default request handler; it is called for every request.
    @crawler.router.default_handler
    async def log_proxy(context: PlaywrightCrawlingContext) -> None:
        # `proxy_info` is logged for the request currently being handled.
        context.log.info(f'Proxy for the current request: {context.proxy_info}')

    # Start the crawl from the initial list of requests.
    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_loaders/rl_basic_example.py
================================================
import asyncio

from crawlee.request_loaders import RequestList


async def main() -> None:
    """Process every request of a static `RequestList`, one at a time."""
    # Create a named request list from a fixed set of URLs.
    request_list = RequestList(
        name='my-request-list',
        requests=[
            'https://apify.com/',
            'https://crawlee.dev/',
            'https://crawlee.dev/python/',
        ],
    )

    # Keep fetching until the list has no further request to hand out.
    while True:
        request = await request_list.fetch_next_request()
        if not request:
            break

        # Do something with it...
        print(f'Processing {request.url}')

        # And mark it as handled.
        await request_list.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_loaders/rl_basic_example_with_persist.py
================================================
import asyncio
import logging

from crawlee import service_locator
from crawlee.request_loaders import RequestList

# Log INFO and above, formatted as `<time>-<level>-<message>`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s')
logger = logging.getLogger(__name__)


# Disable clearing the `KeyValueStore` on each run.
# This is necessary so that the state keys are not cleared at startup.
# The recommended way to achieve this behavior is setting the environment variable
# `CRAWLEE_PURGE_ON_START=0`
# Note that this is module-level code, so it runs before `main()` is called.
configuration = service_locator.get_configuration()
configuration.purge_on_start = False


async def main() -> None:
    """Handle a single request from a `RequestList` set up with persistence keys."""
    # Create a named request list from a fixed set of URLs.
    request_list = RequestList(
        name='my-request-list',
        requests=[
            'https://apify.com/',
            'https://crawlee.dev/',
            'https://crawlee.dev/python/',
        ],
        # Enable persistence
        persist_state_key='my-persist-state',
        persist_requests_key='my-persist-requests',
    )

    # We receive only one request.
    # Each time you run it, it will be a new request until you exhaust the `RequestList`.
    request = await request_list.fetch_next_request()
    if not request:
        # Nothing was returned, so there is nothing to process.
        return

    logger.info(f'Processing request: {request.url}')
    # Do something with it...

    # And mark it as handled.
    await request_list.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_loaders/rl_tandem_example.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.request_loaders import RequestList


async def main() -> None:
    """Crawl with a `ParselCrawler` fed by a request list turned into a tandem."""
    # Static list of the start URLs.
    request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])

    # highlight-start
    # Convert the request list to a request manager using the to_tandem method.
    # It is a tandem with the default request queue.
    request_manager = await request_list.to_tandem()
    # highlight-end

    # The crawler takes its requests from the request manager.
    crawler = ParselCrawler(
        request_manager=request_manager,
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
    )

    @crawler.router.default_handler
    async def handle_page(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # New links will be enqueued directly to the queue.
        await context.enqueue_links()

        # Read the page title with an XPath query on the Parsel selector.
        title = context.selector.xpath('//title/text()').get()

        # Push extracted data to the dataset.
        await context.push_data({'url': context.request.url, 'title': title})

    # No start URLs are passed here.
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.request_loaders import RequestList, RequestManagerTandem
from crawlee.storages import RequestQueue


async def main() -> None:
    """Crawl with a `ParselCrawler` fed by an explicitly built `RequestManagerTandem`."""
    # Static list of the start URLs.
    request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])

    # Open the default request queue.
    request_queue = await RequestQueue.open()

    # And combine them together to a single request manager.
    request_manager = RequestManagerTandem(request_list, request_queue)

    # The crawler takes its requests from the request manager.
    crawler = ParselCrawler(
        request_manager=request_manager,
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
    )

    @crawler.router.default_handler
    async def handle_page(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # New links will be enqueued directly to the queue.
        await context.enqueue_links()

        # Read the page title with an XPath query on the Parsel selector.
        title = context.selector.xpath('//title/text()').get()

        # Push extracted data to the dataset.
        await context.push_data({'url': context.request.url, 'title': title})

    # No start URLs are passed here.
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_loaders/sitemap_basic_example.py
================================================
import asyncio
import re

from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    """Load 'docs' URLs from the crawlee.dev sitemap and process them one by one."""
    # Use the HTTP client and the sitemap loader as async context managers so that
    # both are closed when the work is done, even if processing a request raises.
    async with (
        # HTTP client for fetching the sitemap.
        ImpitHttpClient() as http_client,
        # Sitemap request loader with filtering rules.
        SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,
            include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
            max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
        ) as sitemap_loader,
    ):
        # We work with the loader until we process all relevant links from the sitemap.
        while request := await sitemap_loader.fetch_next_request():
            # Do something with it...
            print(f'Processing {request.url}')

            # And mark it as handled.
            await sitemap_loader.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_loaders/sitemap_example_with_persist.py
================================================
import asyncio
import logging

from crawlee import service_locator
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader

# Log INFO and above, formatted as `<time>-<level>-<message>`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s')
logger = logging.getLogger(__name__)


# Disable clearing the `KeyValueStore` on each run.
# This is necessary so that the state keys are not cleared at startup.
# The recommended way to achieve this behavior is setting the environment variable
# `CRAWLEE_PURGE_ON_START=0`
# Note that this is module-level code, so it runs before `main()` is called.
configuration = service_locator.get_configuration()
configuration.purge_on_start = False


async def main() -> None:
    """Handle a single sitemap request from a loader set up with a persistence key."""
    # Create an HTTP client for fetching sitemaps
    async with ImpitHttpClient() as http_client:
        # Use the context manager for `SitemapRequestLoader` to correctly save the
        # state when the work is completed.
        loader_context = SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,
            # Enable persistence
            persist_state_key='my-persist-state',
        )
        async with loader_context as sitemap_loader:
            # We receive only one request.
            # Each time you run it, it will be a new request until you exhaust
            # the sitemap.
            request = await sitemap_loader.fetch_next_request()
            if request:
                logger.info(f'Processing request: {request.url}')
                # Do something with it...

                # And mark it as handled.
                await sitemap_loader.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_loaders/sitemap_tandem_example.py
================================================
import asyncio
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    """Crawl with a `ParselCrawler` fed by a sitemap loader turned into a tandem."""
    # Sitemap request loader with filtering rules.
    sitemap_loader = SitemapRequestLoader(
        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
        http_client=ImpitHttpClient(),  # HTTP client for fetching the sitemap.
        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
    )

    # highlight-start
    # Convert the sitemap loader into a request manager linked
    # to the default request queue.
    request_manager = await sitemap_loader.to_tandem()
    # highlight-end

    # The crawler takes its requests from the request manager.
    crawler = ParselCrawler(
        request_manager=request_manager,
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
    )

    @crawler.router.default_handler
    async def handle_page(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # New links will be enqueued directly to the queue.
        await context.enqueue_links()

        # Read the page title with an XPath query on the Parsel selector.
        title = context.selector.xpath('//title/text()').get()

        # Push extracted data to the dataset.
        await context.push_data({'url': context.request.url, 'title': title})

    # No start URLs are passed here.
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py
================================================
import asyncio
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader
from crawlee.storages import RequestQueue


async def main() -> None:
    """Crawl with a `ParselCrawler` fed by a sitemap loader and a request queue."""
    # Sitemap request loader with filtering rules.
    sitemap_loader = SitemapRequestLoader(
        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
        http_client=ImpitHttpClient(),  # HTTP client for fetching the sitemap.
        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
    )

    # Combine the loader with the default request queue into a single request manager.
    request_manager = RequestManagerTandem(sitemap_loader, await RequestQueue.open())

    # The crawler takes its requests from the request manager.
    crawler = ParselCrawler(
        request_manager=request_manager,
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
    )

    @crawler.router.default_handler
    async def handle_page(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        # New links will be enqueued directly to the queue.
        await context.enqueue_links()

        # Read the page title with an XPath query on the Parsel selector.
        title = context.selector.xpath('//title/text()').get()

        # Push extracted data to the dataset.
        await context.push_data({'url': context.request.url, 'title': title})

    # No start URLs are passed here.
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_router/adaptive_crawler_handlers.py
================================================
import asyncio

from crawlee import HttpHeaders
from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)


async def main() -> None:
    """Crawl with `AdaptivePlaywrightCrawler` using a shared and a browser-only hook."""
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
    )

    @crawler.pre_navigation_hook
    async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # Common pre-navigation hook - runs for both HTTP and browser requests.
        accept_header = HttpHeaders({'Accept': 'text/html,application/xhtml+xml'})
        context.request.headers |= accept_header

    @crawler.pre_navigation_hook(playwright_only=True)
    async def browser_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # Playwright-specific pre-navigation hook - runs only when browser is used.
        await context.page.set_viewport_size({'width': 1280, 'height': 720})
        # Block CSS and JS requests, but only when `block_requests` is set.
        if context.block_requests:
            await context.block_requests(extra_url_patterns=['*.css', '*.js'])

    @crawler.router.default_handler
    async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        # The unified context interface exposes the parsed page in both modes.
        parsed = context.parsed_content

        # The title stays `None` when the page has no <title> tag.
        title_tag = parsed.find('title')
        title = title_tag.get_text() if title_tag else None

        # Collect the `href` of every anchor that has one.
        anchors = parsed.find_all('a', href=True)
        links = [anchor.get('href') for anchor in anchors]

        record = {
            'url': context.request.url,
            'title': title,
            'links': links,
        }
        await context.push_data(record)

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_router/basic_request_handlers.py
================================================
import asyncio

from crawlee import Request
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.router import Router


async def main() -> None:
    """Crawl a demo shop with label-based handlers for categories and products."""
    # Custom router; handlers are registered on it below.
    router = Router[ParselCrawlingContext]()

    # Default handler (fallback for requests without specific labels)
    @router.default_handler
    async def default_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing homepage: {context.request.url}')

        homepage = {
            'url': context.request.url,
            # A placeholder is stored when no title text is found.
            'title': context.selector.css('title::text').get() or 'No title found',
            'page_type': 'homepage',
        }
        await context.push_data(homepage)

        # Find and enqueue collection/category links
        await context.enqueue_links(selector='a[href*="/collections/"]', label='CATEGORY')

    # Handler for category pages
    @router.handler('CATEGORY')
    async def category_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing category page: {context.request.url}')

        # Extract category information
        heading = context.selector.css('h1::text').get()
        product_items = context.selector.css('.product-item').getall()
        category = {
            'url': context.request.url,
            'type': 'category',
            'category_title': heading or 'Unknown Category',
            'product_count': len(product_items),
            'handler': 'category',
        }
        await context.push_data(category)

        # Enqueue product links from this category
        await context.enqueue_links(selector='a[href*="/products/"]', label='PRODUCT')

    # Handler for product detail pages
    @router.handler('PRODUCT')
    async def product_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing product page: {context.request.url}')

        # Extract detailed product information and push it right away.
        page = context.selector
        await context.push_data(
            {
                'url': context.request.url,
                'name': page.css('h1::text').get(),
                'price': page.css('.price::text').get(),
                'description': page.css('.product-description p::text').get(),
                'images': page.css('.product-gallery img::attr(src)').getall(),
                'in_stock': bool(page.css('.add-to-cart-button').get()),
                'handler': 'product',
            }
        )

    # Create crawler with the router
    crawler = ParselCrawler(
        request_handler=router,
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
    )

    # Start crawling with some initial requests
    await crawler.run(
        [
            # Will use default handler
            'https://warehouse-theme-metal.myshopify.com/',
            # Will use category handler
            Request.from_url(
                'https://warehouse-theme-metal.myshopify.com/collections/all',
                label='CATEGORY',
            ),
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_router/custom_router_default_only.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.router import Router


async def main() -> None:
    """Crawl the demo shop using a standalone router with only a default handler."""
    # The router is created independently of the crawler and handed over below.
    router = Router[ParselCrawlingContext]()

    @router.default_handler
    async def default_handler(context: ParselCrawlingContext) -> None:
        """Store the page URL and title, then enqueue product links."""
        context.log.info(f'Processing {context.request.url}')

        # Use a placeholder when the page has no <title> text.
        page_title = context.selector.css('title::text').get()
        if not page_title:
            page_title = 'No title found'

        record = {'url': context.request.url, 'title': page_title}
        await context.push_data(record)

        # No handler is registered for the 'PRODUCT' label, so these requests
        # end up in this default handler as well.
        await context.enqueue_links(selector='a[href*="/products/"]', label='PRODUCT')

    # Cap the crawl at 10 requests and route everything through the custom router.
    crawler = ParselCrawler(request_handler=router, max_requests_per_crawl=10)

    start_urls = ['https://warehouse-theme-metal.myshopify.com/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_router/error_handler.py
================================================
import asyncio

from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext
from crawlee.errors import HttpStatusCodeError

# HTTP status code constants
TOO_MANY_REQUESTS = 429


async def main() -> None:
    """Run a ParselCrawler with an error handler that inspects each failure."""
    # Cap the crawl at 10 requests.
    crawler = ParselCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def default_handler(context: ParselCrawlingContext) -> None:
        """Store product name and price; raise when the product name is missing."""
        context.log.info(f'Processing {context.request.url}')

        name_query = 'h1[data-testid="product-title"]::text'
        product_name = context.selector.css(name_query).get()
        # A page without a product title is reported as a parse failure.
        if not product_name:
            raise ValueError('Product name not found - might be a non-product page')

        record = {
            'url': context.request.url,
            'product_name': product_name,
            'price': context.selector.css('.price::text').get(),
        }
        await context.push_data(record)

    @crawler.error_handler
    async def error_handler(context: BasicCrawlingContext, error: Exception) -> None:
        """Log the error type and mark parse failures as not retryable."""
        context.log.warning(
            f'Error occurred for {context.request.url}: {type(error).__name__}'
        )

        # The request or context can be adjusted here before the retry happens.
        rate_limited = (
            isinstance(error, HttpStatusCodeError)
            and error.status_code == TOO_MANY_REQUESTS
        )
        if rate_limited:
            # This is the place to tweak headers, add a delay, etc.
            context.log.info('Rate limited - will retry with delay')
            return

        if isinstance(error, ValueError):
            context.log.info('Parse error - marking request as no retry')
            context.request.no_retry = True

    start_urls = [
        'https://warehouse-theme-metal.myshopify.com/products/on-running-cloudmonster-2-mens',
        # A collection page; the product-title lookup might fail here.
        'https://warehouse-theme-metal.myshopify.com/collections/mens-running',
    ]
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_router/failed_request_handler.py
================================================
import asyncio

from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext


async def main() -> None:
    """Run a ParselCrawler that records requests which failed after all retries."""
    crawler = ParselCrawler(
        max_requests_per_crawl=10,  # Cap the crawl at 10 requests.
        max_request_retries=2,  # Two retries are allowed before a request fails.
    )

    @crawler.router.default_handler
    async def default_handler(context: ParselCrawlingContext) -> None:
        """Store product name and price, using placeholders for missing values."""
        context.log.info(f'Processing {context.request.url}')

        selector = context.selector
        # Prefer the dedicated title element, then any <h1>, then a placeholder.
        product_name = (
            selector.css('h1[data-testid="product-title"]::text').get()
            or selector.css('h1::text').get()
            or 'Unknown Product'
        )
        price = selector.css('.price::text').get() or 'Price not available'

        record = {
            'url': context.request.url,
            'product_name': product_name,
            'price': price,
            'status': 'success',
        }
        await context.push_data(record)

    @crawler.failed_request_handler
    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
        """Log the final failure and push a record describing it to the dataset."""
        failed_request = context.request
        context.log.error(
            f'Failed to process {context.request.url} after all retries: {error}'
        )

        # Keep the details of the failure so it can be analysed later.
        failure_record = {
            'failed_url': failed_request.url,
            'label': failed_request.label,
            'error_type': type(error).__name__,
            'error_message': str(error),
            'retry_count': failed_request.retry_count,
            'status': 'failed',
        }
        await context.push_data(failure_record)

    # A mix of start URLs; some of them are expected to fail.
    start_urls = [
        'https://warehouse-theme-metal.myshopify.com/products/on-running-cloudmonster-2-mens',
        # This will likely fail
        'https://warehouse-theme-metal.myshopify.com/invalid-url',
        'https://warehouse-theme-metal.myshopify.com/products/valid-product',
    ]
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_router/http_pre_navigation.py
================================================
import asyncio

from crawlee import HttpHeaders
from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext


async def main() -> None:
    """Run a ParselCrawler that adds custom headers in a pre-navigation hook."""
    # Cap the crawl at 10 requests.
    crawler = ParselCrawler(max_requests_per_crawl=10)

    @crawler.pre_navigation_hook
    async def setup_request(context: BasicCrawlingContext) -> None:
        """Merge the custom headers into the request before it is sent."""
        custom_headers = HttpHeaders(
            {
                'User-Agent': 'Crawlee Bot 1.0',
                'Accept': 'text/html,application/xhtml+xml',
            },
        )
        context.request.headers |= custom_headers

    @crawler.router.default_handler
    async def default_handler(context: ParselCrawlingContext) -> None:
        """Store the URL and the title of the fetched page."""
        record = {
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        }
        await context.push_data(record)

    start_urls = ['https://warehouse-theme-metal.myshopify.com/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_router/playwright_pre_navigation.py
================================================
import asyncio

from crawlee.crawlers import (
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
    PlaywrightPreNavCrawlingContext,
)


async def main() -> None:
    """Run a PlaywrightCrawler that prepares every page in a pre-navigation hook."""
    # Cap the crawl at 10 requests.
    crawler = PlaywrightCrawler(max_requests_per_crawl=10)

    @crawler.pre_navigation_hook
    async def setup_page(context: PlaywrightPreNavCrawlingContext) -> None:
        """Fix the viewport, block heavy resources and set an extra header."""
        page = context.page

        # A fixed viewport keeps rendering consistent between pages.
        viewport = {'width': 1280, 'height': 720}
        await page.set_viewport_size(viewport)

        # Images, styles, fonts and tracker URLs are blocked to speed up crawling.
        blocked_patterns = [
            '*.png',
            '*.jpg',
            '*.jpeg',
            '*.gif',
            '*.svg',
            '*.css',
            '*.woff',
            '*.woff2',
            '*.ttf',
            '*google-analytics*',
            '*facebook*',
            '*twitter*',
        ]
        await context.block_requests(extra_url_patterns=blocked_patterns)

        # Send a custom User-Agent header with the page's requests.
        extra_headers = {'User-Agent': 'Mozilla/5.0 (compatible; Crawlee Bot)'}
        await page.set_extra_http_headers(extra_headers)

    @crawler.router.default_handler
    async def default_handler(context: PlaywrightCrawlingContext) -> None:
        """Store the URL and the title of the loaded page."""
        page_title = await context.page.title()
        record = {'url': context.request.url, 'title': page_title}
        await context.push_data(record)

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_router/simple_default_handler.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    """Crawl the demo shop using only the crawler's built-in default handler."""
    # Cap the crawl at 10 requests.
    crawler = ParselCrawler(max_requests_per_crawl=10)

    # The handler is registered on the router that the crawler already owns.
    @crawler.router.default_handler
    async def default_handler(context: ParselCrawlingContext) -> None:
        """Store the page URL and title, then enqueue product links."""
        context.log.info(f'Processing {context.request.url}')

        # Use a placeholder when the page has no <title> text.
        page_title = context.selector.css('title::text').get()
        if not page_title:
            page_title = 'No title found'

        record = {'url': context.request.url, 'title': page_title}
        await context.push_data(record)

        # Queue product links, labelled 'PRODUCT', for further crawling.
        await context.enqueue_links(
            selector='a[href*="/products/"]',
            label='PRODUCT',
        )

    start_urls = ['https://warehouse-theme-metal.myshopify.com/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/running_in_web_server/__init__.py
================================================


================================================
FILE: docs/guides/code_examples/running_in_web_server/crawler.py
================================================
import asyncio
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from typing import TypedDict

from fastapi import FastAPI

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


class State(TypedDict):
    """State available in the app.

    This is the shape of the dictionary yielded by `lifespan`.
    """

    # The crawler instance created in `lifespan`.
    crawler: ParselCrawler
    # Maps a string key to the future that will carry the result dictionary;
    # `lifespan`'s request handler looks entries up by `request.unique_key`.
    requests_to_results: dict[str, asyncio.Future[dict[str, str]]]


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[State]:
    """Run a keep-alive crawler in the background for the lifetime of the app.

    Startup creates the crawler and the shared result dictionary, starts the crawler
    as a background task and yields both objects as the app state. Shutdown stops the
    crawler and waits for the background task to finish.

    Args:
        app: The FastAPI application; only its title is used, for logging.

    Yields:
        The state dictionary with the `crawler` and `requests_to_results` entries.
    """
    # Results will be stored in this dictionary, keyed by the request's unique key.
    requests_to_results = dict[str, asyncio.Future[dict[str, str]]]()

    crawler = ParselCrawler(
        # Keep the crawler alive even when there are no more requests to process now.
        # This makes the crawler wait for more requests to be added later.
        keep_alive=True
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        title = context.selector.xpath('//title/text()').get() or ''

        # Resolve the future registered under this request's unique key so that
        # whoever is awaiting it receives the extracted data.
        requests_to_results[context.request.unique_key].set_result(
            {
                'title': title,
            }
        )

    # Start the crawler without awaiting it to finish
    crawler.log.info(f'Starting crawler for the {app.title}')
    run_task = asyncio.create_task(crawler.run([]))

    try:
        # Make the crawler and the result dictionary available in the app state
        yield {'crawler': crawler, 'requests_to_results': requests_to_results}
    finally:
        # Runs on shutdown, including when an exception is raised into the context
        # manager at the `yield`; otherwise the background task would be left
        # running without ever being stopped.
        crawler.stop()
        # Wait for the crawler to finish
        await run_task


================================================
FILE: docs/guides/code_examples/running_in_web_server/server.py
================================================
from __future__ import annotations

import asyncio
from uuid import uuid4

from fastapi import FastAPI
from starlette.requests import Request
from starlette.responses import HTMLResponse

import crawlee

from .crawler import lifespan

app = FastAPI(lifespan=lifespan, title='Crawler app')


@app.get('/', response_class=HTMLResponse)
def index() -> str:
    """Return the static landing page that links to an example `/scrape` call."""
    landing_page = """
<!DOCTYPE html>
<html>
<body>
    <h1>Scraper server</h1>
        <p>To scrape some page, visit "scrape" endpoint with url parameter.
            For example:
            <a href="/scrape?url=https://www.example.com">
                /scrape?url=https://www.example.com
            </a>
        </p>
</body>
</html>
"""
    return landing_page


@app.get('/scrape')
async def scrape_url(request: Request, url: str | None = None) -> dict:
    """Scrape `url` with the shared crawler and return the extracted result.

    A future is registered under a random unique key, a request carrying the same
    key is added to the crawler, and the future is awaited until it is resolved.

    Args:
        request: Incoming request; its state exposes `crawler` and
            `requests_to_results`.
        url: Address of the page to scrape.

    Returns:
        A dictionary with the `url` and `scrape result` keys. When `url` is missing
        or empty, a placeholder response is returned without touching the crawler.
    """
    if not url:
        return {'url': 'missing', 'scrape result': 'no results'}

    # Generate random unique key for the request
    unique_key = str(uuid4())

    # Set the result future in the result dictionary so that it can be awaited
    pending_results = request.state.requests_to_results
    pending_results[unique_key] = asyncio.Future[dict[str, str]]()

    try:
        # Add the request to the crawler queue
        await request.state.crawler.add_requests(
            [crawlee.Request.from_url(url, unique_key=unique_key)]
        )

        # Wait for the result future to be finished
        # NOTE(review): nothing visible here resolves the future when the crawl of
        # this URL fails, so this wait may never finish in that case - confirm.
        result = await pending_results[unique_key]
    finally:
        # Always drop the entry, even when enqueueing fails or the wait is
        # cancelled; otherwise the future would stay in the shared dictionary.
        pending_results.pop(unique_key, None)

    # Return the result
    return {'url': url, 'scrape result': result}


================================================
FILE: docs/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py
================================================
import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import BeautifulSoupCrawler


async def main() -> None:
    """Create a BeautifulSoupCrawler whose throughput is capped per minute."""
    crawler = BeautifulSoupCrawler(
        concurrency_settings=ConcurrencySettings(
            # At most 100 requests may run concurrently.
            max_concurrency=100,
            # No more than 10 requests in total are started per minute, to avoid
            # overwhelming the target website.
            max_tasks_per_minute=10,
        ),
    )

    # ...


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py
================================================
import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import BeautifulSoupCrawler


async def main() -> None:
    """Create a BeautifulSoupCrawler with explicit concurrency bounds."""
    crawler = BeautifulSoupCrawler(
        concurrency_settings=ConcurrencySettings(
            # Begin with 8 concurrent tasks, as long as resources are available.
            desired_concurrency=8,
            # Keep at least 5 concurrent tasks to ensure steady crawling.
            min_concurrency=5,
            # Never exceed 10 concurrent tasks to prevent overloading the system.
            max_concurrency=10,
        ),
    )

    # ...


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_conflicts.py
================================================
import asyncio

from crawlee import service_locator
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient


async def main() -> None:
    """Show the conflict caused by replacing an already retrieved storage client."""
    # Register an in-memory storage client through the service locator.
    service_locator.set_storage_client(MemoryStorageClient())

    # Retrieve the storage client that was just registered.
    retrieved_client = service_locator.get_storage_client()

    # Setting a different storage client at this point raises ServiceConflictError,
    # because the storage client has already been retrieved.
    service_locator.set_storage_client(FileSystemStorageClient())


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_crawler_configuration.py
================================================
import asyncio
from datetime import timedelta

from crawlee.configuration import Configuration
from crawlee.crawlers import ParselCrawler


async def main() -> None:
    """Hand a custom configuration to the crawler through its constructor."""
    # The configuration is registered via the crawler instead of the service locator.
    crawler = ParselCrawler(
        configuration=Configuration(
            log_level='DEBUG',
            headless=False,
            persist_state_interval=timedelta(seconds=30),
        ),
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_crawler_event_manager.py
================================================
import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler
from crawlee.events import LocalEventManager


async def main() -> None:
    """Hand a custom event manager to the crawler through its constructor."""
    # The event manager is registered via the crawler instead of the service locator.
    crawler = ParselCrawler(
        event_manager=LocalEventManager(
            system_info_interval=timedelta(seconds=5),
        ),
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_crawler_storage_client.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    """Hand an in-memory storage client to the crawler through its constructor."""
    # The storage client is registered via the crawler instead of the service locator.
    crawler = ParselCrawler(storage_client=MemoryStorageClient())


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_locator_configuration.py
================================================
import asyncio
from datetime import timedelta

from crawlee import service_locator
from crawlee.configuration import Configuration


async def main() -> None:
    """Register a custom configuration through the service locator."""
    settings = {
        'log_level': 'DEBUG',
        'headless': False,
        'persist_state_interval': timedelta(seconds=30),
    }

    # Hand the configuration to the service locator.
    service_locator.set_configuration(Configuration(**settings))


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_locator_event_manager.py
================================================
import asyncio
from datetime import timedelta

from crawlee import service_locator
from crawlee.events import LocalEventManager


async def main() -> None:
    """Register a custom event manager through the service locator."""
    info_interval = timedelta(seconds=5)

    # Hand the event manager to the service locator.
    service_locator.set_event_manager(
        LocalEventManager(system_info_interval=info_interval)
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_locator_storage_client.py
================================================
import asyncio

from crawlee import service_locator
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    """Register an in-memory storage client through the service locator."""
    # Hand the storage client to the service locator.
    service_locator.set_storage_client(MemoryStorageClient())


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_storage_configuration.py
================================================
import asyncio
from datetime import timedelta

from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    """Open datasets using the global configuration and an explicit one."""
    custom_configuration = Configuration(
        log_level='DEBUG',
        headless=False,
        persist_state_interval=timedelta(seconds=30),
    )

    # Make the custom configuration the global default.
    service_locator.set_configuration(custom_configuration)

    # This dataset (or any other storage) relies on the global defaults.
    default_dataset = await Dataset.open()

    # This one receives the storage client and configuration explicitly, for cases
    # where relying on the global defaults is not desired.
    explicit_dataset = await Dataset.open(
        storage_client=MemoryStorageClient(),
        configuration=custom_configuration,
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_storage_storage_client.py
================================================
import asyncio

from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    """Open a dataset with an explicitly provided in-memory storage client."""
    # The storage client is passed directly to the dataset (or other storage)
    # at the moment it is opened.
    dataset = await Dataset.open(storage_client=MemoryStorageClient())


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/multi_sessions_http.py
================================================
import asyncio
from collections.abc import Callable
from datetime import timedelta
from itertools import count

from crawlee import ConcurrencySettings, Request
from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
from crawlee.errors import RequestCollisionError
from crawlee.sessions import Session, SessionPool


# Define a function for creating sessions with simple logic for unique `id` generation.
# This is necessary if you need to specify a particular session for the first request,
# for example during authentication
def create_session_function() -> Callable[[], Session]:
    """Build a session factory that hands out sequential string ids.

    Returns:
        A zero-argument callable. Each call creates a new `Session` whose id is the
        next number of a counter starting at 0, rendered as a string.
    """
    # One id stream per factory: '0', '1', '2', ...
    session_ids = map(str, count())

    def create_session() -> Session:
        session_id = next(session_ids)
        return Session(
            id=session_id,
            max_usage_count=999_999,
            max_age=timedelta(hours=999_999),
            max_error_score=100,
            blocked_status_codes=[403],
        )

    return create_session


async def main() -> None:
    """Run an HttpCrawler whose requests are bound to specific sessions.

    One initialization request is created for every session in the pool. Each
    initialization handler then enqueues a follow-up request bound to the same
    session.
    """
    # A single constant drives both the pool size and the number of initialization
    # requests. `create_session_function` numbers sessions from '0', so the ids
    # requested below must cover 0..pool_size-1 to line up with those sessions.
    pool_size = 10

    crawler = HttpCrawler(
        # Adjust request limits according to your pool size
        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=500),
        # Requests are bound to specific sessions, no rotation needed
        max_session_rotations=0,
        session_pool=SessionPool(
            max_pool_size=pool_size,
            create_session_function=create_session_function(),
        ),
    )

    @crawler.router.default_handler
    async def basic_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    # Initialize the session and bind the next request to this session if needed
    @crawler.router.handler(label='session_init')
    async def session_init(context: HttpCrawlingContext) -> None:
        next_requests = []
        if context.session:
            context.log.info(f'Init session {context.session.id}')
            next_request = Request.from_url(
                'https://a.placeholder.com', session_id=context.session.id
            )
            next_requests.append(next_request)

        await context.add_requests(next_requests)

    # Handle errors when a session is blocked and no longer available in the pool
    # when attempting to execute requests bound to it
    @crawler.failed_request_handler
    async def error_processing(context: BasicCrawlingContext, error: Exception) -> None:
        if isinstance(error, RequestCollisionError) and context.session:
            context.log.error(
                f'Request {context.request.url} failed, because the bound '
                'session is unavailable'
            )

    # Create a pool of requests bound to their respective sessions.
    # All of them target the same URL, so `use_extended_unique_key=True` is set;
    # presumably the extended key takes the session id into account so the requests
    # are not deduplicated - confirm against the `Request.from_url` documentation.
    # `always_enqueue=True` is an alternative when session initialization happens
    # on a non-unique address, such as the site's main page.
    init_requests = [
        Request.from_url(
            'https://example.org/',
            label='session_init',
            session_id=str(session_id),
            use_extended_unique_key=True,
        )
        for session_id in range(pool_size)
    ]

    await crawler.run(init_requests)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/one_session_http.py
================================================
import asyncio
from datetime import timedelta

from crawlee import ConcurrencySettings, Request
from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
from crawlee.errors import SessionError
from crawlee.sessions import SessionPool


async def main() -> None:
    """Run an HttpCrawler that works with a single long-lived session."""
    session_settings = {
        # Large usage limit for the session
        'max_usage_count': 999_999,
        # Large lifetime for the session
        'max_age': timedelta(hours=999_999),
        # A high score lets the session run into more errors before crawlee
        # decides it is blocked; make sure you know how to handle these errors
        'max_error_score': 100,
        # A 403 status usually indicates you're already blocked
        'blocked_status_codes': [403],
    }

    # The pool holds exactly one session.
    single_session_pool = SessionPool(
        max_pool_size=1,
        create_session_settings=session_settings,
    )

    crawler = HttpCrawler(
        # Fewer requests per minute reduce the chance of being blocked
        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=50),
        # Session rotation is disabled
        max_session_rotations=0,
        session_pool=single_session_pool,
    )

    # Basic request handling logic
    @crawler.router.default_handler
    async def basic_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    # Session initialization (authentication, initial cookies, etc.)
    @crawler.router.handler(label='session_init')
    async def session_init(context: HttpCrawlingContext) -> None:
        if not context.session:
            return
        context.log.info(f'Init session {context.session.id}')

    # Stop the crawler explicitly as soon as the session gets blocked
    @crawler.error_handler
    async def error_processing(context: BasicCrawlingContext, error: Exception) -> None:
        session_blocked = isinstance(error, SessionError) and context.session
        if session_blocked:
            context.log.info(f'Session {context.session.id} blocked')
            crawler.stop()

    init_request = Request.from_url('https://example.org/', label='session_init')
    await crawler.run([init_request])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/sm_basic.py
================================================
import asyncio
import re

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    """Run a BasicCrawler that reports session health based on the page title."""
    # Proxy usage must be turned on for the proxy IP session rotation logic.
    proxy_configuration = ProxyConfiguration(
        # options
    )

    # The crawler gets a custom SessionPool that manages concurrent sessions
    # and proxy rotation.
    crawler = BasicCrawler(
        proxy_configuration=proxy_configuration,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        # Overrides default Session pool configuration.
        session_pool=SessionPool(max_pool_size=100),
    )

    @crawler.router.default_handler
    async def default_handler(context: BasicCrawlingContext) -> None:
        """Fetch the page and update the session state according to its title."""
        # BasicCrawler automatically selects a session from the pool and sets a
        # proxy for it; see `context.session` and `context.proxy_info`.
        response = await context.send_request(context.request.url)

        raw_body = await response.read()
        page_content = raw_body.decode()
        title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

        # Nothing to report without a session.
        if not context.session:
            return

        # Nothing to report without a non-empty title either.
        title = title_match.group(1) if title_match else None
        if not title:
            return

        if title == 'Blocked':
            context.session.retire()
        elif title == 'Not sure if blocked, might also be a connection error':
            context.session.mark_bad()
        else:
            context.session.mark_good()  # BasicCrawler handles this automatically.

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/sm_beautifulsoup.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    """Run a BeautifulSoupCrawler that reports session health from the page title."""
    # Proxy usage must be turned on for the proxy IP session rotation logic.
    proxy_configuration = ProxyConfiguration(
        # options
    )

    # The crawler gets a custom SessionPool that manages concurrent sessions
    # and proxy rotation.
    crawler = BeautifulSoupCrawler(
        proxy_configuration=proxy_configuration,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        # Overrides default Session pool configuration.
        session_pool=SessionPool(max_pool_size=100),
    )

    @crawler.router.default_handler
    async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Update the session state based on the title of the parsed page."""
        title = context.soup.title.get_text() if context.soup.title else None

        # Nothing to report without a session.
        if not context.session:
            return

        if title == 'Blocked':
            context.session.retire()
            return

        if title == 'Not sure if blocked, might also be a connection error':
            context.session.mark_bad()
            return

        context.session.mark_good()  # BasicCrawler handles this automatically.

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/sm_http.py
================================================
import asyncio
import re

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    """Crawl with HttpCrawler and steer sessions by the page title."""
    # Proxy usage has to be turned on for the proxy IP session rotation logic.
    proxy_config = ProxyConfiguration(
        # options
    )

    # A custom SessionPool overrides the default Session pool configuration.
    pool = SessionPool(max_pool_size=100)

    # The crawler combines the proxy configuration with the custom pool
    # to manage concurrent sessions and proxy rotation.
    crawler = HttpCrawler(
        proxy_configuration=proxy_config,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        session_pool=pool,
    )

    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        """Find the page title with a regex and update the session state."""
        raw_body = await context.http_response.read()
        page_content = raw_body.decode()
        title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)
        title = title_match.group(1) if title_match else None

        # The session is only touched when it exists and a non-empty title was found.
        if not (context.session and title):
            return

        if title == 'Blocked':
            context.session.retire()
            return

        if title == 'Not sure if blocked, might also be a connection error':
            context.session.mark_bad()
            return

        # BasicCrawler handles this automatically.
        context.session.mark_good()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/sm_parsel.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    """Crawl with ParselCrawler and steer sessions by the page title."""
    # Proxy usage has to be turned on for the proxy IP session rotation logic.
    proxy_config = ProxyConfiguration(
        # options
    )

    # A custom SessionPool overrides the default Session pool configuration.
    pool = SessionPool(max_pool_size=100)

    # The crawler combines the proxy configuration with the custom pool
    # to manage concurrent sessions and proxy rotation.
    crawler = ParselCrawler(
        proxy_configuration=proxy_config,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        session_pool=pool,
    )

    @crawler.router.default_handler
    async def default_handler(context: ParselCrawlingContext) -> None:
        """Update the session state based on the title of the fetched page."""
        title_nodes = context.selector.css('title::text')
        title = title_nodes.get()

        session = context.session
        if not session:
            # No session is attached to this request, so there is nothing to update.
            return

        if title == 'Blocked':
            session.retire()
            return

        if title == 'Not sure if blocked, might also be a connection error':
            session.mark_bad()
            return

        # BasicCrawler handles this automatically.
        session.mark_good()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/sm_playwright.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    """Crawl with PlaywrightCrawler and steer sessions by the page title."""
    # Proxy usage has to be turned on for the proxy IP session rotation logic.
    proxy_config = ProxyConfiguration(
        # options
    )

    # A custom SessionPool overrides the default Session pool configuration.
    pool = SessionPool(max_pool_size=100)

    # The crawler combines the proxy configuration with the custom pool
    # to manage concurrent sessions and proxy rotation.
    crawler = PlaywrightCrawler(
        proxy_configuration=proxy_config,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        session_pool=pool,
    )

    @crawler.router.default_handler
    async def default_handler(context: PlaywrightCrawlingContext) -> None:
        """Update the session state based on the title of the loaded page."""
        title = await context.page.title()

        session = context.session
        if not session:
            # No session is attached to this request, so there is nothing to update.
            return

        if title == 'Blocked':
            session.retire()
            return

        if title == 'Not sure if blocked, might also be a connection error':
            session.mark_bad()
            return

        # BasicCrawler handles this automatically.
        session.mark_good()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/sm_standalone.py
================================================
import asyncio

from crawlee.sessions import SessionPool


async def main() -> None:
    """Use a SessionPool on its own, without any crawler."""
    # These arguments override the default Session pool configuration.
    pool = SessionPool(
        max_pool_size=100,
        create_session_settings={'max_usage_count': 10, 'blocked_status_codes': [403]},
    )

    async with pool as session_pool:
        session = await session_pool.get_session()

        # Increase the error_score.
        session.mark_bad()

        # Throw away the session.
        session.retire()

        # Lower the error_score and mark the session good.
        session.mark_good()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storage_clients/custom_storage_client_example.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee.storage_clients import StorageClient
from crawlee.storage_clients._base import (
    DatasetClient,
    KeyValueStoreClient,
    RequestQueueClient,
)

if TYPE_CHECKING:
    from crawlee.configuration import Configuration

# Implement the storage type clients with your backend logic.


class CustomDatasetClient(DatasetClient):
    """Custom dataset client stub.

    Implement methods like push_data, get_data, iterate_items, etc.
    """


class CustomKeyValueStoreClient(KeyValueStoreClient):
    """Custom key-value store client stub.

    Implement methods like get_value, set_value, delete, etc.
    """


class CustomRequestQueueClient(RequestQueueClient):
    """Custom request queue client stub.

    Implement methods like add_request, fetch_next_request, etc.
    """


# Implement the storage client factory.


class CustomStorageClient(StorageClient):
    """Storage client factory stub that hands out the custom storage type clients."""

    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        configuration: Configuration | None = None,
    ) -> CustomDatasetClient:
        """Create and return your custom dataset client."""

    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        configuration: Configuration | None = None,
    ) -> CustomKeyValueStoreClient:
        """Create and return your custom key-value store client."""

    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        configuration: Configuration | None = None,
    ) -> CustomRequestQueueClient:
        """Create and return your custom request queue client."""


================================================
FILE: docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py
================================================
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import FileSystemStorageClient

# Instantiate the file system storage client.
storage_client = FileSystemStorageClient()

# Hand the storage client over to the crawler.
crawler = ParselCrawler(
    storage_client=storage_client,
)


================================================
FILE: docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py
================================================
from crawlee.configuration import Configuration
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import FileSystemStorageClient

# Custom settings: the storage directory and no purging on start.
configuration = Configuration(storage_dir='./my_storage', purge_on_start=False)

# Instantiate the file system storage client.
storage_client = FileSystemStorageClient()

# Hand both the storage client and the configuration over to the crawler.
crawler = ParselCrawler(storage_client=storage_client, configuration=configuration)


================================================
FILE: docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py
================================================
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import MemoryStorageClient

# Instantiate the memory storage client.
storage_client = MemoryStorageClient()

# Hand the storage client over to the crawler.
crawler = ParselCrawler(
    storage_client=storage_client,
)


================================================
FILE: docs/guides/code_examples/storage_clients/redis_storage_client_basic_example.py
================================================
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import RedisStorageClient

# Instantiate the storage client from a connection string.
# 'redis://localhost:6379' is just a placeholder; replace it with your actual
# connection string.
storage_client = RedisStorageClient(
    connection_string='redis://localhost:6379',
)

# Hand the storage client over to the crawler.
crawler = ParselCrawler(
    storage_client=storage_client,
)


================================================
FILE: docs/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py
================================================
from redis.asyncio import Redis

from crawlee.configuration import Configuration
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import RedisStorageClient

# Custom settings: do not purge on start.
configuration = Configuration(purge_on_start=False)

# Instantiate the storage client from a Redis client with custom settings.
# Replace host and port with your actual Redis server configuration.
# Other Redis client settings can be adjusted as needed.
storage_client = RedisStorageClient(
    redis=Redis(
        host='localhost',
        port=6379,
        retry_on_timeout=True,
        socket_keepalive=True,
        socket_connect_timeout=10,
    )
)

# Hand both the storage client and the configuration over to the crawler.
crawler = ParselCrawler(storage_client=storage_client, configuration=configuration)


================================================
FILE: docs/guides/code_examples/storage_clients/registering_storage_clients_example.py
================================================
import asyncio

from crawlee import service_locator
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    """Show three ways of supplying a storage client."""
    # MemoryStorageClient stands in for a custom storage client here.
    memory_client = MemoryStorageClient()

    # Option 1: register it globally via the service locator.
    service_locator.set_storage_client(memory_client)

    # Option 2: pass it directly to the crawler; under the hood it will be
    # registered globally to the service locator.
    crawler = ParselCrawler(storage_client=memory_client)

    # Option 3: provide it when opening a storage (e.g. dataset); it will be
    # used for this storage only, not globally.
    dataset = await Dataset.open(name='my-dataset', storage_client=memory_client)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storage_clients/sql_storage_client_basic_example.py
================================================
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import SqlStorageClient


async def main() -> None:
    """Create a SqlStorageClient with default settings and pass it to a crawler."""
    # This will create an SQLite database file crawlee.db, or create tables in
    # your database if you pass `connection_string` or `engine`.
    sql_client = SqlStorageClient()

    # The context manager ensures that connections are properly cleaned up.
    async with sql_client as storage_client:
        # Hand the storage client over to the crawler.
        crawler = ParselCrawler(
            storage_client=storage_client,
        )


================================================
FILE: docs/guides/code_examples/storage_clients/sql_storage_client_configuration_example.py
================================================
from sqlalchemy.ext.asyncio import create_async_engine

from crawlee.configuration import Configuration
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import SqlStorageClient


async def main() -> None:
    """Create a SqlStorageClient from a custom engine and pass it to a crawler."""
    # An `engine` with the desired configuration.
    engine = create_async_engine(
        'postgresql+asyncpg://myuser:mypassword@localhost:5432/postgres',
        future=True,
        pool_size=5,
        max_overflow=10,
        pool_recycle=3600,
        pool_pre_ping=True,
        echo=False,
    )

    # On first run, tables are also created in your PostgreSQL database.
    # The context manager ensures that connections are properly cleaned up.
    async with SqlStorageClient(engine=engine) as storage_client:
        # Custom settings: do not purge on start.
        configuration = Configuration(purge_on_start=False)

        # Hand both the storage client and the configuration over to the crawler.
        crawler = ParselCrawler(
            storage_client=storage_client,
            configuration=configuration,
        )


================================================
FILE: docs/guides/code_examples/storages/cleaning_do_not_purge_example.py
================================================
import asyncio

from crawlee.configuration import Configuration
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Run an HttpCrawler without purging the storage on start."""
    # With purge_on_start set to False, the storage is not purged on start.
    # highlight-next-line
    configuration = Configuration(purge_on_start=False)

    # The crawler receives the configuration through its constructor.
    crawler = HttpCrawler(
        configuration=configuration,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        """Log the URL of the request being processed."""
        context.log.info(f'Processing {context.request.url} ...')

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py
================================================
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    """Purge a named dataset explicitly and then drop it."""
    # Open the dataset named 'my-dataset'.
    dataset = await Dataset.open(
        name='my-dataset',
    )

    # Explicit purging removes all items from the dataset,
    # but keeps the dataset itself and its metadata.
    await dataset.purge()

    # Dropping removes the dataset completely, together with all its items.
    await dataset.drop()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/dataset_basic_example.py
================================================
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    """Open a dataset, push rows into it, read them back and drop it."""
    # The dataset is created if it does not exist yet.
    # Leave name empty to use the default dataset.
    dataset = await Dataset.open(name='my-dataset')

    # A single row of data.
    single_row = {'foo': 'bar'}
    await dataset.push_data(single_row)

    # Multiple rows of data (anything JSON-serializable can be pushed).
    several_rows = [{'foo': 'bar2', 'col2': 'val2'}, {'col3': 123}]
    await dataset.push_data(several_rows)

    # Fetch all data from the dataset.
    data = await dataset.get_data()
    # Do something with it...

    # Finally, remove the dataset.
    await dataset.drop()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/dataset_with_crawler_example.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl a page, push its data via the context helper and export it to CSV."""
    # Any subclass of BasicCrawler can be used here.
    crawler = BeautifulSoupCrawler()

    # The default request handler is called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Extract the URL and title and push them to the default dataset."""
        context.log.info(f'Processing {context.request.url} ...')

        title_tag = context.soup.title

        # The extracted data goes to the (default) dataset.
        await context.push_data(
            {
                'url': context.request.url,
                'title': title_tag.string if title_tag else None,
            }
        )

    # Start the crawl from the initial URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)

    # Write the dataset out to a file.
    await crawler.export_data(path='dataset.csv')


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storages import Dataset


async def main() -> None:
    """Crawl a page, push its data to an explicitly opened dataset and export it."""
    # The dataset is created if it does not exist yet.
    # Leave name empty to use the default dataset.
    dataset = await Dataset.open(name='my-dataset')

    # Any subclass of BasicCrawler can be used here.
    crawler = BeautifulSoupCrawler()

    # The default request handler is called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Extract the URL and title and push them to the opened dataset."""
        context.log.info(f'Processing {context.request.url} ...')

        title_tag = context.soup.title

        # The extracted data goes to the dataset opened above.
        await dataset.push_data(
            {
                'url': context.request.url,
                'title': title_tag.string if title_tag else None,
            }
        )

    # Start the crawl from the initial URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)

    # Export the dataset to the key-value store.
    await dataset.export_to(key='dataset', content_type='csv')


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/helper_add_requests_example.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Add a request from inside the request handler via the context helper."""
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Log the processed URL and add one more request."""
        context.log.info(f'Processing {context.request.url} ...')
        # highlight-next-line
        await context.add_requests(['https://apify.com/'])

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/helper_enqueue_links_example.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Enqueue links from inside the request handler via the context helper."""
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Log the processed URL and call the enqueue_links helper."""
        context.log.info(f'Processing {context.request.url} ...')
        # highlight-next-line
        await context.enqueue_links()

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/kvs_basic_example.py
================================================
import asyncio

from crawlee.storages import KeyValueStore


async def main() -> None:
    """Open a key-value store, write, read and delete a value, then drop the store."""
    # Open the key-value store, if it does not exist, it will be created.
    # Leave name empty to use the default KVS.
    kvs = await KeyValueStore.open(name='my-key-value-store')

    # Set a value associated with 'some-key'.
    await kvs.set_value(key='some-key', value={'foo': 'bar'})

    # Get the value associated with 'some-key'. `get_value` is a coroutine
    # function, so it must be awaited to obtain the stored value.
    value = await kvs.get_value('some-key')
    # Do something with it...

    # Delete the value associated with 'some-key' by setting it to None.
    await kvs.set_value(key='some-key', value=None)

    # Remove the key-value store.
    await kvs.drop()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/kvs_with_crawler_example.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Capture page screenshots and store them in the KVS obtained from the context."""
    # A new Playwright crawler.
    crawler = PlaywrightCrawler()

    # The default request handler is called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Take a screenshot of the page and save it under a URL-derived key."""
        context.log.info(f'Processing {context.request.url} ...')

        # The screenshot is captured with Playwright's API.
        screenshot = await context.page.screenshot()

        # The last '/'-separated part of the URL becomes part of the key.
        url_parts = context.request.url.split('/')
        name = url_parts[-1]

        # Get the key-value store from the context. If it does not exist,
        # it will be created. Leave name empty to use the default KVS.
        kvs = await context.get_key_value_store()

        # Save the screenshot into the key-value store.
        await kvs.set_value(
            key=f'screenshot-{name}',
            value=screenshot,
            content_type='image/png',
        )

    # Start the crawl from the initial URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import KeyValueStore


async def main() -> None:
    """Capture page screenshots and store them in an explicitly opened KVS."""
    # The key-value store is created if it does not exist yet.
    # Leave name empty to use the default KVS.
    kvs = await KeyValueStore.open(name='my-key-value-store')

    # A new Playwright crawler.
    crawler = PlaywrightCrawler()

    # The default request handler is called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Take a screenshot of the page and save it under a URL-derived key."""
        context.log.info(f'Processing {context.request.url} ...')

        # The screenshot is captured with Playwright's API.
        screenshot = await context.page.screenshot()

        # The last '/'-separated part of the URL becomes part of the key.
        url_parts = context.request.url.split('/')
        name = url_parts[-1]

        # Save the screenshot into the key-value store opened above.
        await kvs.set_value(
            key=f'screenshot-{name}',
            value=screenshot,
            content_type='image/png',
        )

    # Start the crawl from the initial URLs.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/opening.py
================================================
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    """Open a dataset in three ways: by name, by alias and as the default one."""
    # Opened by name: named storage (persists across runs).
    dataset_named = await Dataset.open(
        name='my-persistent-dataset',
    )

    # Opened by alias: unnamed storage (purged on start).
    dataset_unnamed = await Dataset.open(
        alias='temporary-results',
    )

    # Opened without arguments: default unnamed storage (purged on start).
    dataset_default = await Dataset.open()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/rq_basic_example.py
================================================
import asyncio

from crawlee.storages import RequestQueue


async def main() -> None:
    """Open a request queue, fill it, process its requests and drop it."""
    # The request queue is created if it does not exist yet.
    # Leave name empty to use the default request queue.
    request_queue = await RequestQueue.open(name='my-request-queue')

    # A single request.
    await request_queue.add_request('https://apify.com/')

    # Multiple requests as a batch.
    batch = ['https://crawlee.dev/', 'https://crawlee.dev/python/']
    await request_queue.add_requests(batch)

    # Keep fetching until the queue returns nothing to process.
    while True:
        request = await request_queue.fetch_next_request()
        if not request:
            break

        # Do something with it...

        # And mark it as handled.
        await request_queue.mark_request_as_handled(request)

    # Finally, remove the request queue.
    await request_queue.drop()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/rq_with_crawler_example.py
================================================
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Add requests through the crawler and context helpers, then run the crawler."""
    # Any subclass of BasicCrawler can be used here. Request queue is a default
    # request manager, it will be opened, and fully managed if not specified.
    crawler = HttpCrawler()

    # The default request handler is called for every request.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        """Log the processed URL and add a new request from the handler."""
        context.log.info(f'Processing {context.request.url} ...')

        # The context's add_requests helper adds new requests from the handler.
        await context.add_requests(['https://crawlee.dev/python/'])

    # The crawler's add_requests helper adds new requests before the run.
    await crawler.add_requests(['https://apify.com/'])

    # Run the crawler. The list of initial requests is optional.
    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py
================================================
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.storages import RequestQueue


async def main() -> None:
    """Fill an explicitly opened request queue and let a crawler process it."""
    # The request queue is created if it does not exist yet.
    # Leave name empty to use the default request queue.
    request_queue = await RequestQueue.open(name='my-request-queue')

    # The queue can be used directly, e.g. to add a batch of requests.
    initial_urls = ['https://apify.com/', 'https://crawlee.dev/']
    await request_queue.add_requests(initial_urls)

    # Any subclass of BasicCrawler can be used here. The request queue is passed
    # as the request manager and will be managed by the crawler.
    crawler = HttpCrawler(
        request_manager=request_queue,
    )

    # The default request handler is called for every request.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        """Log the URL of the request being processed."""
        context.log.info(f'Processing {context.request.url} ...')

    # Execute the crawler.
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/trace_and_monitor_crawlers/instrument_crawler.py
================================================
import asyncio

from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.trace import set_tracer_provider

from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext
from crawlee.otel import CrawlerInstrumentor
from crawlee.storages import Dataset, KeyValueStore, RequestQueue


def instrument_crawler() -> None:
    """Set up OpenTelemetry tracing and instrument the crawler with it."""
    # Attributes describing this service in the exported telemetry.
    service_attributes = {
        'service.name': 'ExampleCrawler',
        'service.version': '1.0.0',
        'environment': 'development',
    }
    resource = Resource.create(service_attributes)

    # Tracer provider that sends spans through the OTLP exporter.
    tracer_provider = TracerProvider(resource=resource)
    span_exporter = OTLPSpanExporter(endpoint='localhost:4317', insecure=True)
    span_processor = SimpleSpanProcessor(span_exporter)
    tracer_provider.add_span_processor(span_processor)
    set_tracer_provider(tracer_provider)

    # Instrument the crawler, including the listed storage classes.
    instrumentor = CrawlerInstrumentor(
        instrument_classes=[RequestQueue, KeyValueStore, Dataset]
    )
    instrumentor.instrument()


async def main() -> None:
    """Instrument the crawler and run it."""
    instrument_crawler()

    crawler = ParselCrawler(max_requests_per_crawl=100)
    store = await KeyValueStore.open()

    @crawler.pre_navigation_hook
    async def pre_nav_hook(_: BasicCrawlingContext) -> None:
        """Simulate some pre-navigation processing with a short sleep."""
        await asyncio.sleep(0.01)

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        """Push the URL to the dataset, save it to the KVS and enqueue links."""
        url = context.request.url
        await context.push_data({'url': url})
        await store.set_value(key='url', value=url)
        await context.enqueue_links()

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/crawler_login.mdx
================================================
---
id: logging-in-with-a-crawler
title: Logging in with a crawler
description: How to log in to websites with Crawlee.
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import PlaywrightLogin from '!!raw-loader!roa-loader!./code_examples/login_crawler/playwright_login.py';
import HttpLogin from '!!raw-loader!roa-loader!./code_examples/login_crawler/http_login.py';

Many websites require authentication to access their content. This guide demonstrates how to implement login functionality using both <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>.

## Session management for authentication

When implementing authentication, you'll typically want to maintain the same <ApiLink to="class/Session">`Session`</ApiLink> throughout your crawl to preserve login state. This requires proper configuration of the <ApiLink to="class/SessionPool">`SessionPool`</ApiLink>. For more details, see our [session management guide](./session-management).

If your use case requires multiple authenticated sessions with different credentials, you can:
- Use the `new_session_function` parameter in <ApiLink to="class/SessionPool#__init__">`SessionPool`</ApiLink> to customize session creation.
- Specify the `session_id` parameter in <ApiLink to="class/Request#from_url">`Request`</ApiLink> to bind specific requests to particular sessions.

For this guide, we'll use [demoqa.com](https://demoqa.com/login), a testing site designed for automation practice that provides a login form and protected content.

## Login with Playwright crawler

The following example demonstrates how to authenticate on a website using <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, which provides browser automation capabilities for filling out login forms.

<RunnableCodeBlock className="language-python" language="python">
    {PlaywrightLogin}
</RunnableCodeBlock>

## Login with HTTP crawler

You can also use <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> (or its more specific variants like <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>) to authenticate by sending a POST <ApiLink to="class/Request">`Request`</ApiLink> with your credentials directly to the authentication endpoint.

HTTP-based authentication often varies significantly between websites. Using browser [DevTools](https://developer.chrome.com/docs/devtools/overview) to analyze the `Network` tab during manual login can help you understand the specific authentication flow, required headers, and body parameters for your target website.

<RunnableCodeBlock className="language-python" language="python">
    {HttpLogin}
</RunnableCodeBlock>


================================================
FILE: docs/guides/creating_web_archive.mdx
================================================
---
id: creating-web-archive
title: Creating web archive
description: How to create a Web ARChive (WARC) with Crawlee
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import PlaywrightCrawlerRecordThroughProxy from '!!raw-loader!./code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py';
import ParselCrawlerRecordManual from '!!raw-loader!./code_examples/creating_web_archive/manual_archiving_parsel_crawler.py';
import PlaywrightCrawlerRecordManual from '!!raw-loader!./code_examples/creating_web_archive/manual_archiving_playwright_crawler.py';

Archiving webpages is one of the tasks that a web crawler can be used for. There are various use cases, such as archiving for future reference, speeding up web crawler development, creating top-level regression tests for web crawlers and so on.

There are various existing libraries of web archives with massive amount of data stored during their years of existence, for example [Wayback Machine](https://web.archive.org/) or [Common Crawl](https://commoncrawl.org/). There are also dedicated tools for archiving web pages, to name some: simple browser extensions such as [Archive Webpage](https://archiveweb.page/), open source tools such as [pywb](https://pypi.org/project/pywb/) or [warcio](https://pypi.org/project/warcio/), or even web crawlers specialized in archiving such as [Browsertrix](https://webrecorder.net/browsertrix/).

The common file format used for archiving is [WARC](https://www.iso.org/standard/68004.html). Crawlee does not offer any out-of-the-box functionality to create WARC files, but in this guide, we will show examples of approaches that can be easily used in your use case to create WARC files with Crawlee.

## Crawling through proxy recording server

This approach can be especially attractive as it requires almost no code changes to the crawler itself, and the correct WARC creation is handled by code from the well-maintained [pywb](https://pypi.org/project/pywb/) package. The trick is to run a properly configured [wayback proxy server](https://pywb.readthedocs.io/en/latest/manual/usage.html#using-pywb-recorder), use it as a proxy for the crawler and record any traffic. Another advantage of this approach is that it is language agnostic. This way, you can record both your Python-based crawler and your JavaScript-based crawler. This is very straightforward and a good place to start.

This approach expects that you have already created your crawler, and that you just want to archive all the pages it is visiting during its crawl.

Install [pywb](https://pypi.org/project/pywb/) which will allow you to use `wb-manager` and `wayback` commands.
Create a new collection that will be used for this archiving session and start the wayback server:
```bash
wb-manager init example-collection
wayback --record --live -a --auto-interval 10 --proxy example-collection --proxy-record
```
Instead of passing many configuration arguments to the `wayback` command, you can configure the server by adding configuration options to `config.yaml`. See the details in the [documentation](https://pywb.readthedocs.io/en/latest/manual/configuring.html#configuring-the-web-archive).

### Configure the crawler

Now you should use this locally hosted server as a proxy in your crawler. There are two more steps before starting the crawler:
 - Make the crawler use the proxy server.
 - Deal with the [pywb Certificate Authority](https://pywb.readthedocs.io/en/latest/manual/configuring.html#https-proxy-and-pywb-certificate-authority).

For example, in <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, this is the simplest setup, which takes the shortcut and ignores the CA-related errors:

<CodeBlock className="language-python">
    {PlaywrightCrawlerRecordThroughProxy}
</CodeBlock>

After you run the crawler, you will be able to see the archived data in the wayback collection directory, for example `.../collections/example-collection/archive`. You can then access the recorded pages directly in the proxy recording server or use it with any other WARC-compatible tool.

## Manual WARC creation

A different approach is to create WARC files manually in the crawler, which gives you full control over the WARC files. This is a much more complex and low-level approach, as you have to ensure that all the relevant data is collected and correctly stored, and that the archiving functions are called at the right time. This is by no means a trivial task, and the example archiving functions below are just the simplest examples, which will be insufficient for many real-world use cases. You will need to extend and improve them to properly fit your specific needs.

### Simple crawlers

With non-browser crawlers such as <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> you will not be able to create a high-fidelity archive of the page, as you will be missing all the JavaScript dynamic content. However, you can still create a WARC file with the HTML content of the page, which can be sufficient for some use cases. Let's take a look at the example below:
<CodeBlock className="language-python">
    {ParselCrawlerRecordManual}
</CodeBlock>

The example above is calling an archiving function on each request using the `request_handler`.

### Browser-based crawlers

With browser crawlers such as <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> you should be able to create a high-fidelity archive of a web page. Let's take a look at the example below:

<CodeBlock className="language-python">
    {PlaywrightCrawlerRecordManual}
</CodeBlock>

The example above is adding an archiving callback on each response in the pre_navigation `archiving_hook`. This ensures that additional resources requested by the browser are also archived.

## Using the archived data

In the following section, we will describe an example use case showing how you can use the recorded WARC files to speed up the development of your web crawler. The idea is to use the archived data as a source of responses for your crawler so that you can test it against the real data without having to crawl the web again.

It is assumed that you already have the WARC files. If not, please read the previous sections on how to create them first.

Let's use pywb again. This time we will not use it as a recording server, but as a proxy server that will serve the previously archived pages to your crawler in development.

```bash
wb-manager init example-collection
wb-manager add example-collection /your_path_to_warc_file/example.warc.gz
wayback --proxy example-collection
```

Previous commands start the wayback server that allows crawler requests to be served from the archived pages in the `example-collection` instead of sending requests to the real website. This is again [proxy mode of the wayback server](https://pywb.readthedocs.io/en/latest/manual/usage.html#http-s-proxy-mode-access), but without recording capability. Now you need to [configure your crawler](#configure-the-crawler) to use this proxy server, which was already described above. Once everything is finished, you can just run your crawler, and it will crawl the offline archived version of the website from your WARC file.

You can also manually browse the archived pages in the wayback server by going to the locally hosted server and entering the collection and URL of the archived page, for example: `http://localhost:8080/example-collection/https://crawlee.dev/`. The wayback server will serve the page from the WARC file if it exists, or it will return a 404 error if it does not. For more details about the server, please refer to the [pywb documentation](https://pywb.readthedocs.io/en/latest/manual/usage.html#getting-started).

If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord](https://discord.com/invite/jyEM2PRvMU) community.


================================================
FILE: docs/guides/error_handling.mdx
================================================
---
id: error-handling
title: Error handling
description: How to handle errors that occur during web crawling.
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_handling/handle_proxy_error.py';
import ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py';
import DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py';

This guide demonstrates techniques for handling common errors encountered during web crawling operations.

## Handling proxy errors

Low-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in <ApiLink to="class/BasicCrawlerOptions">`BasicCrawlerOptions`</ApiLink>. If you can't get data because of proxy errors, you might want to try again. You can do this using <ApiLink to="class/BasicCrawler#failed_request_handler">`failed_request_handler`</ApiLink>:

<RunnableCodeBlock className="language-python" language="python">
    {HandleProxyError}
</RunnableCodeBlock>

You can use this same approach when testing different proxy providers. To better manage this process, you can count proxy errors and [stop the crawler](../examples/crawler-stop) if you get too many.

## Changing how error status codes are handled

By default, when <ApiLink to="class/Session">`Sessions`</ApiLink> get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee marks the <ApiLink to="class/Session">`Session`</ApiLink> as retired and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [Session management guide](./session-management).

Here's an example of how to change this behavior:

<RunnableCodeBlock className="language-python" language="python">
    {ChangeHandleErrorStatus}
</RunnableCodeBlock>

## Turning off retries for non-network errors

Sometimes you might get unexpected errors when parsing data, like when a website has an unusual structure. Crawlee normally tries again based on your `max_request_retries` setting, but sometimes you don't want that.

Here's how to turn off retries for non-network errors using <ApiLink to="class/BasicCrawler#error_handler">`error_handler`</ApiLink>, which runs before Crawlee tries again:

<RunnableCodeBlock className="language-python" language="python">
    {DisableRetry}
</RunnableCodeBlock>


================================================
FILE: docs/guides/http_clients.mdx
================================================
---
id: http-clients
title: HTTP clients
description: Learn about Crawlee's HTTP client architecture, how to switch between different implementations, and create custom HTTP clients for specialized web scraping needs.
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import ParselHttpxExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_httpx_example.py';
import ParselCurlImpersonateExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_curl_impersonate_example.py';
import ParselImpitExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_impit_example.py';

HTTP clients are utilized by HTTP-based crawlers (e.g., <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> and <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>) to communicate with web servers. They use external HTTP libraries for communication rather than a browser. Examples of such libraries include [httpx](https://pypi.org/project/httpx/), [aiohttp](https://pypi.org/project/aiohttp/), [curl-cffi](https://pypi.org/project/curl-cffi/), and [impit](https://apify.github.io/impit/). After retrieving page content, an HTML parsing library is typically used to facilitate data extraction. Examples of such libraries include [beautifulsoup](https://pypi.org/project/beautifulsoup4/), [parsel](https://pypi.org/project/parsel/), [selectolax](https://pypi.org/project/selectolax/), and [pyquery](https://pypi.org/project/pyquery/). These crawlers are faster than browser-based crawlers but cannot execute client-side JavaScript.

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Abstract classes
%% ========================

class HttpClient {
    <<abstract>>
}

%% ========================
%% Specific classes
%% ========================

class ImpitHttpClient

class HttpxHttpClient

class CurlImpersonateHttpClient

%% ========================
%% Inheritance arrows
%% ========================

HttpClient <|-- ImpitHttpClient
HttpClient <|-- HttpxHttpClient
HttpClient <|-- CurlImpersonateHttpClient
```

## Switching between HTTP clients

Crawlee currently provides three main HTTP clients: <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>, which uses the `impit` library, <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, which uses the `httpx` library with `browserforge` for custom HTTP headers and fingerprints, and <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>. For more details on anti-blocking features, see our [avoid getting blocked guide](./avoid-blocking).

Below are examples of how to configure the HTTP client for the <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>:

<Tabs>
    <TabItem value="ParselHttpxExample" label="ParselCrawler with HTTPX">
        <RunnableCodeBlock className="language-python" language="python">
            {ParselHttpxExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="ParselCurlImpersonateExample" label="ParselCrawler with curl-cffi">
        <RunnableCodeBlock className="language-python" language="python">
            {ParselCurlImpersonateExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="ParselImpitExample" label="ParselCrawler with impit">
        <RunnableCodeBlock className="language-python" language="python">
            {ParselImpitExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

## Installation requirements

Since <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink> is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages.

For <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, you need to install Crawlee with the `curl-impersonate` extra:

```sh
python -m pip install 'crawlee[curl-impersonate]'
```

For <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, you need to install Crawlee with the `httpx` extra:

```sh
python -m pip install 'crawlee[httpx]'
```

Alternatively, you can install all available extras to get access to all HTTP clients and features:

```sh
python -m pip install 'crawlee[all]'
```

## Creating custom HTTP clients

Crawlee provides an abstract base class, <ApiLink to="class/HttpClient">`HttpClient`</ApiLink>, which defines the interface that all HTTP clients must implement. This allows you to create custom HTTP clients tailored to your specific requirements.

HTTP clients are responsible for several key operations:

- sending HTTP requests and receiving responses,
- managing cookies and sessions,
- handling headers and authentication,
- managing proxy configurations,
- connection pooling with timeout management.

To create a custom HTTP client, you need to inherit from the <ApiLink to="class/HttpClient">`HttpClient`</ApiLink> base class and implement all required abstract methods. Your implementation must be async-compatible and include proper cleanup and resource management to work seamlessly with Crawlee's concurrent processing model.

## Conclusion

This guide introduced you to the HTTP clients available in Crawlee and demonstrated how to switch between them, including their installation requirements and usage examples. You also learned about the responsibilities of HTTP clients and how to implement your own custom HTTP client by inheriting from the <ApiLink to="class/HttpClient">`HttpClient`</ApiLink> base class.

If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!


================================================
FILE: docs/guides/http_crawlers.mdx
================================================
---
id: http-crawlers
title: HTTP crawlers
description: Learn about Crawlee's HTTP crawlers including BeautifulSoup, Parsel, and raw HTTP crawlers for efficient server-rendered content extraction without JavaScript execution.
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import CodeBlock from '@theme/CodeBlock';

import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/beautifulsoup_example.py';
import ParselExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/parsel_example.py';
import HttpExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/http_example.py';

import LxmlParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_parser.py';
import LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_saxonche_parser.py';
import LexborParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lexbor_parser.py';
import PyqueryParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/pyquery_parser.py';
import ScraplingParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/scrapling_parser.py';

import SelectolaxParserSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_parser.py';
import SelectolaxContextSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_context.py';
import SelectolaxCrawlerSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler.py';
import SelectolaxCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler_run.py';
import AdaptiveCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_adaptive_run.py';

HTTP crawlers are ideal for extracting data from server-rendered websites that don't require JavaScript execution. These crawlers make requests via HTTP clients to fetch HTML content and then parse it using various parsing libraries. For client-side rendered content, where you need to execute JavaScript, consider using [Playwright crawler](https://crawlee.dev/python/docs/guides/playwright-crawler) instead.

## Overview

All HTTP crawlers share a common architecture built around the <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> base class. The main differences lie in the parsing strategy and the context provided to request handlers. The built-in implementations are <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>, and <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>. The base class can also be extended to create custom crawlers with specialized parsing requirements. These crawlers use HTTP clients to fetch page content and parsing libraries to extract data from the HTML. Check out the [HTTP clients guide](./http-clients) to learn about the HTTP clients used by these crawlers, how to switch between them, and how to create custom HTTP clients tailored to your specific requirements.

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Abstract classes
%% ========================

class BasicCrawler {
    <<abstract>>
}

class AbstractHttpCrawler {
    <<abstract>>
}

%% ========================
%% Specific classes
%% ========================

class HttpCrawler

class ParselCrawler

class BeautifulSoupCrawler

%% ========================
%% Inheritance arrows
%% ========================

BasicCrawler <|-- AbstractHttpCrawler
AbstractHttpCrawler <|-- HttpCrawler
AbstractHttpCrawler <|-- ParselCrawler
AbstractHttpCrawler <|-- BeautifulSoupCrawler
```

## BeautifulSoupCrawler

The <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> uses the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) library for HTML parsing. It provides fault-tolerant parsing that handles malformed HTML, automatic character encoding detection, and supports CSS selectors, tag navigation, and custom search functions. Use this crawler when working with imperfect HTML structures, when you prefer BeautifulSoup's intuitive API, or when prototyping web scraping solutions.

<RunnableCodeBlock className="language-python" language="python">
    {BeautifulSoupExample}
</RunnableCodeBlock>

## ParselCrawler

The <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> uses the [Parsel](https://parsel.readthedocs.io/) library, which provides XPath 1.0 and CSS selector support built on `lxml` for high performance. It includes built-in regex support for pattern matching, proper XML namespace handling, and offers better performance than BeautifulSoup while maintaining a clean API. Use this crawler when you need XPath functionality, require high-performance parsing, or need to extract data using regular expressions.

<RunnableCodeBlock className="language-python" language="python">
    {ParselExample}
</RunnableCodeBlock>

## HttpCrawler

The <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> provides direct access to HTTP response body and headers without automatic parsing, offering maximum performance with no parsing overhead. It supports any content type (JSON, XML, binary) and allows complete control over response processing, including memory-efficient handling of large responses. Use this crawler when working with non-HTML content, requiring maximum performance, implementing custom parsing logic, or needing access to raw response data.

<RunnableCodeBlock className="language-python" language="python">
    {HttpExample}
</RunnableCodeBlock>

### Using custom parsers

Since <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> provides raw HTTP responses, you can integrate any parsing library. Note that helpers like <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> and <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> are not available with this approach.

The following examples demonstrate how to integrate with several popular parsing libraries, including [lxml](https://lxml.de/) (high-performance parsing with XPath 1.0), [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) (XPath 3.1 support), [selectolax](https://github.com/rushter/selectolax) (high-speed CSS selectors), [PyQuery](https://pyquery.readthedocs.io/) (jQuery-like syntax), and [scrapling](https://github.com/D4Vinci/Scrapling) (a Scrapy/Parsel-style API offering BeautifulSoup-like methods).

<Tabs groupId="custom_parsers">
    <TabItem value="lxml" label="lxml">
        <RunnableCodeBlock className="language-python" language="python">
            {LxmlParser}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="saxonche" label="lxml with SaxonC-HE">
        <RunnableCodeBlock className="language-python" language="python">
            {LxmlSaxoncheParser}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="selectolax" label="selectolax">
        <RunnableCodeBlock className="language-python" language="python">
            {LexborParser}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="pyquery" label="PyQuery">
        <RunnableCodeBlock className="language-python" language="python">
            {PyqueryParser}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="scrapling" label="Scrapling">
        <RunnableCodeBlock className="language-python" language="python">
            {ScraplingParser}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

## Custom HTTP crawler

While the built-in crawlers cover most use cases, you might need a custom HTTP crawler for specialized parsing requirements. To create a custom HTTP crawler, inherit directly from <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink>. This approach requires implementing:

1. **Custom parser class**: Inherit from <ApiLink to="class/AbstractHttpParser">`AbstractHttpParser`</ApiLink>.
2. **Custom context class**: Define what data and helpers are available to handlers.
3. **Custom crawler class**: Tie everything together.

This approach is recommended when you need tight integration between parsing and the crawling context, or when you're building a reusable crawler for a specific technology or format.

The following example demonstrates how to create a custom crawler using `selectolax` with the `Lexbor` engine.

### Parser implementation

The parser converts HTTP responses into a parsed document and provides methods for element selection. Implement <ApiLink to="class/AbstractHttpParser">`AbstractHttpParser`</ApiLink> using `selectolax` with required methods for parsing and querying:

<CodeBlock className="language-python" language="python" title="selectolax_parser.py">
    {SelectolaxParserSource}
</CodeBlock>

This is enough to use your parser with `AbstractHttpCrawler.create_parsed_http_crawler_class` factory method. For more control, continue with custom context and crawler classes below.

### Crawling context definition (optional)

The crawling context is passed to request handlers and provides access to the parsed content. Extend <ApiLink to="class/ParsedHttpCrawlingContext">`ParsedHttpCrawlingContext`</ApiLink> to define the interface your handlers will work with. Here you can implement additional helpers for the crawler context.

<CodeBlock className="language-python" language="python" title="selectolax_context.py">
    {SelectolaxContextSource}
</CodeBlock>

### Crawler composition

The crawler class connects the parser and context. Extend <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> and configure the context pipeline to use your custom components:

<CodeBlock className="language-python" language="python" title="selectolax_crawler.py">
    {SelectolaxCrawlerSource}
</CodeBlock>

### Crawler usage

The custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>. Additionally, the custom parser can be used with <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> for adaptive crawling:

<Tabs groupId="crawlers">
    <TabItem value="selectolax_crawler" label="SelectolaxCrawler">
        <CodeBlock className="language-python" language="python">
            {SelectolaxCrawlerRunSource}
        </CodeBlock>
    </TabItem>
    <TabItem value="adaptive_playwright_crawler" label="AdaptivePlaywrightCrawler with SelectolaxParser">
        <CodeBlock className="language-python" language="python">
            {AdaptiveCrawlerRunSource}
        </CodeBlock>
    </TabItem>
</Tabs>

## Conclusion

This guide provided a comprehensive overview of HTTP crawlers in Crawlee. You learned about the three main crawler types - <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> for fault-tolerant HTML parsing, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> for high-performance extraction with XPath and CSS selectors, and <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> for raw response processing. You also discovered how to integrate third-party parsing libraries with <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> and how to create fully custom crawlers using <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> for specialized parsing requirements.

If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!


================================================
FILE: docs/guides/playwright_crawler.mdx
================================================
---
id: playwright-crawler
title: Playwright crawler
description: Learn how to use PlaywrightCrawler for browser-based web scraping.
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py';
import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py';
import NavigationHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/navigation_hooks_example.py';
import BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py';
import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py';

A <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is a browser-based crawler. In contrast to HTTP-based crawlers like <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, it uses a real browser to render pages and extract data. It is built on top of the [Playwright](https://playwright.dev/python/) browser automation library. While browser-based crawlers are typically slower and less efficient than HTTP-based crawlers, they can handle dynamic, client-side rendered sites that standard HTTP-based crawlers cannot manage.

## When to use Playwright crawler

Use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> in scenarios that require full browser capabilities, such as:

- **Dynamic content rendering**: Required when pages rely on heavy JavaScript to load or modify content in the browser.
- **Anti-scraping protection**: Helpful for sites using JavaScript-based security or advanced anti-automation measures.
- **Complex cookie management**: Necessary for sites with session or cookie requirements that standard HTTP-based crawlers cannot handle easily.

If [HTTP-based crawlers](https://crawlee.dev/python/docs/guides/http-crawlers) are insufficient, <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> can address these challenges. See a [basic example](../examples/playwright-crawler) for a typical usage demonstration.

## Advanced configuration

The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> uses other Crawlee components under the hood, notably <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. These components let you configure the browser and context settings, launch multiple browsers, and apply pre-navigation hooks. You can create your own instances of these components and pass them to the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> constructor.

- The <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink> manages how browsers are launched and how browser contexts are created. It accepts [browser launch](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch) and [new context](https://playwright.dev/python/docs/api/class-browser#browser-new-context) options.
- The <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> manages the lifecycle of browser instances (launching, recycling, etc.). You can customize its behavior to suit your needs.

## Managing multiple browsers

The <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> allows you to manage multiple browsers. Each browser instance is managed by a separate <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink> and can be configured independently. This is useful for scenarios like testing multiple configurations or implementing browser rotation to help avoid blocks or detect different site behaviors.

<RunnableCodeBlock className="language-python" language="python">
    {MultipleLaunchExample}
</RunnableCodeBlock>

## Browser launch and context configuration

The <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink> provides access to all relevant Playwright configuration options for both [browser launches](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch) and [new browser contexts](https://playwright.dev/python/docs/api/class-browser#browser-new-context). You can specify these options in the constructor of <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink> or <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>:

<RunnableCodeBlock className="language-python" language="python">
    {BrowserConfigurationExample}
</RunnableCodeBlock>

You can also configure each plugin used by <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink>:

<CodeBlock className="language-python">
    {PluginBrowserConfigExample}
</CodeBlock>

For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.

## Page configuration with lifecycle page hooks

For additional setup or event-driven actions around page creation and closure, the <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes four lifecycle hooks: <ApiLink to="class/BrowserPool#pre_page_create_hook">`pre_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#post_page_create_hook">`post_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#pre_page_close_hook">`pre_page_close_hook`</ApiLink>, and <ApiLink to="class/BrowserPool#post_page_close_hook">`post_page_close_hook`</ApiLink>. To use them, create a `BrowserPool` instance and pass it to <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> via the `browser_pool` argument.

<RunnableCodeBlock className="language-python" language="python">
    {BrowserPoolPageHooksExample}
</RunnableCodeBlock>

## Navigation hooks

Navigation hooks allow for additional configuration at specific points during page navigation. The <ApiLink to="class/PlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> is called before each navigation and provides <ApiLink to="class/PlaywrightPreNavCrawlingContext">`PlaywrightPreNavCrawlingContext`</ApiLink> - including the [page](https://playwright.dev/python/docs/api/class-page) instance and a <ApiLink to="class/PlaywrightPreNavCrawlingContext#block_requests">`block_requests`</ApiLink> helper for filtering unwanted resource types and URL patterns. See the [block requests example](https://crawlee.dev/python/docs/examples/playwright-crawler-with-block-requests) for a dedicated walkthrough. Similarly, the <ApiLink to="class/PlaywrightCrawler#post_navigation_hook">`post_navigation_hook`</ApiLink> is called after each navigation and provides <ApiLink to="class/PlaywrightPostNavCrawlingContext">`PlaywrightPostNavCrawlingContext`</ApiLink> - useful for post-load checks such as detecting CAPTCHAs or verifying page state.

<RunnableCodeBlock className="language-python" language="python">
    {NavigationHooksExample}
</RunnableCodeBlock>

## Conclusion

This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, use <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> lifecycle page hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!


================================================
FILE: docs/guides/playwright_crawler_adaptive.mdx
================================================
---
id: adaptive-playwright-crawler
title: Adaptive Playwright crawler
description: Learn how to use the Adaptive Playwright crawler to automatically switch between browser-based and HTTP-only crawling.
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import AdaptivePlaywrightCrawlerHandler from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_adaptive/handler.py';
import AdaptivePlaywrightCrawlerPreNavHooks from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_adaptive/pre_nav_hooks.py';

import AdaptivePlaywrightCrawlerInitBeautifulSoup from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_beautifulsoup.py';
import AdaptivePlaywrightCrawlerInitParsel from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_parsel.py';
import AdaptivePlaywrightCrawlerInitPrediction from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_prediction.py';

An <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> is a combination of <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and some implementation of HTTP-based crawler such as <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>.
It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit.

Detection is done based on the <ApiLink to="class/RenderingTypePredictor">`RenderingTypePredictor`</ApiLink> with default implementation <ApiLink to="class/DefaultRenderingTypePredictor">`DefaultRenderingTypePredictor`</ApiLink>. It predicts which crawling method should be used and learns from already crawled pages.

## When to use AdaptivePlaywrightCrawler

Use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> in scenarios where some target pages have to be crawled with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, but for others a faster HTTP-based crawler is sufficient. This way, you can achieve lower costs when crawling multiple different websites.

Another use case is performing selector-based data extraction without prior knowledge of whether the selector exists in the static page or is dynamically added by code executed in the browser.

## Request handler and adaptive context helpers

The request handler for <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> works with a special context type - <ApiLink to="class/AdaptivePlaywrightCrawlingContext">`AdaptivePlaywrightCrawlingContext`</ApiLink>. This context is sometimes created by the HTTP-based sub crawler and sometimes by the Playwright-based sub crawler. Due to its dynamic nature, you can't always access the [page](https://playwright.dev/python/docs/api/class-page) object. To overcome this limitation, there are four helper methods on this context that can be called regardless of how the context was created.

<ApiLink to="class/AdaptivePlaywrightCrawlingContext#wait_for_selector">`wait_for_selector`</ApiLink> accepts a `css` selector as the first argument and a timeout as the second argument. The function will try to locate this selector and return once it is found (within the timeout). In practice this means that if the HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the Playwright sub crawler and will try to locate the selector within the timeout using Playwright.

<ApiLink to="class/AdaptivePlaywrightCrawlingContext#query_selector_one">`query_selector_one`</ApiLink> accepts `css` selector as first argument and timeout as second argument. This function acts similar to `wait_for_selector`, but it also returns one selector if any selector is found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> and `Tag` for <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>.

<ApiLink to="class/AdaptivePlaywrightCrawlingContext#query_selector_all">`query_selector_all`</ApiLink> works the same as <ApiLink to="class/AdaptivePlaywrightCrawlingContext#query_selector_one">`query_selector_one`</ApiLink>, but returns all found selectors.

<ApiLink to="class/AdaptivePlaywrightCrawlingContext#parse_with_static_parser">`parse_with_static_parser`</ApiLink> will re-parse the whole page. Return value type is determined by the HTTP-based sub crawler used. It has optional arguments: `selector` and `timeout`. If those optional arguments are used, then the function first calls <ApiLink to="class/AdaptivePlaywrightCrawlingContext#wait_for_selector">`wait_for_selector`</ApiLink> and then does the parsing. This can be used in scenarios where some specific element can signal that the page is already complete.

See the following example about how to create request handler and use context helpers:

<RunnableCodeBlock className="language-python" language="python">
    {AdaptivePlaywrightCrawlerHandler}
</RunnableCodeBlock>

## Crawler configuration

To use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> it is recommended to use one of the prepared factory methods that will create the crawler with specific HTTP-based sub crawler variant: <ApiLink to="class/AdaptivePlaywrightCrawler#with_beautifulsoup_static_parser">`AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser`</ApiLink> or <ApiLink to="class/AdaptivePlaywrightCrawler#with_parsel_static_parser">`AdaptivePlaywrightCrawler.with_parsel_static_parser`</ApiLink>.

<ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> is internally composed of two sub crawlers and you can do a detailed configuration of both of them. For detailed configuration options of the sub crawlers, please refer to their pages: <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>, <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>.

In the following example you can see how to create and configure <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> with two different HTTP-based sub crawlers:

<Tabs>
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler" default>
        <CodeBlock className="language-python">
            {AdaptivePlaywrightCrawlerInitBeautifulSoup}
        </CodeBlock>
    </TabItem>
    <TabItem value="ParselCrawler" label="ParselCrawler">
        <CodeBlock className="language-python">
            {AdaptivePlaywrightCrawlerInitParsel}
        </CodeBlock>
    </TabItem>
</Tabs>

### Prediction related arguments

To control which pages are crawled by which method, you can use the following arguments:

<ApiLink to="class/RenderingTypePredictor">`RenderingTypePredictor`</ApiLink> - Class that can give recommendations about which sub crawler should be used for specific url. Predictor will also recommend to use both sub crawlers for some page from time to time, to check that the given recommendation was correct. Predictor should be able to learn from previous results and gradually give more reliable recommendations.

`result_checker` - Is a function that checks result created from crawling a page. By default, it always returns `True`.

`result_comparator` - Is a function that compares two results (HTTP-based sub crawler result and playwright based sub crawler result) and returns `True` if they are considered the same. By default, this function compares calls of context helper `push_data` by each sub crawler. This function is used by `rendering_type_predictor` to evaluate whether HTTP-based crawler has the same results as playwright based sub crawler.

See the following example about how to pass prediction related arguments:

<CodeBlock className="language-python">
    {AdaptivePlaywrightCrawlerInitPrediction}
</CodeBlock>

## Page configuration with pre-navigation hooks

In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the <ApiLink to="class/AdaptivePlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> method of the <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink>. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> it is possible that the hook will be executed for HTTP-based sub crawler or playwright-based sub crawler. Using [page](https://playwright.dev/python/docs/api/class-page) object for hook that will be executed on HTTP-based sub crawler will raise an exception. To overcome this you can use optional argument `playwright_only` = `True` when registering the hook.

See the following example about how to register the pre navigation hooks:

<RunnableCodeBlock className="language-python" language="python">
    {AdaptivePlaywrightCrawlerPreNavHooks}
</RunnableCodeBlock>


================================================
FILE: docs/guides/playwright_crawler_stagehand.mdx
================================================
---
id: playwright-crawler-stagehand
title: Playwright with Stagehand
description: How to integrate Stagehand AI-powered automation with PlaywrightCrawler.
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import SupportClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/support_classes.py';
import BrowserClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/browser_classes.py';
import StagehandRun from '!!raw-loader!./code_examples/playwright_crawler_stagehand/stagehand_run.py';

[Stagehand](https://docs.stagehand.dev/) is a framework that combines [Playwright](https://playwright.dev/python/) with AI-driven natural language understanding and decision-making capabilities. With Stagehand, you can use natural language instructions to interact with web pages instead of writing complex selectors and automation logic.

Stagehand supports multiple AI models through [`LiteLLM`](https://docs.litellm.ai/docs/). This guide demonstrates how to integrate Stagehand with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> using [Gemini](https://ai.google.dev/gemini-api/docs) as the AI model provider.

:::info

This guide is based on stagehand-python v0.4.0 with local configuration settings and may not be compatible with newer versions.

:::

## Get Gemini API key

You need to register with [Google AI Studio](https://aistudio.google.com/) and navigate to [Get API key](https://aistudio.google.com/app/apikey) to obtain your API key.

## Create support classes for Stagehand

To integrate Stagehand with Crawlee, you need to create wrapper classes that allow <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink> to manage the Playwright lifecycle.

Create `CrawleeStagehand` - a custom Stagehand subclass that overrides the `init` method to prevent Stagehand from launching its own Playwright instance.

Create `CrawleeStagehandPage` - a wrapper class for `StagehandPage` that implements the [Playwright Page](https://playwright.dev/python/docs/next/api/class-page) behavior expected by <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.

<CodeBlock className="language-python" title="support_classes.py">
    {SupportClasses}
</CodeBlock>

## Create browser integration classes

You need to create a custom browser plugin and controller that properly initialize Stagehand and obtain browser pages from `StagehandContext`.

Create `StagehandPlugin` - a subclass of <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink> that holds the Stagehand instance and creates `PlaywrightPersistentBrowser` instances.

Create `StagehandBrowserController` - a subclass of <ApiLink to="class/PlaywrightBrowserController">`PlaywrightBrowserController`</ApiLink> that lazily initializes `StagehandContext` and creates new pages with AI capabilities on demand.

<CodeBlock className="language-python" title="browser_classes.py">
    {BrowserClasses}
</CodeBlock>

## Create a crawler

Now you can create a <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> that uses Stagehand's AI capabilities to interact with web pages using natural language commands:

<CodeBlock className="language-python" title="stagehand_run.py">
    {StagehandRun}
</CodeBlock>

The integration works through several key components:
- `CrawleeStagehand` prevents Stagehand from launching its own Playwright instance, allowing Crawlee to manage the browser lifecycle
- `StagehandPlugin` extends the Playwright browser plugin to create Stagehand-enabled browser instances
- `StagehandBrowserController` uses `StagehandContext` to create pages with AI capabilities
- `CrawleeStagehandPage` provides interface compatibility between Stagehand pages and Crawlee's expectations

In the request handler, you can use natural language commands like `page.extract('Extract title page')` to perform intelligent data extraction without writing complex selectors.


================================================
FILE: docs/guides/proxy_management.mdx
================================================
---
id: proxy-management
title: Proxy management
description: Using proxies to get around those annoying IP-blocks
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CodeBlock from '@theme/CodeBlock';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import QuickStartExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/quick_start_example.py';
import IntegrationBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/integration_bs_example.py';
import IntegrationPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/integration_pw_example.py';
import TiersBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/tiers_bs_example.py';
import TiersPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/tiers_pw_example.py';
import InspectionBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/inspecting_bs_example.py';
import InspectionPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/inspecting_pw_example.py';

import SessionBsExample from '!!raw-loader!./code_examples/proxy_management/session_bs_example.py';
import SessionPwExample from '!!raw-loader!./code_examples/proxy_management/session_pw_example.py';

[IP address blocking](https://en.wikipedia.org/wiki/IP_address_blocking) is one of the oldest and most effective ways of preventing access to a website. It is therefore paramount for a good web scraping library to provide easy to use but powerful tools which can work around IP blocking. The most powerful weapon in our anti IP blocking arsenal is a [proxy server](https://en.wikipedia.org/wiki/Proxy_server).

With Crawlee we can use our own proxy servers or proxy servers acquired from third-party providers.

[//]: # (Check out the [avoid blocking guide]&#40;./avoid-blocking&#41; for more information about blocking.)

## Quick start

If you already have proxy URLs of your own, you can start using them immediately in only a few lines of code.

<RunnableCodeBlock className="language-python" language="python">
    {QuickStartExample}
</RunnableCodeBlock>

Examples of how to use our proxy URLs with crawlers are shown below in the [Crawler integration](#crawler-integration) section.

## Proxy configuration

All our proxy needs are managed by the <ApiLink to="class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> class. We create an instance using the <ApiLink to="class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> constructor function based on the provided options.

### Crawler integration

`ProxyConfiguration` integrates seamlessly into <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> and <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.

<Tabs>
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {IntegrationBsExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {IntegrationPwExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

Our crawlers will now use the selected proxies for all connections.

### IP Rotation and session management

The <ApiLink to="class/ProxyConfiguration#new_url">`proxy_configuration.new_url()`</ApiLink> method allows us to pass a `session_id` parameter. This creates a `session_id`-`proxy_url` pair, ensuring that subsequent `new_url()` calls with the same `session_id` return the same `proxy_url`. This is extremely useful in scraping, because we want to create the impression of a real user. See the <ApiLink to="class/SessionPool">`SessionPool`</ApiLink> class for more information on how maintaining a real session helps avoid blocking.

For more details on session management, check out the [Session management](./session-management) guide.

When no `session_id` is provided, our proxy URLs are rotated round-robin.

<Tabs>
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
        <CodeBlock className="language-python">
            {SessionBsExample}
        </CodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <CodeBlock className="language-python">
            {SessionPwExample}
        </CodeBlock>
    </TabItem>
</Tabs>

### Tiered proxies

When you use HTTP proxies in real world crawling scenarios, you have to decide which type of proxy to use to reach the sweet spot between cost efficiency and reliably avoiding blocking. Some websites may allow crawling with no proxy, on some you may get away with using datacenter proxies, which are cheap but easily detected, and sometimes you need to use expensive residential proxies.

To take the guesswork out of this process, Crawlee allows you to configure multiple tiers of proxy URLs. When crawling, it will automatically pick the lowest tier (smallest index) where it doesn't encounter blocking. If you organize your proxy server URLs in tiers so that the lowest tier contains the cheapest, least reliable ones and each higher tier contains more expensive, more reliable ones, you will get an optimal anti-blocking performance.

In an active tier, Crawlee will alternate between proxies in a round-robin fashion, just like it would with `proxy_urls`.

<Tabs>
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {TiersBsExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {TiersPwExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

## Inspecting current proxy in crawlers

The <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> and <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> provide access to information about the currently used proxy via the request handler using a <ApiLink to="class/ProxyInfo">`proxy_info`</ApiLink> object. This object allows easy access to the proxy URL.

<Tabs>
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {InspectionBsExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {InspectionPwExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>


================================================
FILE: docs/guides/request_loaders.mdx
================================================
---
id: request-loaders
title: Request loaders
description: How to manage the requests your crawler will go through.
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py';
import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_basic_example.py';
import RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example.py';
import RlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py';
import SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py';
import SitemapExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example_explicit.py';
import RlBasicPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example_with_persist.py';
import SitemapPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example_with_persist.py';

The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package extends the functionality of the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, providing additional tools for managing URLs and requests. If you are new to Crawlee and unfamiliar with the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, consider starting with the [Storages](https://crawlee.dev/python/docs/guides/storages) guide first. Request loaders define how requests are fetched and stored, enabling various use cases such as reading URLs from files, external APIs, or combining multiple sources together.

## Overview

The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package introduces the following abstract classes:

- <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink>: The base interface for reading requests in a crawl.
- <ApiLink to="class/RequestManager">`RequestManager`</ApiLink>: Extends `RequestLoader` with write capabilities.
- <ApiLink to="class/RequestManagerTandem">`RequestManagerTandem`</ApiLink>: Combines a read-only `RequestLoader` with a writable `RequestManager`.

And specific request loader implementations:

- <ApiLink to="class/RequestList">`RequestList`</ApiLink>: A lightweight implementation for managing a static list of URLs.
- <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink>: A specialized loader that reads URLs from XML and plain-text sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html) with filtering capabilities.

Below is a class diagram that illustrates the relationships between these components and the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>:

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Abstract classes
%% ========================

class Storage {
    <<abstract>>
    + id
    + name
    + open()
    + drop()
}

class RequestLoader {
    <<abstract>>
    + handled_count
    + total_count
    + fetch_next_request()
    + mark_request_as_handled()
    + is_empty()
    + is_finished()
    + to_tandem()
}

class RequestManager {
    <<abstract>>
    + add_request()
    + add_requests_batched()
    + reclaim_request()
    + drop()
}

%% ========================
%% Specific classes
%% ========================

class RequestQueue

class RequestList

class SitemapRequestLoader

class RequestManagerTandem

%% ========================
%% Inheritance arrows
%% ========================

Storage <|-- RequestQueue
RequestManager <|-- RequestQueue

RequestLoader <|-- RequestManager
RequestLoader <|-- RequestList
RequestLoader <|-- SitemapRequestLoader
RequestManager <|-- RequestManagerTandem
```

## Request loaders

The <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, and checking the status of requests. Concrete implementations, such as <ApiLink to="class/RequestList">`RequestList`</ApiLink>, build on this interface to handle specific scenarios. You can create your own custom loader that reads from an external file, web endpoint, database, or any other specific data source. For more details, refer to the <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> API reference.

:::info NOTE
To learn how to use request loaders in your crawlers, see the [Request manager tandem](#request-manager-tandem) section below.
:::

### Request list

The <ApiLink to="class/RequestList">`RequestList`</ApiLink> can accept an asynchronous generator as input, allowing requests to be streamed rather than loading them all into memory at once. This can significantly reduce memory usage, especially when working with large sets of URLs.

Here is a basic example of working with the <ApiLink to="class/RequestList">`RequestList`</ApiLink>:

<RunnableCodeBlock className="language-python" language="python">
    {RlBasicExample}
</RunnableCodeBlock>

### Request list with persistence

The <ApiLink to="class/RequestList">`RequestList`</ApiLink> supports state persistence, allowing it to resume from where it left off after interruption. This is particularly useful for long-running crawls or when you need to pause and resume crawling later.

To enable persistence, provide `persist_state_key` and optionally `persist_requests_key` parameters, and disable automatic cleanup by setting `purge_on_start = False` in the configuration. The `persist_state_key` saves the loader's progress, while `persist_requests_key` ensures that the request data doesn't change between runs. For more details on resuming interrupted crawls, see the [Resuming a paused crawl](../examples/resuming-paused-crawl) example.

<RunnableCodeBlock className="language-python" language="python">
    {RlBasicPersistExample}
</RunnableCodeBlock>

### Sitemap request loader

The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> is a specialized request loader that reads URLs from sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats. It's particularly useful when you want to crawl a website systematically by following its sitemap structure.

:::note
The `SitemapRequestLoader` is designed specifically for sitemaps that follow the standard Sitemaps protocol. HTML pages containing links are not supported by this loader - those should be handled by regular crawlers using the `enqueue_links` functionality.
:::

The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory.

<RunnableCodeBlock className="language-python" language="python">
    {SitemapExample}
</RunnableCodeBlock>

### Sitemap request loader with persistence

Similarly, the <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> supports state persistence to resume processing from where it left off. This is especially valuable when processing large sitemaps that may take considerable time to complete.

<RunnableCodeBlock className="language-python" language="python">
    {SitemapPersistExample}
</RunnableCodeBlock>

When using persistence with `SitemapRequestLoader`, make sure to use the context manager (`async with`) to properly save the state when the work is completed.

## Request managers

The <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add and reclaim them. This is essential for dynamic crawling projects where new URLs may emerge during the crawl process, or when certain requests fail and need to be retried. For more details, refer to the <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> API reference.

## Request manager tandem

The <ApiLink to="class/RequestManagerTandem">`RequestManagerTandem`</ApiLink> class allows you to combine the read-only capabilities of a `RequestLoader` (like <ApiLink to="class/RequestList">`RequestList`</ApiLink>) with the read-write capabilities of a `RequestManager` (like <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>). This is useful for scenarios where you need to load initial requests from a static source (such as a file or database) and dynamically add or retry requests during the crawl. Additionally, it provides deduplication capabilities, ensuring that requests are not processed multiple times.

Under the hood, <ApiLink to="class/RequestManagerTandem">`RequestManagerTandem`</ApiLink> checks whether the read-only loader still has pending requests. If so, each new request from the loader is transferred to the manager. Any newly added or reclaimed requests go directly to the manager side.

### Request list with request queue

This section describes the combination of the <ApiLink to="class/RequestList">`RequestList`</ApiLink> and <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> classes. This setup is particularly useful when you have a static list of URLs that you want to crawl, but also need to handle dynamic requests discovered during the crawl process. The <ApiLink to="class/RequestManagerTandem">`RequestManagerTandem`</ApiLink> class facilitates this combination, with the <ApiLink to="class/RequestLoader#to_tandem">`RequestLoader.to_tandem`</ApiLink> method available as a convenient shortcut. Requests from the <ApiLink to="class/RequestList">`RequestList`</ApiLink> are processed first by being enqueued into the default <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, which handles persistence and retries for failed requests.

<Tabs groupId="request_manager_tandem">
    <TabItem value="request_manager_tandem_explicit" label="Explicit usage">
        <RunnableCodeBlock className="language-python" language="python">
            {RlExplicitTandemExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="request_manager_tandem_helper" label="Using to_tandem helper" default>
        <RunnableCodeBlock className="language-python" language="python">
            {RlTandemExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

### Sitemap request loader with request queue

Similar to the <ApiLink to="class/RequestList">`RequestList`</ApiLink> example above, you can combine a <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> with a <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> using the <ApiLink to="class/RequestManagerTandem">`RequestManagerTandem`</ApiLink> class. This setup is particularly useful when you want to crawl URLs from a sitemap while also handling dynamic requests discovered during the crawl process. URLs from the sitemap are processed first by being enqueued into the default <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, which handles persistence and retries for failed requests.

<Tabs groupId="sitemap_request_manager_tandem">
    <TabItem value="sitemap_request_manager_tandem_explicit" label="Explicit usage">
        <RunnableCodeBlock className="language-python" language="python">
            {SitemapExplicitTandemExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="sitemap_request_manager_tandem_helper" label="Using to_tandem helper" default>
        <RunnableCodeBlock className="language-python" language="python">
            {SitemapTandemExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

## Conclusion

This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs and requests. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` and `SitemapRequestLoader` implementations. You also saw practical examples of how to work with these classes to handle various crawling scenarios.

If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!


================================================
FILE: docs/guides/request_router.mdx
================================================
---
id: request-router
title: Request router
description: Learn how to use the Router class to organize request handlers, error handlers, and pre-navigation hooks in Crawlee.
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import BasicRequestHandlers from '!!raw-loader!roa-loader!./code_examples/request_router/basic_request_handlers.py';
import SimpleDefaultHandler from '!!raw-loader!roa-loader!./code_examples/request_router/simple_default_handler.py';
import CustomRouterDefaultOnly from '!!raw-loader!roa-loader!./code_examples/request_router/custom_router_default_only.py';
import HttpPreNavigation from '!!raw-loader!roa-loader!./code_examples/request_router/http_pre_navigation.py';
import ErrorHandler from '!!raw-loader!roa-loader!./code_examples/request_router/error_handler.py';
import FailedRequestHandler from '!!raw-loader!roa-loader!./code_examples/request_router/failed_request_handler.py';
import PlaywrightPreNavigation from '!!raw-loader!roa-loader!./code_examples/request_router/playwright_pre_navigation.py';
import AdaptiveCrawlerHandlers from '!!raw-loader!roa-loader!./code_examples/request_router/adaptive_crawler_handlers.py';

The <ApiLink to="class/Router">`Router`</ApiLink> class manages request flow and coordinates the execution of user-defined logic in Crawlee projects. It routes incoming requests to appropriate user-defined handlers based on labels, manages error scenarios, and provides hooks for pre-navigation execution. The <ApiLink to="class/Router">`Router`</ApiLink> serves as the orchestrator for all crawling operations, ensuring that each request is processed by the correct handler according to its type and label.

## Request handlers

Request handlers are user-defined functions that process individual requests and their corresponding responses. Each handler receives a crawling context as its primary argument, which provides access to the current request, response data, and utility methods for data extraction, link enqueuing, and storage operations. Handlers determine how different types of pages are processed and how data is extracted and stored.

:::note

The code examples in this guide use <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> for demonstration, but the <ApiLink to="class/Router">`Router`</ApiLink> works with all crawler types.

:::

### Built-in router

Every crawler instance includes a built-in <ApiLink to="class/Router">`Router`</ApiLink> accessible through the `crawler.router` property. This approach simplifies initial setup and covers basic use cases where request routing requirements are straightforward.

<RunnableCodeBlock className="language-python" language="python">
    {SimpleDefaultHandler}
</RunnableCodeBlock>

The default handler processes all requests that either lack a label or have a label for which no specific handler has been registered.

### Custom router

Applications requiring explicit control over router configuration or router reuse across multiple crawler instances can create custom <ApiLink to="class/Router">`Router`</ApiLink> instances. Custom routers provide complete control over request routing configuration and enable modular application architecture. Router instances can be configured independently and attached to your crawler instances as needed.

You can also implement a custom request router class from scratch or by inheriting from <ApiLink to="class/Router">`Router`</ApiLink>. This allows you to define custom routing logic or manage request handlers in a different way.

<RunnableCodeBlock className="language-python" language="python">
    {CustomRouterDefaultOnly}
</RunnableCodeBlock>

### Advanced routing by labels

More complex crawling projects often require different processing logic for various page types. The router supports label-based routing, which allows registration of specialized handlers for specific content categories. This pattern enables clean separation of concerns and targeted processing logic for different URL patterns or content types.

<RunnableCodeBlock className="language-python" language="python">
    {BasicRequestHandlers}
</RunnableCodeBlock>

## Error handlers

Crawlee provides error handling mechanisms to manage request processing failures. It distinguishes between recoverable errors that may succeed on retry and permanent failures that require alternative handling strategies.

### Error handler

The error handler executes when exceptions occur during request processing, before any retry attempts. This handler receives the error context and can implement custom recovery logic, modify request parameters, or determine whether the request should be retried. Error handlers enable control over failure scenarios and allow applications to implement error recovery strategies.

<RunnableCodeBlock className="language-python" language="python">
    {ErrorHandler}
</RunnableCodeBlock>

### Failed request handler

The failed request handler executes when a request has exhausted all retry attempts and is considered permanently failed. This handler serves as the final opportunity to log failures, store failed requests for later analysis, create alternative requests, or implement fallback processing strategies.

<RunnableCodeBlock className="language-python" language="python">
    {FailedRequestHandler}
</RunnableCodeBlock>

## Pre-navigation hooks

Pre-navigation hooks execute before each request is processed, providing opportunities to configure request parameters, modify browser settings, or implement request-specific optimizations. You can use pre-navigation hooks, for example, for viewport configuration, resource blocking, timeout management, header customization, custom proxy rotation, and request interception.

### HTTP crawler

HTTP crawlers support pre-navigation hooks that execute before making HTTP requests. These hooks enable request modification, header configuration, and other HTTP-specific optimizations.

<RunnableCodeBlock className="language-python" language="python">
    {HttpPreNavigation}
</RunnableCodeBlock>

### Playwright crawler

Playwright crawlers provide extensive pre-navigation capabilities that allow browser page configuration before navigation. These hooks can modify browser behavior and configure page settings.

<RunnableCodeBlock className="language-python" language="python">
    {PlaywrightPreNavigation}
</RunnableCodeBlock>

### Adaptive Playwright crawler

The <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> implements a dual-hook system with common hooks that execute for all requests and Playwright-specific hooks that execute only when browser automation is required. This is perfect for projects that need both static and dynamic content handling.

<RunnableCodeBlock className="language-python" language="python">
    {AdaptiveCrawlerHandlers}
</RunnableCodeBlock>

## Conclusion

This guide introduced you to the <ApiLink to="class/Router">`Router`</ApiLink> class and how to organize your crawling logic. You learned how to use built-in and custom routers, implement request handlers with label-based routing, handle errors with error and failed request handlers, and configure pre-navigation hooks for different crawler types.

If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!


================================================
FILE: docs/guides/running_in_web_server.mdx
================================================
---
id: running-in-web-server
title: Running in web server
description: Running in web server
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import Crawler from '!!raw-loader!./code_examples/running_in_web_server/crawler.py';
import Server from '!!raw-loader!./code_examples/running_in_web_server/server.py';


Most of the time, Crawlee jobs are run as batch jobs. You have a list of URLs you want to scrape every week or you might want to scrape a whole website once per day. After the scrape, you send the data to your warehouse for analytics. Batch jobs are efficient because they can use Crawlee's built-in autoscaling to fully utilize the resources you have available. But sometimes you have a use case where you need to return scraped data as soon as possible. There might be a user waiting on the other end, so every millisecond counts. This is where running Crawlee in a web server comes in.

We will build a simple HTTP server that receives a page URL and returns the page title in the response.

## Set up a web server

There are many popular web server frameworks for Python, such as [Flask](https://flask.palletsprojects.com/en/stable/), [Django](https://www.djangoproject.com/), and [Pyramid](https://trypyramid.com/). In this guide, we will use [FastAPI](https://fastapi.tiangolo.com/) to keep things simple.

This will be our core server setup:

<CodeBlock className="language-python" title="server.py">
    {Server}
</CodeBlock>

The server has two endpoints.
- `/` - The index just gives a short description of the server with an example link to the second endpoint.
- `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL.

To run the example server, make sure that you have installed [fastapi[standard]](https://fastapi.tiangolo.com/#installation). Then, from the directory where the example code is located, run the following command:
```
fastapi dev server.py
```

## Create a crawler

We will create a standard <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> and use the `keep_alive=True` option to keep the crawler running even if there are no requests currently in the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. This way it will always be waiting for new requests to come in.

<CodeBlock className="language-python" title="crawler.py">
    {Crawler}
</CodeBlock>

The crawler is defined inside of the [Lifespan](https://fastapi.tiangolo.com/advanced/events/#lifespan), which is the FastAPI way to run startup and teardown code for the app. There are two objects that we want to save to the app state so that they can be accessed in any endpoint through `request.state`:
- `crawler` holds an instance of our crawler and allows the app to interact with it.
- `requests_to_results` is a dictionary that is used to temporarily register expected results for each request and populate them when they are made available by the crawler.


================================================
FILE: docs/guides/scaling_crawlers.mdx
================================================
---
id: scaling-crawlers
title: Scaling crawlers
description: Learn how to scale your crawlers by controlling concurrency and limiting requests per minute.
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import MaxTasksPerMinuteExample from '!!raw-loader!roa-loader!./code_examples/scaling_crawlers/max_tasks_per_minute_example.py';
import MinAndMaxConcurrencyExample from '!!raw-loader!roa-loader!./code_examples/scaling_crawlers/min_and_max_concurrency_example.py';

As we build our crawler, we may want to control how many tasks it performs at any given time. In other words, how many requests it makes to the web we are trying to scrape. Crawlee offers several options to fine-tune the number of parallel tasks, limit the number of requests per minute, and optimize scaling based on available system resources.

:::tip

All of these options are available across all crawlers provided by Crawlee. In this guide, we are using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> as an example. You should also explore the <ApiLink to="class/ConcurrencySettings">`ConcurrencySettings`</ApiLink>.

:::

## Max tasks per minute

The `max_tasks_per_minute` setting in <ApiLink to="class/ConcurrencySettings">`ConcurrencySettings`</ApiLink> controls how many total tasks the crawler can process per minute. It ensures that tasks are spread evenly throughout the minute, preventing a sudden burst at the `max_concurrency` limit followed by idle time. By default, this is set to infinity (`float('inf')`), meaning the crawler can run at full speed, limited only by `max_concurrency`. Use this option if you want to throttle your crawler to avoid overwhelming the target website with continuous requests.

<RunnableCodeBlock className="language-python" language="python">
    {MaxTasksPerMinuteExample}
</RunnableCodeBlock>

## Minimum and maximum concurrency

The `min_concurrency` and `max_concurrency` options in the <ApiLink to="class/ConcurrencySettings">`ConcurrencySettings`</ApiLink> define the minimum and maximum number of parallel tasks that can run at any given time. By default, crawlers start with a single parallel task and gradually scale up to the maximum number of concurrent tasks allowed by `max_concurrency`.

:::caution Avoid setting minimum concurrency too high

If you set `min_concurrency` too high compared to the available system resources, the crawler may run very slowly or even crash. It is recommended to stick with the default value and let the crawler automatically adjust concurrency based on the system's available resources.

:::

## Desired concurrency

The `desired_concurrency` option in the <ApiLink to="class/ConcurrencySettings">`ConcurrencySettings`</ApiLink> specifies the initial number of parallel tasks to start with, assuming sufficient resources are available. It defaults to the same value as `min_concurrency`.

<RunnableCodeBlock className="language-python" language="python">
    {MinAndMaxConcurrencyExample}
</RunnableCodeBlock>

## Autoscaled pool

The <ApiLink to="class/AutoscaledPool">`AutoscaledPool`</ApiLink> manages a pool of asynchronous, resource-intensive tasks that run in parallel. It automatically starts new tasks only when there is enough free CPU and memory. To monitor system resources, it leverages the <ApiLink to="class/Snapshotter">`Snapshotter`</ApiLink> and <ApiLink to="class/SystemStatus">`SystemStatus`</ApiLink> classes. If any task raises an exception, the error is propagated, and the pool is stopped. Every crawler uses an <ApiLink to="class/AutoscaledPool">`AutoscaledPool`</ApiLink> under the hood.


================================================
FILE: docs/guides/service_locator.mdx
================================================
---
id: service-locator
title: Service locator
description: Crawlee's service locator is a central registry for global services, managing and providing access to them throughout the whole framework.
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import ServiceLocatorConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_configuration.py';
import ServiceLocatorStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_storage_client.py';
import ServiceLocatorEventManager from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_event_manager.py';

import ServiceCrawlerConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_configuration.py';
import ServiceCrawlerStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_storage_client.py';
import ServiceCrawlerEventManager from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_event_manager.py';

import ServiceStorageConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_storage_configuration.py';
import ServiceStorageStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_storage_storage_client.py';

import ServiceConflicts from '!!raw-loader!roa-loader!./code_examples/service_locator/service_conflicts.py';

The <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> is a central registry for global services. It manages and provides access to these services throughout the framework, ensuring their consistent configuration across all components.

The service locator manages three core services: <ApiLink to="class/Configuration">`Configuration`</ApiLink>, <ApiLink to="class/EventManager">`EventManager`</ApiLink>, and <ApiLink to="class/StorageClient">`StorageClient`</ApiLink>. All services are initialized lazily with defaults when first accessed.

## Services

There are three core services that are managed by the service locator:

### Configuration

<ApiLink to="class/Configuration">`Configuration`</ApiLink> is a class that provides access to application-wide settings and parameters. It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistence intervals, and various other settings. The configuration can be set directly in the code or via environment variables.

### StorageClient

<ApiLink to="class/StorageClient">`StorageClient`</ApiLink> is the backend implementation for storages in Crawlee. It provides a unified interface for <ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, and <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, regardless of the underlying storage implementation. Storage clients were already explained in the storage clients section.

Refer to the [Storage clients guide](./storage-clients) for more information about storage clients and how to use them.

### EventManager

<ApiLink to="class/EventManager">`EventManager`</ApiLink> is responsible for coordinating internal events in Crawlee. It allows you to register event listeners and emit events throughout the framework. Examples of such events are aborting, migrating, system info, or browser-specific events like page created, page closed, and more. It provides a way to listen to events and execute custom logic when certain events occur.

## Service registration

There are several ways to register services in Crawlee, depending on your use case and preferences.

### Via service locator

Services can be registered globally through the <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> before they are first accessed. There is a singleton `service_locator` instance that is used throughout the framework, making the services available to all components throughout the whole framework.

<Tabs>

<TabItem value="storage-client" label="Storage client">
    <RunnableCodeBlock className="language-python" language="python">
        {ServiceLocatorStorageClient}
    </RunnableCodeBlock>
</TabItem>

<TabItem value="configuration" label="Configuration">
    <RunnableCodeBlock className="language-python" language="python">
        {ServiceLocatorConfiguration}
    </RunnableCodeBlock>
</TabItem>

<TabItem value="event-manager" label="Event manager">
    <RunnableCodeBlock className="language-python" language="python">
        {ServiceLocatorEventManager}
    </RunnableCodeBlock>
</TabItem>

</Tabs>

### Via crawler constructors

Alternatively, services can be passed to the crawler constructors. They will be registered globally to the <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> under the hood, making them available to all components and ensuring consistent configuration.

<Tabs>

<TabItem value="storage-client" label="Storage client">
    <RunnableCodeBlock className="language-python" language="python">
        {ServiceCrawlerStorageClient}
    </RunnableCodeBlock>
</TabItem>

<TabItem value="configuration" label="Configuration">
    <RunnableCodeBlock className="language-python" language="python">
        {ServiceCrawlerConfiguration}
    </RunnableCodeBlock>
</TabItem>

<TabItem value="event-manager" label="Event manager">
    <RunnableCodeBlock className="language-python" language="python">
        {ServiceCrawlerEventManager}
    </RunnableCodeBlock>
</TabItem>

</Tabs>

### Via storage constructors

Alternatively, services can be provided when opening specific storage instances, which uses them only for that particular instance without affecting global configuration.

<Tabs>

<TabItem value="storage-client" label="Storage client">
    <RunnableCodeBlock className="language-python" language="python">
        {ServiceStorageStorageClient}
    </RunnableCodeBlock>
</TabItem>

<TabItem value="configuration" label="Configuration">
    <RunnableCodeBlock className="language-python" language="python">
        {ServiceStorageConfiguration}
    </RunnableCodeBlock>
</TabItem>

</Tabs>

## Conflict prevention

Once a service has been retrieved from the service locator, attempting to set a different instance will raise a <ApiLink to="class/ServiceConflictError">`ServiceConflictError`</ApiLink> to prevent accidental configuration conflicts.

<RunnableCodeBlock className="language-python" language="python">
    {ServiceConflicts}
</RunnableCodeBlock>

## Conclusion

The <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> is a tool for managing global services in Crawlee. It provides a consistent way to configure and access services throughout the framework, ensuring that all components have access to the same configuration and services.

If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!


================================================
FILE: docs/guides/session_management.mdx
================================================
---
id: session-management
title: Session management
description: How to manage your cookies, proxy IP rotations and more.
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import BasicSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_basic.py';
import HttpSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_http.py';
import BeautifulSoupSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_beautifulsoup.py';
import ParselSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_parsel.py';
import PlaywrightSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_playwright.py';
import StandaloneSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_standalone.py';
import OneSession from '!!raw-loader!roa-loader!./code_examples/session_management/one_session_http.py';
import MultiSessions from '!!raw-loader!roa-loader!./code_examples/session_management/multi_sessions_http.py';

The <ApiLink to="class/SessionPool">`SessionPool`</ApiLink> class provides a robust way to manage the rotation of proxy IP addresses, cookies, and other custom settings in Crawlee. Its primary advantage is the ability to filter out blocked or non-functional proxies, ensuring that your scraper avoids retrying requests through known problematic proxies.

Additionally, it enables storing information tied to specific IP addresses, such as cookies, authentication tokens, and custom headers. This association reduces the probability of detection and blocking by ensuring cookies and other identifiers are used consistently with the same IP address.

Finally, it ensures even IP address rotation by randomly selecting sessions. This helps prevent overuse of a limited pool of available IPs, reducing the risk of IP bans and enhancing the efficiency of your scraper.

For more details on configuring proxies, refer to the [Proxy management](./proxy-management) guide.

Now, let's explore examples of how to use the <ApiLink to="class/SessionPool">`SessionPool`</ApiLink> in different scenarios:
- with <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink>;
- with <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>;
- with <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>;
- with <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>;
- with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>;
- without a crawler (standalone usage to manage sessions manually).

<Tabs groupId="session_pool">
    <TabItem value="basic" label="BasicCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {BasicSource}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="http" label="HttpCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {HttpSource}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="beautifulsoup" label="BeautifulSoupCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {BeautifulSoupSource}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="parsel" label="ParselCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {ParselSource}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="playwright" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {PlaywrightSource}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="standalone" label="Standalone">
        <RunnableCodeBlock className="language-python" language="python">
            {StandaloneSource}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

These examples demonstrate the basics of configuring and using the <ApiLink to="class/SessionPool">`SessionPool`</ApiLink>.

Please bear in mind that <ApiLink to="class/SessionPool">`SessionPool`</ApiLink> requires some time to establish a stable pool of working IPs. During the initial setup, you may encounter errors as the pool identifies and filters out blocked or non-functional IPs. This stabilization period is expected, and the pool's reliability will improve over time.

## Configuring a single session

In some cases, you need full control over session usage. For example, when working with websites requiring authentication or initialization of certain parameters like cookies.

When working with a site that requires authentication, we typically don't want multiple sessions with different browser fingerprints or client parameters accessing the site. In this case, we need to configure the <ApiLink to="class/SessionPool">`SessionPool`</ApiLink> appropriately:

<RunnableCodeBlock className="language-python" language="python">
    {OneSession}
</RunnableCodeBlock>

## Binding requests to specific sessions

In the previous example, there's one obvious limitation - you're restricted to only one session.

In some cases, we need to achieve the same behavior but using multiple sessions in parallel, such as authenticating with different profiles or using different proxies.

To do this, use the `session_id` parameter for the <ApiLink to="class/Request">`Request`</ApiLink> object to bind a request to a specific session:

<RunnableCodeBlock className="language-python" language="python">
    {MultiSessions}
</RunnableCodeBlock>


================================================
FILE: docs/guides/storage_clients.mdx
================================================
---
id: storage-clients
title: Storage clients
description: How to work with storage clients in Crawlee, including the built-in clients and how to create your own.
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import CodeBlock from '@theme/CodeBlock';

import MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/memory_storage_client_basic_example.py';
import FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py';
import FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py';
import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py';
import RegisteringStorageClientsExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_clients_example.py';
import SQLStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/sql_storage_client_basic_example.py';
import SQLStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/sql_storage_client_configuration_example.py';
import RedisStorageClientBasicExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_basic_example.py';
import RedisStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_configuration_example.py';

Storage clients provide a unified interface for interacting with <ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, and <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, regardless of the underlying implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. This abstraction makes it easy to switch between different environments, such as local development and cloud production setups.

## Built-in storage clients

The following storage client implementations are available:

- <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> - Provides persistent file system storage with in-memory caching.
- <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> - Stores data in memory with no persistence.
- <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> - Provides persistent storage using a SQL database ([SQLite](https://sqlite.org/), [PostgreSQL](https://www.postgresql.org/), [MySQL](https://www.mysql.com/) or [MariaDB](https://mariadb.org/)). Requires installing the extra dependency: `crawlee[sql_sqlite]` for SQLite, `crawlee[sql_postgres]` for PostgreSQL or `crawlee[sql_mysql]` for MySQL and MariaDB.
- <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> - Provides persistent storage using a [Redis](https://redis.io/) database v8.0+. Requires installing the extra dependency `crawlee[redis]`.
- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python).

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Abstract classes
%% ========================

class StorageClient {
    <<abstract>>
}

%% ========================
%% Specific classes
%% ========================

class FileSystemStorageClient

class MemoryStorageClient

class SqlStorageClient

class RedisStorageClient

class ApifyStorageClient

%% ========================
%% Inheritance arrows
%% ========================

StorageClient --|> FileSystemStorageClient
StorageClient --|> MemoryStorageClient
StorageClient --|> SqlStorageClient
StorageClient --|> RedisStorageClient
StorageClient --|> ApifyStorageClient
```

### File system storage client

The <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> provides persistent storage by writing data directly to the file system. It uses intelligent caching and batch processing for better performance while storing data in human-readable JSON format. This is the default storage client used by Crawlee when no other storage client is specified, making it ideal for large datasets and long-running operations where data persistence is required.

:::warning Concurrency limitation
The `FileSystemStorageClient` is not safe for concurrent access from multiple crawler processes. Use it only when running a single crawler process at a time.
:::

This storage client is ideal for large datasets and long-running operations where data persistence is required. Data can be easily inspected and shared with other tools.

<RunnableCodeBlock className="language-python" language="python">
    {FileSystemStorageClientBasicExample}
</RunnableCodeBlock>

Configuration options for the <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class:

- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory for all storage data.
- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start.

Data is stored using the following directory structure:

```text
{CRAWLEE_STORAGE_DIR}/
├── datasets/
│   └── {DATASET_NAME}/
│       ├── __metadata__.json
│       ├── 000000001.json
│       └── 000000002.json
├── key_value_stores/
│   └── {KVS_NAME}/
│       ├── __metadata__.json
│       ├── key1.json
│       ├── key2.txt
│       └── key3.json
└── request_queues/
    └── {RQ_NAME}/
        ├── __metadata__.json
        ├── {REQUEST_ID_1}.json
        └── {REQUEST_ID_2}.json
```

Where:
- `{CRAWLEE_STORAGE_DIR}` - The root directory for local storage.
- `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}` - The unique names for each storage instance (defaults to `"default"`).
- Files are stored directly without additional metadata files for a simpler structure.

Here is an example of how to configure the <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink>:

<RunnableCodeBlock className="language-python" language="python">
    {FileSystemStorageClientConfigurationExample}
</RunnableCodeBlock>

### Memory storage client

The <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> stores all data in memory using Python data structures. It provides fast access but does not persist data between runs, meaning all data is lost when the program terminates. This storage client is primarily suitable for testing and development, and is usually not a good fit for production use. However, in some cases where speed is prioritized over persistence, it can make sense.

:::warning Persistence limitation
The `MemoryStorageClient` does not persist data between runs. All data is lost when the program terminates.
:::

<RunnableCodeBlock className="language-python" language="python">
    {MemoryStorageClientBasicExample}
</RunnableCodeBlock>

### SQL storage client

:::warning Experimental feature
The `SqlStorageClient` is experimental. Its API and behavior may change in future releases.
:::

The <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> provides persistent storage using a SQL database (SQLite by default, or PostgreSQL, MySQL, MariaDB). It supports all Crawlee storage types and enables concurrent access from multiple independent clients or processes.

:::note Dependencies
The <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> is not included in the core Crawlee package.
To use it, you need to install Crawlee with the appropriate extra dependency:

- For SQLite support, run:
  <code>pip install 'crawlee[sql_sqlite]'</code>
- For PostgreSQL support, run:
  <code>pip install 'crawlee[sql_postgres]'</code>
- For MySQL or MariaDB support, run:
  <code>pip install 'crawlee[sql_mysql]'</code>
:::

By default, <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> uses SQLite.
To use a different database, just provide the appropriate connection string via the `connection_string` parameter. No other code changes are needed—the same client works for all supported databases.

<RunnableCodeBlock className="language-python" language="python">
    {SQLStorageClientBasicExample}
</RunnableCodeBlock>

Data is organized in relational tables. Below are the main tables and columns used for each storage type:

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Clients
%% ========================

class SqlDatasetClient {
    <<Dataset>>
}

class SqlKeyValueStoreClient {
    <<Key-value store>>
}

%% ========================
%% Dataset Tables
%% ========================

class datasets {
    <<table>>
    + dataset_id (PK)
    + internal_name
    + name
    + accessed_at
    + created_at
    + modified_at
    + item_count
    + buffer_locked_until
}

class dataset_records {
    <<table>>
    + item_id (PK)
    + dataset_id (FK)
    + data
}

class dataset_metadata_buffer {
    <<table>>
    + id (PK)
    + accessed_at
    + modified_at
    + delta_item_count
}

%% ========================
%% Key-Value Store Tables
%% ========================

class key_value_stores {
    <<table>>
    + key_value_store_id (PK)
    + internal_name
    + name
    + accessed_at
    + created_at
    + modified_at
    + buffer_locked_until
}

class key_value_store_records {
    <<table>>
    + key_value_store_id (FK, PK)
    + key (PK)
    + value
    + content_type
    + size
}

class key_value_store_metadata_buffer {
    <<table>>
    + id (PK)
    + accessed_at
    + modified_at
}

%% ========================
%% Client to Table arrows
%% ========================

SqlDatasetClient --> datasets
SqlDatasetClient --> dataset_records
SqlDatasetClient --> dataset_metadata_buffer

SqlKeyValueStoreClient --> key_value_stores
SqlKeyValueStoreClient --> key_value_store_records
SqlKeyValueStoreClient --> key_value_store_metadata_buffer
```

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Clients
%% ========================

class SqlRequestQueueClient {
    <<Request queue>>
}

%% ========================
%% Request Queue Tables
%% ========================

class request_queues {
    <<table>>
    + request_queue_id (PK)
    + internal_name
    + name
    + accessed_at
    + created_at
    + modified_at
    + had_multiple_clients
    + handled_request_count
    + pending_request_count
    + total_request_count
    + buffer_locked_until
}

class request_queue_records {
    <<table>>
    + request_id (PK)
    + request_queue_id (FK, PK)
    + data
    + sequence_number
    + is_handled
    + time_blocked_until
    + client_key
}

class request_queue_state {
    <<table>>
    + request_queue_id (FK, PK)
    + sequence_counter
    + forefront_sequence_counter
}

class request_queue_metadata_buffer {
    <<table>>
    + id (PK)
    + accessed_at
    + modified_at
    + client_id
    + delta_handled_count
    + delta_pending_count
    + delta_total_count
    + need_recalc
}

%% ========================
%% Client to Table arrows
%% ========================

SqlRequestQueueClient --> request_queues
SqlRequestQueueClient --> request_queue_records
SqlRequestQueueClient --> request_queue_state
SqlRequestQueueClient --> request_queue_metadata_buffer
```

Configuration options for the <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class:

- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory where the default SQLite database will be created if no connection string is provided.
- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start.

Configuration options for the <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> can be set via constructor arguments:

- **`connection_string`** (default: SQLite in <ApiLink to="class/Configuration">`Configuration`</ApiLink> storage dir) - SQLAlchemy connection string, e.g. `sqlite+aiosqlite:///my.db`, `postgresql+asyncpg://user:pass@host/db`, `mysql+aiomysql://user:pass@host/db` or `mariadb+aiomysql://user:pass@host/db`.
- **`engine`** - Pre-configured SQLAlchemy AsyncEngine (optional).

For advanced scenarios, you can configure <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> with a custom SQLAlchemy engine and additional options via the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class. This is useful, for example, when connecting to an external PostgreSQL database or customizing connection pooling.

:::warning
If you use MySQL or MariaDB, pass the `isolation_level='READ COMMITTED'` argument to `create_async_engine`. MySQL/MariaDB default to the `REPEATABLE READ` isolation level, which can cause unnecessary locking, deadlocks, or stale reads when multiple Crawlee workers access the same tables concurrently. Using `READ COMMITTED` ensures more predictable row-level locking and visibility semantics for `SqlStorageClient`.
:::

<CodeBlock className="language-python" language="python">
    {SQLStorageClientConfigurationExample}
</CodeBlock>

### Redis storage client

:::warning Experimental feature
The <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> is experimental. Its API and behavior may change in future releases.
:::

The <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> provides persistent storage using a [Redis](https://redis.io/) database. It supports concurrent access from multiple independent clients or processes and uses Redis native data structures for efficient operations.

:::note Dependencies
The <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> is not included in the core Crawlee package.
To use it, you need to install Crawlee with the Redis extra dependency:

<code>pip install 'crawlee[redis]'</code>

Additionally, Redis version 8.0 or higher is required.
:::

:::note Redis persistence
Data persistence in Redis depends on your [database configuration](https://redis.io/docs/latest/operate/oss_and_stack/management/persistence/).
:::

The client requires either a Redis connection string or a pre-configured Redis client instance. Use a pre-configured client when you need custom Redis settings such as connection pooling, timeouts, or SSL/TLS encryption.

<CodeBlock className="language-python" language="python">
    {RedisStorageClientBasicExample}
</CodeBlock>

Data is organized using Redis key patterns. Below are the main data structures used for each storage type:

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Client
%% ========================

class RedisDatasetClient {
    <<Dataset>>
}

%% ========================
%% Dataset Keys
%% ========================

class DatasetKeys {
    datasets:[name]:items - JSON Array
    datasets:[name]:metadata - JSON Object
}

class DatasetsIndexes {
    datasets:id_to_name - Hash
    datasets:name_to_id - Hash
}

%% ========================
%% Client to Keys arrows
%% ========================

RedisDatasetClient --> DatasetKeys
RedisDatasetClient --> DatasetsIndexes
```

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Clients
%% ========================

class RedisKeyValueStoreClient {
    <<Key-value store>>
}

%% ========================
%% Key-Value Store Keys
%% ========================

class KeyValueStoreKeys {
    key_value_stores:[name]:items - Hash
    key_value_stores:[name]:metadata_items - Hash
    key_value_stores:[name]:metadata - JSON Object
}

class KeyValueStoresIndexes {
    key_value_stores:id_to_name - Hash
    key_value_stores:name_to_id - Hash
}

%% ========================
%% Client to Keys arrows
%% ========================

RedisKeyValueStoreClient --> KeyValueStoreKeys
RedisKeyValueStoreClient --> KeyValueStoresIndexes
```

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Clients
%% ========================

class RedisRequestQueueClient {
    <<Request queue>>
}

%% ========================
%% Request Queue Keys
%% ========================

class RequestQueueKeys{
    request_queues:[name]:queue - List
    request_queues:[name]:data - Hash
    request_queues:[name]:in_progress - Hash
    request_queues:[name]:added_bloom_filter - Bloom Filter | bloom queue_dedup_strategy
    request_queues:[name]:handled_bloom_filter - Bloom Filter | bloom queue_dedup_strategy
    request_queues:[name]:pending_set - Set | default queue_dedup_strategy
    request_queues:[name]:handled_set - Set | default queue_dedup_strategy
    request_queues:[name]:metadata - JSON Object
}

class RequestQueuesIndexes {
    request_queues:id_to_name - Hash
    request_queues:name_to_id - Hash
}

%% ========================
%% Client to Keys arrows
%% ========================

RedisRequestQueueClient --> RequestQueueKeys
RedisRequestQueueClient --> RequestQueuesIndexes
```

Configuration options for the <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class:

- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start.

Configuration options for the <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> can be set via constructor arguments:

- **`connection_string`** - Redis connection string, e.g. `redis://localhost:6379/0`.
- **`redis`** - Pre-configured Redis client instance (optional).

<CodeBlock className="language-python" language="python">
    {RedisStorageClientConfigurationExample}
</CodeBlock>

## Creating a custom storage client

A storage client consists of two parts: the storage client factory and individual storage type clients. The <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> acts as a factory that creates specific clients (<ApiLink to="class/DatasetClient">`DatasetClient`</ApiLink>, <ApiLink to="class/KeyValueStoreClient">`KeyValueStoreClient`</ApiLink>, <ApiLink to="class/RequestQueueClient">`RequestQueueClient`</ApiLink>) where the actual storage logic is implemented.

Here is an example of a custom storage client that implements the <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> interface:

<RunnableCodeBlock className="language-python" language="python">
    {CustomStorageClientExample}
</RunnableCodeBlock>

Custom storage clients can implement any storage logic, such as connecting to a database, using a cloud storage service, or integrating with other systems. They must implement the required methods for creating, reading, updating, and deleting data in the respective storages.

## Registering storage clients

Storage clients can be registered in multiple ways:
- **Globally** - Using the <ApiLink to="class/ServiceLocator">`ServiceLocator`</ApiLink> or passing directly to the crawler.
- **Per storage** - When opening a specific storage instance like <ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, or <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>.

<RunnableCodeBlock className="language-python" language="python">
    {RegisteringStorageClientsExample}
</RunnableCodeBlock>

You can also register different storage clients for each storage instance, allowing you to use different backends for different storages. This is useful when you want to use a fast in-memory storage for <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> while persisting scraping results in <ApiLink to="class/Dataset">`Dataset`</ApiLink> or <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>.

## Conclusion

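Storage clients in Crawlee provide different backends for data storage. Use <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> for testing and fast operations without persistence, or <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> for environments where data needs to persist. When multiple independent clients or processes need concurrent access to the same storages, consider the experimental <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> or <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink>. You can also create custom storage clients for specialized backends by implementing the <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> interface.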

If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!


================================================
FILE: docs/guides/storages.mdx
================================================
---
id: storages
title: Storages
description: How to work with storages in Crawlee, how to manage requests and how to store and retrieve scraping results.
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import OpeningExample from '!!raw-loader!roa-loader!./code_examples/storages/opening.py';

import RqBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_basic_example.py';
import RqWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_with_crawler_example.py';
import RqWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_with_crawler_explicit_example.py';
import RqHelperAddRequestsExample from '!!raw-loader!roa-loader!./code_examples/storages/helper_add_requests_example.py';
import RqHelperEnqueueLinksExample from '!!raw-loader!roa-loader!./code_examples/storages/helper_enqueue_links_example.py';

import DatasetBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_basic_example.py';
import DatasetWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_example.py';
import DatasetWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_explicit_example.py';

import KvsBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_basic_example.py';
import KvsWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_example.py';
import KvsWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_explicit_example.py';

import CleaningDoNotPurgeExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_do_not_purge_example.py';
import CleaningPurgeExplicitlyExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_purge_explicitly_example.py';

Crawlee offers several storage types for managing and persisting your crawling data. Request-oriented storages, such as the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, help you store and deduplicate URLs, while result-oriented storages, like <ApiLink to="class/Dataset">`Dataset`</ApiLink> and <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, focus on storing and retrieving scraping results. This guide explains when to use each type, how to interact with them, and how to control their lifecycle.

## Overview

Crawlee's storage system consists of two main layers:
- **Storages** (<ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>): High-level interfaces for interacting with different storage types.
- **Storage clients** (<ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink>, <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink>, etc.): Backend implementations that handle the actual data persistence and management.

For more information about storage clients and their configuration, see the [Storage clients guide](./storage-clients).

```mermaid
---
config:
    class:
        hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Abstract classes
%% ========================

class Storage {
    <<abstract>>
}

%% ========================
%% Specific classes
%% ========================

class Dataset

class KeyValueStore

class RequestQueue

%% ========================
%% Inheritance arrows
%% ========================

Storage --|> Dataset
Storage --|> KeyValueStore
Storage --|> RequestQueue
```

### Named and unnamed storages

Crawlee supports two types of storages:

- **Named storages**: Persistent storages with a specific name that persist across runs. These are useful when you want to share data between different crawler runs or access the same storage from multiple places.
- **Unnamed storages**: Temporary storages identified by an alias that are scoped to a single run. These are automatically purged at the start of each run (when `purge_on_start` is enabled, which is the default).

### Default storage

Each storage type (<ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>) has a default instance that can be accessed without specifying `id`, `name`, or `alias`. The default unnamed storage is accessed by calling the storage's `open` method without parameters. This is the most common way to use storages in simple crawlers.

<RunnableCodeBlock className="language-python" language="python">
    {OpeningExample}
</RunnableCodeBlock>

## Request queue

The <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> is the primary storage for URLs in Crawlee, especially useful for deep crawling. It supports dynamic addition of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run.

The following code demonstrates the usage of the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>:

<Tabs groupId="request_queue">
    <TabItem value="request_queue_basic_example" label="Basic usage" default>
        <RunnableCodeBlock className="language-python" language="python">
            {RqBasicExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="request_queue_with_crawler" label="Usage with Crawler">
        <RunnableCodeBlock className="language-python" language="python">
            {RqWithCrawlerExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="request_queue_with_crawler_explicit" label="Explicit usage with Crawler">
        <RunnableCodeBlock className="language-python" language="python">
            {RqWithCrawlerExplicitExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

### Request-related helpers

Crawlee provides helper functions to simplify interactions with the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>:

- The <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> function allows you to manually add specific URLs to the configured request storage. In this case, you must explicitly provide the URLs you want to be added to the request storage. If you need to specify further details of the request, such as a `label` or `user_data`, you have to pass instances of the <ApiLink to="class/Request">`Request`</ApiLink> class to the helper.
- The <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function is designed to discover new URLs in the current page and add them to the request storage. It can be used with default settings, requiring no arguments, or you can customize its behavior by specifying link element selectors, choosing different enqueue strategies, or applying include/exclude filters to control which URLs are added. See [Crawl website with relative links](../examples/crawl-website-with-relative-links) example for more details.

<Tabs groupId="request_helpers">
    <TabItem value="request_helper_add_requests" label="Add requests" default>
        <RunnableCodeBlock className="language-python" language="python">
            {RqHelperAddRequestsExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="request_helper_enqueue_links" label="Enqueue links">
        <RunnableCodeBlock className="language-python" language="python">
            {RqHelperEnqueueLinksExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

### Request manager

The <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> implements the <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> interface, which provides a unified API for interacting with different request storage types.

If you need custom functionality, you can create your own request storage by subclassing the <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> class and implementing its required methods.

For a detailed explanation of the <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> and other related components, refer to the [Request loaders guide](https://crawlee.dev/python/docs/guides/request-loaders).

## Dataset

The <ApiLink to="class/Dataset">`Dataset`</ApiLink> is designed for storing structured data, where each entry has a consistent set of attributes, such as products in an online store or real estate listings. Think of a <ApiLink to="class/Dataset">`Dataset`</ApiLink> as a table: each entry corresponds to a row, with attributes represented as columns. Datasets are append-only, allowing you to add new records but not modify or delete existing ones. Every Crawlee project run is associated with a default dataset, typically used to store results specific to that crawler execution. However, using this dataset is optional.

The following code demonstrates basic operations of the dataset:

<Tabs groupId="dataset_storage">
    <TabItem value="dataset_basic_example" label="Basic usage" default>
        <RunnableCodeBlock className="language-python" language="python">
            {DatasetBasicExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="dataset_with_crawler" label="Usage with Crawler">
        <RunnableCodeBlock className="language-python" language="python">
            {DatasetWithCrawlerExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="dataset_with_crawler_explicit" label="Explicit usage with Crawler">
        <RunnableCodeBlock className="language-python" language="python">
            {DatasetWithCrawlerExplicitExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

### Dataset-related helpers

Crawlee provides the following helper function to simplify interactions with the <ApiLink to="class/Dataset">`Dataset`</ApiLink>:

- The <ApiLink to="class/PushDataFunction">`push_data`</ApiLink> function allows you to manually add data to the dataset. You can optionally specify the dataset ID or its name.

## Key-value store

The <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink> is designed to save and retrieve data records or files efficiently. Each record is uniquely identified by a key and is associated with a specific MIME type, making the <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink> ideal for tasks like saving web page screenshots, PDFs, or tracking the state of crawlers.

The following code demonstrates the usage of the <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>:

<Tabs groupId="kv_storage">
    <TabItem value="kvs_basic_example" label="Basic usage" default>
        <RunnableCodeBlock className="language-python" language="python">
            {KvsBasicExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="kvs_with_crawler" label="Usage with Crawler">
        <RunnableCodeBlock className="language-python" language="python">
            {KvsWithCrawlerExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="kvs_with_crawler_explicit" label="Explicit usage with Crawler">
        <RunnableCodeBlock className="language-python" language="python">
            {KvsWithCrawlerExplicitExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

To see a real-world example of how to get the input from the key-value store, see the [Screenshots](https://crawlee.dev/python/docs/examples/capture-screenshots-using-playwright) example.

### Key-value store-related helpers

Crawlee provides the following helper function to simplify interactions with the <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>:

- The <ApiLink to="class/GetKeyValueStoreFunction">`get_key_value_store`</ApiLink> function retrieves the key-value store for the current crawler run. If the KVS does not exist, it will be created. You can also specify the KVS's ID or its name.

## Cleaning up the storages

By default, Crawlee cleans up all unnamed storages (including the default one) at the start of each run, so every crawl begins with a clean state. This behavior is controlled by <ApiLink to="class/Configuration#purge_on_start">`Configuration.purge_on_start`</ApiLink> (default: `True`). In contrast, named storages are never purged automatically and persist across runs. The exact behavior may vary depending on the storage client implementation.

### When purging happens

The cleanup occurs as soon as a storage is accessed:
- When opening a storage explicitly (e.g., <ApiLink to="class/RequestQueue#open">`RequestQueue.open`</ApiLink>, <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink>, <ApiLink to="class/KeyValueStore#open">`KeyValueStore.open`</ApiLink>).
- When using helper functions that implicitly open storages (e.g., <ApiLink to="class/PushDataFunction">`push_data`</ApiLink>).
- Automatically when <ApiLink to="class/BasicCrawler#run">`BasicCrawler.run`</ApiLink> is invoked.

### Disabling automatic purging

To disable automatic purging, set `purge_on_start=False` in your configuration:

<RunnableCodeBlock className="language-python" language="python">
    {CleaningDoNotPurgeExample}
</RunnableCodeBlock>

### Manual purging

The purge-on-start behavior simply triggers the storage's `purge` method, which removes all data from the storage. If you want to purge the storage manually, you can do so by calling the `purge` method on the storage instance. If you want to delete the storage completely, you can instead call the `drop` method on the storage instance, which will remove the storage, including metadata and all its data.

<RunnableCodeBlock className="language-python" language="python">
    {CleaningPurgeExplicitlyExample}
</RunnableCodeBlock>

Note that purging behavior may vary between storage client implementations. For more details on storage configuration and client implementations, see the [Storage clients guide](./storage-clients).

## Conclusion

This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned about the distinction between named storages (persistent across runs) and unnamed storages with aliases (temporary and purged on start). You discovered how to manage requests using the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> and store and retrieve scraping results using the <ApiLink to="class/Dataset">`Dataset`</ApiLink> and <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>. You also learned how to use helper functions to simplify interactions with these storages and how to control storage cleanup behavior.

If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!


================================================
FILE: docs/guides/trace_and_monitor_crawlers.mdx
================================================
---
id: trace-and-monitor-crawlers
title: Trace and monitor crawlers
description: Learn how to instrument your crawlers with OpenTelemetry to trace request handling, identify bottlenecks, monitor performance, and visualize telemetry data using Jaeger for performance optimization.
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import InstrumentCrawler from '!!raw-loader!./code_examples/trace_and_monitor_crawlers/instrument_crawler.py';

[OpenTelemetry](https://opentelemetry.io/) is a collection of APIs, SDKs, and tools to instrument, generate, collect, and export telemetry data (metrics, logs, and traces) to help you analyze your software’s performance and behavior. In the context of crawler development, it can be used to better understand how the crawler internally works, identify bottlenecks, debug, log metrics, and more. The topic described in this guide requires at least a basic understanding of OpenTelemetry. A good place to start is [What is OpenTelemetry](https://opentelemetry.io/docs/what-is-opentelemetry/).

This guide shows how to set up OpenTelemetry and instrument a specific crawler to see traces of individual requests that are being processed by the crawler. OpenTelemetry on its own does not provide an out-of-the-box tool for convenient visualisation of the exported data (apart from printing to the console), but there are several good tools available for that. In this guide, we will use [Jaeger](https://www.jaegertracing.io/) to visualise the telemetry data. To better understand concepts such as exporter, collector, and visualisation backend, please refer to the [OpenTelemetry documentation](https://opentelemetry.io/docs/collector/).

## Set up the Jaeger

This guide will show how to set up the environment locally to run the example code and visualize the telemetry data in Jaeger, which will be running locally in a [Docker](https://www.docker.com/) container.

To start the preconfigured Docker container, you can use the following command:

```bash
docker run -d --name jaeger -e COLLECTOR_OTLP_ENABLED=true -p 16686:16686 -p 4317:4317 -p 4318:4318 jaegertracing/all-in-one:latest
```
For more details about the Jaeger setup, see the [getting started](https://www.jaegertracing.io/docs/2.7/getting-started/) section in their documentation.
You can see the Jaeger UI in your browser by navigating to http://localhost:16686

## Instrument the Crawler

Now you can proceed with instrumenting the crawler to send the telemetry data to Jaeger and running it. To have the Python environment ready, you should install either **crawlee[all]** or **crawlee[otel]**. This will ensure that OpenTelemetry dependencies are installed, and you can run the example code snippet.
In the following example, you can see the function `instrument_crawler` that contains the instrumentation setup and is called before the crawler is started. If you have already set up Jaeger, then you can just run the following code snippet.

<CodeBlock className="language-python">
    {InstrumentCrawler}
</CodeBlock>

## Analyze the results

In the Jaeger UI, you can search for different traces, apply filtering, compare traces, view their detailed attributes, view timing details, and more. For the detailed description of the tool's capabilities, please refer to the [Jaeger documentation](https://www.jaegertracing.io/docs/1.47/deployment/frontend-ui/#trace-page).

![Jaeger search view](/img/guides/jaeger_otel_search_view_example.png 'Example visualisation of search view in Jaeger')
![Jaeger trace view](/img/guides/jaeger_otel_trace_example.png 'Example visualisation of crawler request trace in Jaeger')

You can use different tools to consume the OpenTelemetry data that might better suit your needs. Please see the list of known vendors in the [OpenTelemetry documentation](https://opentelemetry.io/ecosystem/vendors/).

## Customize the instrumentation

You can customize the <ApiLink to="class/CrawlerInstrumentor">`CrawlerInstrumentor`</ApiLink>. Depending on the arguments used during its initialization, the instrumentation will be applied to different parts of the Crawlee code. By default, it instruments some functions that can give quite a good picture of each individual request handling. To turn this default instrumentation off, you can pass `request_handling_instrumentation=False` during initialization. You can also extend instrumentation by passing `instrument_classes=[...]` initialization argument that contains classes you want to be auto-instrumented. All their public methods will be automatically instrumented. Bear in mind that instrumentation has some runtime costs as well. The more instrumentation is used, the more overhead it will add to the crawler execution.

You can also create your own instrumentation by selecting only the methods you want to instrument. For more details, see the <ApiLink to="class/CrawlerInstrumentor">`CrawlerInstrumentor`</ApiLink> source code and the [Python documentation for OpenTelemetry](https://opentelemetry.io/docs/languages/python/).

If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU).


================================================
FILE: docs/introduction/01_setting_up.mdx
================================================
---
id: setting-up
title: Setting up
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

This guide will help you get started with Crawlee by setting it up on your computer. Follow the steps below to ensure a smooth installation process.

## Prerequisites

Before installing Crawlee itself, make sure that your system meets the following requirements:

- **Python 3.10 or higher**: Crawlee requires Python 3.10 or a newer version. You can download Python from the [official website](https://python.org/downloads/).
- **Python package manager**: While this guide uses [pip](https://pip.pypa.io/) (the most common package manager), you can also use any package manager you want. You can download pip from the [official website](https://pip.pypa.io/en/stable/installation/).

### Verifying prerequisites

To check if Python and pip are installed, run the following commands:

```sh
python --version
```

```sh
python -m pip --version
```

If these commands return the respective versions, you're ready to continue.

## Installing Crawlee

Crawlee is available as [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal.

### Basic installation

To install the core package, run:

```sh
python -m pip install crawlee
```

After installation, verify that Crawlee is installed correctly by checking its version:

```sh
python -c 'import crawlee; print(crawlee.__version__)'
```

### Full installation

If you do not mind the package size, you can run the following command to install Crawlee with all optional features:

```sh
python -m pip install 'crawlee[all]'
```

### Installing specific extras

Depending on your use case, you may want to install specific extras to enable additional functionality:

For using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, install the `beautifulsoup` extra:

```sh
python -m pip install 'crawlee[beautifulsoup]'
```

For using the <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>, install the `parsel` extra:

```sh
python -m pip install 'crawlee[parsel]'
```

For using the <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, install the `curl-impersonate` extra:

```sh
python -m pip install 'crawlee[curl-impersonate]'
```

If you plan to use a (headless) browser with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, install Crawlee with the `playwright` extra:

```sh
python -m pip install 'crawlee[playwright]'
```

After installing the playwright extra, install the necessary Playwright dependencies:

```sh
playwright install
```

### Installing multiple extras

You can install multiple extras at once by using a comma as a separator:

```sh
python -m pip install 'crawlee[beautifulsoup,curl-impersonate]'
```

## Start a new project

The quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared templates. The CLI helps you set up a new project in seconds.

### Using Crawlee CLI with uv

First, ensure you have [uv](https://pypi.org/project/uv/) installed. You can check if it is installed by running:

```sh
uv --version
```

If [uv](https://pypi.org/project/uv/) is not installed, follow the official [installation guide](https://docs.astral.sh/uv/getting-started/installation/).

Then, run the Crawlee CLI using `uvx` and choose from the available templates:

```sh
uvx 'crawlee[cli]' create my_crawler
```

### Using Crawlee CLI directly

If you already have `crawlee` installed, you can spin it up by running:

```sh
crawlee create my_crawler
```

Follow the interactive prompts in the CLI to choose a crawler type and set up your new project.

### Running your project

To run your newly created project, navigate to the project directory, activate the virtual environment, and execute the Python interpreter with the project module:

<Tabs>
  <TabItem value="Linux" label="Linux" default>
      <CodeBlock language="sh">cd my_crawler/</CodeBlock>
      <CodeBlock language="sh">source .venv/bin/activate</CodeBlock>
      <CodeBlock language="sh">python -m my_crawler</CodeBlock>
  </TabItem>
  <TabItem value="Windows" label="Windows">
      <CodeBlock language="sh">cd my_crawler/</CodeBlock>
      <CodeBlock language="sh">.venv\Scripts\activate</CodeBlock>
      <CodeBlock language="sh">python -m my_crawler</CodeBlock>
  </TabItem>
</Tabs>

Congratulations! You have successfully set up and executed your first Crawlee project.

## Next steps

Next, you will learn how to create a very simple crawler and get to know the Crawlee components while building it.


================================================
FILE: docs/introduction/02_first_crawler.mdx
================================================
---
id: first-crawler
title: First crawler
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RequestQueueExample from '!!raw-loader!roa-loader!./code_examples/02_request_queue.py';
import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/02_bs.py';
import BeautifulSoupBetterExample from '!!raw-loader!roa-loader!./code_examples/02_bs_better.py';

Now, you will build your first crawler. But before you do, let's briefly introduce the Crawlee classes involved in the process.

## How Crawlee works

There are 3 main crawler classes available for use in Crawlee.

- <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>
- <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>
- <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>

We'll talk about their differences later. Now, let's talk about what they have in common.

The general idea of each crawler is to go to a web page, open it, do some stuff there, save some results, continue to the next page, and repeat this process until the crawler's done its job. So the crawler always needs to find answers to two questions: _Where should I go?_ and _What should I do there?_ Answering those two questions is the only required setup. The crawlers have reasonable defaults for everything else.

### The where - `Request` and `RequestQueue`

All crawlers use instances of the <ApiLink to="class/Request">`Request`</ApiLink> class to determine where they need to go. Each request may hold a lot of information, but at the very least, it must hold a URL - a web page to open. But having only one URL would not make sense for crawling. Sometimes you have a pre-existing list of your own URLs that you wish to visit, perhaps a thousand. Other times you need to build this list dynamically as you crawl, adding more and more URLs to the list as you progress. Most of the time, you will use both options.

The requests are stored in a <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, a dynamic queue of <ApiLink to="class/Request">`Request`</ApiLink> instances. You can seed it with start URLs and also add more requests while the crawler is running. This allows the crawler to open one page, extract interesting data, such as links to other pages on the same domain, add them to the queue (called _enqueuing_) and repeat this process to build a queue of a virtually unlimited number of URLs.

### The what - request handler

In the request handler you tell the crawler what to do at each and every page it visits. You can use it to handle extraction of data from the page, processing the data, saving it, calling APIs, doing calculations and so on.

The request handler is a user-defined function, invoked automatically by the crawler for each <ApiLink to="class/Request">`Request`</ApiLink> from the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. It always receives a single argument - <ApiLink to="class/BasicCrawlingContext">`BasicCrawlingContext`</ApiLink> (or its descendants). Its properties change depending on the crawler class used, but it always includes the `request` property, which represents the currently crawled URL and related metadata.

## Building a crawler

Let's put the theory into practice and start with something easy. Visit a page and get its HTML title. In this tutorial, you'll scrape the Crawlee website [https://crawlee.dev](https://crawlee.dev), but the same code will work for any website.

### Adding requests to the crawling queue

Earlier you learned that the crawler uses a queue of requests as its source of URLs to crawl. Let's create it and add the first request.

<RunnableCodeBlock className="language-python" language="python">
    {RequestQueueExample}
</RunnableCodeBlock>

The <ApiLink to="class/RequestQueue#add_request">`RequestQueue.add_request`</ApiLink> method automatically converts a URL string to a <ApiLink to="class/Request">`Request`</ApiLink> instance. So now you have a <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> that holds one request which points to `https://crawlee.dev`.

:::tip Bulk add requests

The code above is for illustration of the request queue concept. Soon you'll learn about the <ApiLink to="class/BasicCrawler#add_requests">`BasicCrawler.add_requests`</ApiLink> method which allows you to skip this initialization code, and it also supports adding a large number of requests without blocking.

:::

### Building a BeautifulSoupCrawler

Crawlee comes with three main crawler classes: <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>, and <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>. You can read their short descriptions in the [Quick start](../quick-start) lesson.

Unless you have a good reason to start with a different one, you should try building a <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> first. It is an HTTP crawler with HTTP/2 support, anti-blocking features, and an integrated HTML parser - [BeautifulSoup](https://pypi.org/project/beautifulsoup4/). It's fast, simple, cheap to run and does not require complicated dependencies. The only downside is that it won't work out of the box for websites which require JavaScript rendering. But you might not need JavaScript rendering at all, because many modern websites use server-side rendering.

Let's continue with the earlier <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> example.

<RunnableCodeBlock className="language-python" language="python">
    {BeautifulSoupExample}
</RunnableCodeBlock>

When you run the example, you will see the title of https://crawlee.dev printed to the log. What really happens is that <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> first makes an HTTP request to `https://crawlee.dev`, then parses the received HTML with BeautifulSoup and makes it available as the `context` argument of the request handler.

```log
[__main__] INFO  The title of "https://crawlee.dev" is "Crawlee · Build reliable crawlers. Fast. | Crawlee".
```

### Add requests faster

Earlier we mentioned that you'll learn how to use the <ApiLink to="class/BasicCrawler#add_requests">`BasicCrawler.add_requests`</ApiLink> method to skip the request queue initialization. It's simple. Every crawler has an implicit <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> instance, and you can add requests to it with the <ApiLink to="class/BasicCrawler#add_requests">`BasicCrawler.add_requests`</ApiLink> method. In fact, you can go even further and just use the first parameter of `crawler.run()`!

<RunnableCodeBlock className="language-python" language="python">
    {BeautifulSoupBetterExample}
</RunnableCodeBlock>

When you run this code, you'll see exactly the same output as with the earlier, longer example. The <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> is still there, it's just managed by the crawler automatically.

:::info

This method not only makes the code shorter, it will help with performance too! Internally, it calls the <ApiLink to="class/RequestQueue#add_requests_batched">`RequestQueue.add_requests_batched`</ApiLink> method. It will wait only for the initial batch of 1000 requests to be added to the queue before returning, which means the processing will start almost instantly. After that, it will continue adding the rest of the requests in the background (again, in batches of 1000 items, once every second).

:::

## Next steps

Next, you'll learn about crawling links. That means finding new URLs on the pages you crawl and adding them to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> for the crawler to visit.


================================================
FILE: docs/introduction/03_adding_more_urls.mdx
================================================
---
id: adding-more-urls
title: Adding more URLs
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import OriginalCodeExample from '!!raw-loader!roa-loader!./code_examples/03_original_code.py';
import FindingNewLinksExample from '!!raw-loader!roa-loader!./code_examples/03_finding_new_links.py';
import EnqueueStrategyExample from '!!raw-loader!roa-loader!./code_examples/03_enqueue_strategy.py';
import GlobsExample from '!!raw-loader!roa-loader!./code_examples/03_globs.py';
import TransformExample from '!!raw-loader!roa-loader!./code_examples/03_transform_request.py';

Previously you've built a very simple crawler that downloads HTML of a single page, reads its title and prints it to the console. This is the original source code:

<RunnableCodeBlock className="language-python" language="python">
    {OriginalCodeExample}
</RunnableCodeBlock>

Now you'll use the example from the previous section and improve on it. You'll add more URLs to the queue and thanks to that the crawler will keep going, finding new links, enqueuing them into the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> and then scraping them.

## How crawling works

The process is simple:

1. Find new links on the page.
2. Filter only those pointing to the same domain, in this case [crawlee.dev](https://crawlee.dev/).
3. Enqueue (add) them to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>.
4. Visit the newly enqueued links.
5. Repeat the process.

In the following paragraphs you will learn about the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function which simplifies crawling to a single function call.

:::tip context awareness

The <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function is context aware. It means that it will read the information about the currently crawled page from the context, and you don't need to explicitly provide any arguments. However, you can specify filtering criteria or an enqueuing strategy if desired. It will find the links and automatically add the links to the running crawler's <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>.

:::

## Limit your crawls

When you're just testing your code or when your crawler could potentially find millions of links, it's very useful to set a maximum limit of crawled pages. The option is called <ApiLink to="class/BasicCrawlerOptions#max_requests_per_crawl">`max_requests_per_crawl`</ApiLink>, is available in all crawlers, and you can set it like this:

```python
crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)
```

This means that no new requests will be started after the 10th request is finished. The actual number of processed requests might be a little higher thanks to parallelization, because the running requests won't be forcefully aborted. It's not even possible in most cases.

## Finding new links

There are numerous approaches to finding links to follow when crawling the web. For our purposes, we will be looking for `<a>` elements that contain the `href` attribute because that's what you need in most cases. For example:

```html
<a href="https://crawlee.dev/docs/introduction">This is a link to Crawlee introduction</a>
```

Since this is the most common case, it is also the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> default.

<RunnableCodeBlock className="language-python" language="python">
    {FindingNewLinksExample}
</RunnableCodeBlock>

If you need to override the default selection of elements in <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>, you can use the `selector` argument.

```python
await context.enqueue_links(selector='a.article-link')
```

## Filtering links to same domain

Websites typically contain a lot of links that lead away from the original page. This is normal, but when crawling a website, we usually want to crawl that one site and not let our crawler wander away to Google, Facebook and Twitter. Therefore, we need to filter out the off-domain links and only keep the ones that lead to the same domain.

```python
# The default behavior of enqueue_links is to stay on the same hostname, so it does not require
# any parameters. This will ensure the subdomain stays the same.
await context.enqueue_links()
```

The default behavior of <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> is to stay on the same hostname. This **does not include subdomains**. To include subdomains in your crawl, use the `strategy` argument. The `strategy` argument is an instance of the `EnqueueStrategy` type alias.

<RunnableCodeBlock className="language-python" language="python">
    {EnqueueStrategyExample}
</RunnableCodeBlock>

When you run the code, you will see the crawler log the **title** of the first page, then the **enqueueing** message showing number of URLs, followed by the **title** of the first enqueued page and so on and so on.

## Skipping duplicate URLs

Skipping of duplicate URLs is critical, because visiting the same page multiple times would lead to duplicate results. This is automatically handled by the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> which deduplicates requests using their `unique_key`. This `unique_key` is automatically generated from the request's URL by lowercasing the URL, lexically ordering query parameters, removing fragments and a few other tweaks that ensure the queue only includes unique URLs.

## Advanced filtering arguments

While the defaults for <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> can often be exactly what you need, it also gives you fine-grained control over which URLs should be enqueued. One way, already mentioned above, is using the `EnqueueStrategy` type alias. You can use the `all` strategy if you want to follow every single link, regardless of its domain, or you can enqueue links that target the same domain name with the `same-domain` strategy.

```python
# Wanders the internet.
await context.enqueue_links(strategy='all')
```

### Filter URLs with patterns

For even more control, you can use the `include` or `exclude` parameters, either as glob patterns or regular expressions, to filter the URLs. Refer to the API documentation for <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> for detailed information on these and other available options.

<RunnableCodeBlock className="language-python" language="python">
    {GlobsExample}
</RunnableCodeBlock>

### Transform requests before enqueuing

For cases where you need to modify or filter requests before they are enqueued, you can use the `transform_request_function` parameter. This function receives a <ApiLink to="class/RequestOptions">`RequestOptions`</ApiLink> object and should return either a modified <ApiLink to="class/RequestOptions">`RequestOptions`</ApiLink> object, or a string of type `RequestTransformAction`, which only allows the values `skip` and `unchanged`. Returning `skip` means the request will be skipped, while `unchanged` will add it without any changes.

<RunnableCodeBlock className="language-python" language="python">
    {TransformExample}
</RunnableCodeBlock>

## Next steps

Next, you will start your project of scraping a production website and learn some more Crawlee tricks in the process.


================================================
FILE: docs/introduction/04_real_world_project.mdx
================================================
---
id: real-world-project
title: Real-world project
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import SanityCheckExample from '!!raw-loader!roa-loader!./code_examples/04_sanity_check.py';

> _Hey, guys, you know, it's cool that we can scrape the `<title>` elements of web pages, but that's not very useful. Can we finally scrape some real data and save it somewhere in a machine-readable format? Because that's why I started reading this tutorial in the first place!_

We hear you, young padawan! First, learn how to crawl, you must. Only then, walk through data, you can!

## Making a production-grade crawler

Making a production-grade crawler is not difficult, but there are many pitfalls of scraping that can catch you off guard. So for the real world project you'll learn how to scrape a [Warehouse store example](https://warehouse-theme-metal.myshopify.com/collections) instead of the Crawlee website. It contains a list of products of different categories, and each product has its own detail page.

The website requires JavaScript rendering, which allows us to showcase more features of Crawlee. We've also added some helpful tips that prepare you for the real-world issues that you will surely encounter when scraping at scale.

:::tip Not interested in theory?

If you're not interested in crawling theory, feel free to [skip to the next chapter](./crawling) and get right back to coding.

:::

## Drawing a plan

Sometimes scraping is really straightforward, but most of the time, it really pays off to do a bit of research first and try to answer some of these questions:

- How is the website structured?
- Can I scrape it only with HTTP requests (read "with some <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>, e.g. <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>")?
- Do I need a headless browser for something?
- Are there any anti-scraping protections in place?
- Do I need to parse the HTML or can I get the data otherwise, such as directly from the website's API?

For the purposes of this tutorial, let's assume that the website cannot be scraped with <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>. It actually can, but we would have to dive a bit deeper than this introductory guide allows. So for now we will make things easier for you, scrape it with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, and you'll learn about headless browsers in the process.

## Choosing the data you need

A good first step is to figure out what data you want to scrape and where to find it. For the time being, let's just agree that we want to scrape all products from all categories available on the [all collections page of the store](https://warehouse-theme-metal.myshopify.com/collections) and for each product we want to get its:

- URL
- Manufacturer
- SKU
- Title
- Current price
- Stock available

You will notice that some information is available directly on the list page, but for details such as "SKU" we'll also need to open the product's detail page.

![data to scrape](/img/getting-started/scraping-practice.jpg 'Overview of data to be scraped.')

### The start URL(s)

This is where you start your crawl. It's convenient to start as close to the data as possible. For example, it wouldn't make much sense to start at https://warehouse-theme-metal.myshopify.com and look for a `collections` link there, when we already know that everything we want to extract can be found at the https://warehouse-theme-metal.myshopify.com/collections page.

## Exploring the page

Let's take a look at the https://warehouse-theme-metal.myshopify.com/collections page more carefully. There are some **categories** on the page, and each category has a list of **items**. On some category pages, at the bottom you will notice there are links to the next pages of results. This is usually called **the pagination**.

### Categories and sorting

When you click the categories, you'll see that they load a page of products filtered by that category. By going through a few categories and observing the behavior, we can also observe that we can sort by different conditions (such as `Best selling`, or `Price, low to high`), but for this example, we will not be looking into those.

:::caution Limited pagination

Be careful, because on some websites, like [amazon.com](https://amazon.com), this is not true and the sum of products in categories is actually larger than what's available without filters. Learn more in our [tutorial on scraping websites with limited pagination](https://docs.apify.com/tutorials/scrape-paginated-sites).

:::

### Pagination

The pagination of the demo Warehouse Store is simple enough. When switching between pages, you will see that the URL changes to:

```text
https://warehouse-theme-metal.myshopify.com/collections/headphones?page=2
```

Try clicking on the link to page 4. You'll see that the pagination links update and show more pages. But can you trust that this will include all pages and won't stop at some point?

:::caution Test your assumptions

Similarly to the issue with filters explained above, the existence of pagination does not guarantee that you can simply paginate through all the results. Always test your assumptions about pagination. Otherwise, you might miss a chunk of results, and not even know about it.

:::

At the time of writing the `Headphones` collection results counter showed 75 results - products. Quick count of products on one page of results makes 24. 6 rows times 4 products. This means that there are 4 pages of results.

If you're not convinced, you can visit a page somewhere in the middle, like `https://warehouse-theme-metal.myshopify.com/collections/headphones?page=2` and see how the pagination looks there.

## The crawling strategy

Now that you know where to start and how to find all the collection details, let's look at the crawling process.

1. Visit the store page containing the list of categories (our start URL).
2. Enqueue all links to all categories.
3. Enqueue all product pages from the current page.
4. Enqueue links to next pages of results.
5. Open the next page in queue.
    - When it's a results list page, go to 2.
    - When it's a product page, scrape the data.
6. Repeat until all results pages and all products have been processed.

`PlaywrightCrawler` will make sure to visit the pages for you, if you provide the correct requests, and you already know how to enqueue pages, so this should be fairly easy. Nevertheless, there are a few more tricks that we'd like to showcase.

## Sanity check

Let's check that everything is set up correctly before writing the scraping logic itself. You might realize that something in your previous analysis doesn't quite add up, or the website might not behave exactly as you expected.

The example below creates a new crawler that visits the start URL and prints the text content of all the categories on that page. When you run the code, you will see the _very badly formatted_ content of the individual category card.

<RunnableCodeBlock className="language-python" language="python">
    {SanityCheckExample}
</RunnableCodeBlock>

If you're wondering how to get that `.collection-block-item` selector, we'll explain it in the next section on DevTools.

## DevTools - the scraper's toolbox

:::info DevTool choice

We'll use Chrome DevTools here, since it's the most common browser, but feel free to use any other, they're all very similar.

:::

Let's open DevTools by going to https://warehouse-theme-metal.myshopify.com/collections in Chrome and then right-clicking anywhere in the page and selecting **Inspect**, or by pressing **F12** or whatever your system prefers. With DevTools, you can inspect or manipulate any aspect of the currently open web page. You can learn more about DevTools in their [official documentation](https://developer.chrome.com/docs/devtools/).

## Selecting elements

In the DevTools, choose the **Select an element** tool and try hovering over one of the collection cards.

![select an element](/img/getting-started/select-an-element.jpg 'Finding the select an element tool.')

You'll see that you can select different elements inside the card. Instead, select the whole card, not just some of its contents, such as its title or description.

![selected element](/img/getting-started/selected-element.jpg 'Selecting an element by hovering over it.')

Selecting an element will highlight it in the DevTools HTML inspector. When you look carefully at the elements, you'll see that there are some **classes** attached to the different HTML elements. Those are called **CSS classes**, and we can make use of them in scraping.

Conversely, by hovering over elements in the HTML inspector, you will see them highlight on the page. Inspect the page's structure around the collection card. You'll see that all the card's data is displayed in an `<a>` element with a `class` attribute that includes **collection-block-item**. It should now make sense how we got that `.collection-block-item` selector. It's just a way to find all elements that are annotated with the `collection-block-item` class.

It's always a good idea to double-check that you're not getting any unwanted elements with this class. To do that, go into the **Console** tab of DevTools and run:

```ts
document.querySelectorAll('.collection-block-item');
```

You will see that only the 31 collection cards will be returned, and nothing else.

:::tip Learn more about CSS selectors and DevTools

CSS selectors and DevTools are quite a big topic. If you want to learn more, visit the [Web scraping for beginners course](https://developers.apify.com/academy/web-scraping-for-beginners) in the Apify Academy. **It's free and open-source** ❤️.

:::

## Next steps

Next, you will crawl the whole store, including all the listing pages and all the product detail pages.


================================================
FILE: docs/introduction/05_crawling.mdx
================================================
---
id: crawling
title: Crawling
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import CrawlingListingExample from '!!raw-loader!roa-loader!./code_examples/05_crawling_listing.py';
import CrawlingDetailExample from '!!raw-loader!roa-loader!./code_examples/05_crawling_detail.py';

To crawl the whole [Warehouse store example](https://warehouse-theme-metal.myshopify.com/collections) and find all the data, you first need to visit all the pages with products - going through all categories available and also all the product detail pages.

## Crawling the listing pages

In previous lessons, you used the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function like this:

```python
await enqueue_links()
```

While useful in that scenario, you need something different now. Instead of finding all the `<a href="..">` elements with links to the same hostname, you need to find only the specific ones that will take your crawler to the next page of results. Otherwise, the crawler will visit a lot of other pages that you're not interested in. Using the power of DevTools and yet another <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> parameter, this becomes fairly easy.

<RunnableCodeBlock className="language-python" language="python">
    {CrawlingListingExample}
</RunnableCodeBlock>

The code should look pretty familiar to you. It's a very simple request handler where we log the currently processed URL to the console and enqueue more links. But there are also a few new, interesting additions. Let's break it down.

### The `selector` parameter of `enqueue_links`

When you previously used <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>, you were not providing any `selector` parameter, and it was fine, because you wanted to use the default value, which is `a` - finds all `<a>` elements. But now, you need to be more specific. There are multiple `<a>` links on the `Categories` page, and you're only interested in those that will take your crawler to the available list of results. Using the DevTools, you'll find that you can select the links you need using the `.collection-block-item` selector, which selects all the elements that have the `class=collection-block-item` attribute.

### The `label` of `enqueue_links`

You will see `label` used often throughout Crawlee, as it's a convenient way of labelling a <ApiLink to="class/Request">`Request`</ApiLink> instance for quick identification later. You can access it with `request.label` and it's a `string`. You can name your requests any way you want. Here, we used the label `CATEGORY` to note that we're enqueueing pages that represent a category of products. The <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function will add this label to all requests before enqueueing them to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. Why this is useful will become obvious in a minute.

## Crawling the detail pages

In a similar fashion, you need to collect all the URLs to the product detail pages, because only from there you can scrape all the data you need. The following code only repeats the concepts you already know for another set of links.

<RunnableCodeBlock className="language-python" language="python">
    {CrawlingDetailExample}
</RunnableCodeBlock>

The crawling code is now complete. When you run the code, you'll see the crawler visit all the listing URLs and all the detail URLs.

## Next steps

This concludes the Crawling lesson, because you have taught the crawler to visit all the pages it needs. Let's continue with scraping data.


================================================
FILE: docs/introduction/06_scraping.mdx
================================================
---
id: scraping
title: Scraping
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import ScrapingExample from '!!raw-loader!roa-loader!./code_examples/06_scraping.py';

In the [Real-world project](./real-world-project#choosing-the-data-you-need) chapter, you've created a list of the information you wanted to collect about the products in the example Warehouse store. Let's review that and figure out ways to access the data.

- URL
- Manufacturer
- SKU
- Title
- Current price
- Stock available

![data to scrape](/img/getting-started/scraping-practice.jpg 'Overview of data to be scraped.')

## Scraping the URL and manufacturer

Some information is lying right there in front of us without even having to touch the product detail pages. The `URL` we already have - the `context.request.url`. And by looking at it carefully, we realize that we can also extract the manufacturer from the URL (as all product URLs start with `/products/<manufacturer>`). We can just split the `string` and be on our way then!

:::info url vs loaded url

You can use `request.loaded_url` as well. Remember the difference: `request.url` is what you enqueue, `request.loaded_url` is what gets processed (after possible redirects).

:::

By splitting the `request.url`, we can extract the manufacturer name directly from the URL. This is done by first splitting the URL to get the product identifier and then splitting that identifier to get the manufacturer name.

```python
# context.request.url:
# https://warehouse-theme-metal.myshopify.com/products/sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440

# Split the URL and get the last part.
url_part = context.request.url.split('/').pop()
# url_part: sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440

# Split the last part by '-' and get the first element.
manufacturer = url_part.split('-')[0]
# manufacturer: 'sennheiser'
```

:::tip Storing information

It's a matter of preference, whether to store this information separately in the resulting dataset, or not. Whoever uses the dataset can easily parse the `manufacturer` from the `URL`, so should you duplicate the data unnecessarily? Our opinion is that unless the increased data consumption would be too large to bear, it's better to make the dataset as rich as possible. For example, someone might want to filter by `manufacturer`.

:::

:::caution Adapt and extract

One thing you may notice is that the `manufacturer` might have a `-` in its name. If that's the case, your best bet is extracting it from the details page instead, but it's not mandatory. At the end of the day, you should always adjust and pick the best solution for your use case, and website you are crawling.

:::

Now it's time to add more data to the results. Let's open one of the product detail pages, for example the [Sony XBR-950G](https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv) page and use our DevTools-Fu 🥋 to figure out how to get the title of the product.

## Scraping title

To scrape the product title from a webpage, you need to identify its location in the HTML structure. By using the element selector tool in your browser's DevTools, you can see that the title is within an `<h1>` tag, which is a common practice for important headers. This `<h1>` tag is enclosed in a `<div>` with the class `product-meta`. We can leverage this structure to create a combined selector `.product-meta h1`. This selector targets any `<h1>` element that is a descendant of an element with the class `product-meta`.

![product title](/img/getting-started/title.jpg 'Finding product title in DevTools.')

:::tip Verifying selectors with DevTools

Remember that you can press CTRL+F (or CMD+F on Mac) in the **Elements** tab of DevTools to open the search bar where you can quickly search for elements using their selectors. Always verify your scraping process and assumptions using the DevTools. It's faster than changing the crawler code all the time.

:::

To get the title, you need to locate it using Playwright with the `.product-meta h1` selector. This selector specifically targets the `<h1>` element you need. If multiple elements match, it will throw an error, which is beneficial as it prevents returning incorrect data silently. Ensuring the accuracy of your selectors is crucial for reliable data extraction.

```python
title = await context.page.locator('.product-meta h1').text_content()
```

## Scraping SKU

Using the DevTools, you can find that the product SKU is inside a `<span>` tag with the class `product-meta__sku-number`. Since there is no other `<span>` with that class on the page, you can safely use this selector to extract the SKU.

![product sku selector](/img/getting-started/sku.jpg 'Finding product SKU in DevTools.')

```python
# Find the SKU element using the selector and get its text content.
sku = await context.page.locator('span.product-meta__sku-number').text_content()
```

## Scraping current price

Using DevTools, you can find that the current price is within a `<span>` element tagged with the `price` class. However, it is nested alongside another `<span>` element with the `visually-hidden` class. To avoid extracting the wrong text, you can filter the elements to get the correct one using the `has_text` helper.

![product current price selector](/img/getting-started/current-price.jpg 'Finding product current price in DevTools.')

```python
# Locate the price element and filter out the visually hidden elements.
price_element = context.page.locator('span.price', has_text='$').first

# Extract the text content of the price element.
current_price_string = await price_element.text_content() or ''
# current_price_string: 'Sale price$1,398.00'

# Split the string by the '$' sign to get the numeric part.
raw_price = current_price_string.split('$')[1]
# raw_price: '1,398.00'

# Convert the raw price string to a float after removing commas.
price = float(raw_price.replace(',', ''))
# price: 1398.00
```

It might look a little complex at first glance, but let's walk through what you did. First, you locate the correct part of the `price` span by filtering for elements containing the `$` sign. This ensures that you get the actual price element. Once you have the right element, you extract its text content, which gives you a string similar to `Sale price$1,398.00`. To get the numeric value, you split this string by the `$` sign. Next, you remove any commas from the resulting numeric string and convert it to a float, allowing you to work with the price as a number. This process ensures that you accurately extract and convert the current price from the product page.

## Scraping stock availability

The final step is to scrape the stock availability information. There is a `<span>` with the class `product-form__inventory`, which contains the text `In stock` if the product is available. You can use the `has_text` helper to filter out the correct element.

```python
# Locate the element that contains the text 'In stock' and filter out other elements.
in_stock_element = context.page.locator(
    selector='span.product-form__inventory',
    has_text='In stock',
).first

# Check if the element exists by counting the matching elements.
in_stock = await in_stock_element.count() > 0
```

For this, all that matters is whether the element exists or not. You can use the `count()` method to check if any elements match the selector. If there are, it means the product is in stock.

## Trying it out

You have everything that is needed, so grab your newly created scraping logic, dump it into your original request handler and see the magic happen!

<RunnableCodeBlock className="language-python" language="python">
    {ScrapingExample}
</RunnableCodeBlock>

When you run the crawler, you will see the crawled URLs and their scraped data printed to the console. The output will look something like this:

```json
{
    "url": "https://warehouse-theme-metal.myshopify.com/products/sony-str-za810es-7-2-channel-hi-res-wi-fi-network-av-receiver",
    "manufacturer": "sony",
    "title": "Sony STR-ZA810ES 7.2-Ch Hi-Res Wi-Fi Network A/V Receiver",
    "sku": "SON-692802-STR-DE",
    "price": 698,
    "in_stock": true
}
```

## Next steps

Next, you'll see how to save the data you scraped to the disk for further processing.


================================================
FILE: docs/introduction/07_saving_data.mdx
================================================
---
id: saving-data
title: Saving data
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import FirstCodeExample from '!!raw-loader!./code_examples/07_first_code.py';

import FinalCodeExample from '!!raw-loader!roa-loader!./code_examples/07_final_code.py';

A data extraction job would not be complete without saving the data for later use and processing. You've come to the final and most difficult part of this tutorial so make sure to pay attention very carefully!

## Save data to the dataset

Crawlee provides a <ApiLink to="class/Dataset">`Dataset`</ApiLink> class, which acts as an abstraction over tabular storage, making it useful for storing scraping results. To get started:

- Add the necessary imports: Include the <ApiLink to="class/Dataset">`Dataset`</ApiLink> and any required crawler classes at the top of your file.
- Create a Dataset instance: Use the asynchronous <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink> constructor to initialize the dataset instance within your crawler's setup.

Here's an example:

<CodeBlock language="python">
    {FirstCodeExample}
</CodeBlock>

Finally, instead of logging the extracted data to stdout, we can export them to the dataset:

```python
# ...

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # ...

        data = {
            'manufacturer': manufacturer,
            'title': title,
            'sku': sku,
            'price': price,
            'in_stock': in_stock,
        }

        # Push the data to the dataset.
        await dataset.push_data(data)

        # ...
```

### Using a context helper

Instead of importing a new class and manually creating an instance of the dataset, you can use the context helper  <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink>. Remove the dataset import and instantiation, and replace `dataset.push_data` with the following:

```python
# ...

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # ...

        data = {
            'manufacturer': manufacturer,
            'title': title,
            'sku': sku,
            'price': price,
            'in_stock': in_stock,
        }

        # Push the data to the dataset.
        await context.push_data(data)

        # ...
```

### Final code

And that's it. Unlike earlier, we are being serious now. That's it, you're done. The final code looks like this:

<RunnableCodeBlock className="language-python" language="python">
    {FinalCodeExample}
</RunnableCodeBlock>

## What `push_data` does?

A helper <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink> saves data to the default dataset. You can provide additional arguments there like `id` or `name` to open a different dataset. Dataset is a storage designed to hold data in a format similar to a table. Each time you call <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink> or direct <ApiLink to="class/Dataset#push_data">`Dataset.push_data`</ApiLink> a new row in the table is created, with the property names serving as column titles. In the default configuration, the rows are represented as JSON files saved on your file system, but other backend storage systems can be plugged into Crawlee as well. More on that later.

:::info Automatic dataset initialization

Each time you start Crawlee a default <ApiLink to="class/Dataset">`Dataset`</ApiLink> is automatically created, so there's no need to initialize it or create an instance first. You can create as many datasets as you want and even give them names. For more details see the <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink> function.

:::

{/* TODO: mention result storage guide once it's done

:::info Automatic dataset initialization

Each time you start Crawlee a default <ApiLink to="class/Dataset">`Dataset`</ApiLink> is automatically created, so there's no need to initialize it or create an instance first. You can create as many datasets as you want and even give them names. For more details see the [Result storage guide](../guides/result-storage#dataset) and the `Dataset.open()` function.

:::
*/}

## Finding saved data

Unless you changed the configuration that Crawlee uses locally, which would suggest that you knew what you were doing, and you didn't need this tutorial anyway, you'll find your data in the storage directory that Crawlee creates in the working directory of the running script:

```text
{PROJECT_FOLDER}/storage/datasets/default/
```

The above folder will hold all your saved data in numbered files, as they were pushed into the dataset. Each file represents one invocation of <ApiLink to="class/Dataset#push_data">`Dataset.push_data`</ApiLink> or one table row.

{/* TODO: add mention of "Result storage guide" once it's ready:

:::tip Single file data storage options

If you would like to store your data in a single big file, instead of many small ones, see the [Result storage guide](../guides/result-storage#key-value-store) for Key-value stores.

:::

*/}

## Next steps

Next, you'll see some improvements that you can add to your crawler code that will make it more readable and maintainable in the long run.


================================================
FILE: docs/introduction/08_refactoring.mdx
================================================
---
id: refactoring
title: Refactoring
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import MainExample from '!!raw-loader!./code_examples/08_main.py';
import RoutesExample from '!!raw-loader!./code_examples/08_routes.py';

It may seem that the data is extracted and the crawler is done, but honestly, this is just the beginning. For the sake of brevity, we've completely omitted error handling, proxies, logging, architecture, tests, documentation and other things that reliable software should have. The good thing is, error handling is mostly done by Crawlee itself, so no worries on that front, unless you need some custom magic.

:::info Navigating automatic bot-protection avoidance

You might be wondering about the **anti-blocking, bot-protection avoiding stealthy features** and why we haven't highlighted them yet. The reason is straightforward: these features are **automatically used** within the default configuration, providing a smooth start without manual adjustments.

:::

{/* TODO: add this to the info once the relevant guide is ready

However, the default configuration, while powerful, may not cover every scenario.

If you want to learn more, browse the [Avoid getting blocked](../guides/avoid-blocking), [Proxy management](../guides/proxy-management) and [Session management](../guides/session-management) guides.
*/}

To promote good coding practices, let's look at how you can use a <ApiLink to="class/Router">`Router`</ApiLink> class to better structure your crawler code.

## Request routing

In the following code, we've made several changes:

- Split the code into multiple files.
- Added custom instance of <ApiLink to="class/Router">`Router`</ApiLink> to make our routing cleaner, without if clauses.
- Moved route definitions to a separate `routes.py` file.
- Simplified the `main.py` file to focus on the general structure of the crawler.

### Routes file

First, let's define our routes in a separate file:

<CodeBlock className="language-python" title="src/routes.py">
    {RoutesExample}
</CodeBlock>

### Main file

Next, our main file becomes much simpler and cleaner:

<CodeBlock className="language-python" title="src/main.py">
    {MainExample}
</CodeBlock>

By structuring your code this way, you achieve better separation of concerns, making the code easier to read, manage and extend. The <ApiLink to="class/Router">`Router`</ApiLink> class keeps your routing logic clean and modular, replacing if clauses with function decorators.

## Summary

Refactoring your crawler code with these practices enhances readability, maintainability, and scalability.

### Splitting your code into multiple files

There's no reason not to split your code into multiple files and keep your logic separate. Less code in a single file means less complexity to handle at any time, which improves overall readability and maintainability. Consider further splitting the routes into separate files for even better organization.

### Using a router to structure your crawling

Initially, using a simple `if` / `else` statement for selecting different logic based on the crawled pages might appear more readable. However, this approach can become cumbersome with more than two types of pages, especially when the logic for each page extends over dozens or even hundreds of lines of code.

It's good practice in any programming language to split your logic into bite-sized chunks that are easy to read and reason about. Scrolling through a thousand-line-long `request_handler()` where everything interacts with everything and variables can be used everywhere is not pleasant, and it is a pain to debug. That's why we prefer the separation of routes into their own files.

## Next steps

In the next and final step, you'll see how to deploy your Crawlee project to the cloud. If you used the CLI to bootstrap your project, you already have a `Dockerfile` ready, and the next section will show you how to deploy it to the [Apify platform](../deployment/apify-platform) with ease.


================================================
FILE: docs/introduction/09_running_in_cloud.mdx
================================================
---
id: deployment
title: Running your crawler in the Cloud
sidebar_label: Running in the Cloud
description: Deploying Crawlee-python projects to the Apify platform
---

import CodeBlock from '@theme/CodeBlock';
import MainExample from '!!raw-loader!./code_examples/09_apify_sdk.py';

## Apify platform

Crawlee is developed by [**Apify**](https://apify.com), the web scraping and automation platform. You could say it is the **home of Crawlee projects**. In this section you'll see how to deploy the crawler there with just a few simple steps. You can deploy a **Crawlee** project wherever you want, but using the [**Apify platform**](https://console.apify.com) will give you the best experience.

{/*In case you want to deploy your Crawlee project to other platforms, check out the [**Deployment**](../deployment) section.*/}

With a few simple steps, you can convert your Crawlee project into a so-called **Actor**. Actors are serverless micro-apps that are easy to develop, run, share, and integrate. The infra, proxies, and storages are ready to go. [Learn more about Actors](https://apify.com/actors).

{/*:::info Choosing between Crawlee CLI and Apify CLI for project setup

We started this guide by using the Crawlee CLI to bootstrap the project - it offers the basic Crawlee templates, including a ready-made `Dockerfile`. If you know you will be deploying your project to the Apify platform, you might want to start with the Apify CLI instead. It also offers several project templates, and those are all set up to be used on the Apify platform right ahead.

:::*/}

## Dependencies

Before we get started, you'll need to install two new dependencies:

- [**Apify SDK**](https://pypi.org/project/apify/), a toolkit for working with the Apify platform. This will allow us to wire the storages (e.g. [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue) and [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)) to the Apify cloud products. The Apify SDK, like Crawlee itself, is available as a PyPI package and can be installed with any Python package manager. To install it using [pip](https://pip.pypa.io/), run:

    ```sh
    pip install apify
    ```

- [**Apify CLI**](https://docs.apify.com/cli/), a command-line tool that will help us with authentication and deployment. It is a [Node.js](https://nodejs.org/) package, and can be installed using any Node.js package manager. In this guide, we will use [npm](https://npmjs.com/). We will install it globally, so you can use it across all your Crawlee and Apify projects. To install it using npm, run:

    ```sh
    npm install -g apify-cli
    ```

## Logging in to the Apify platform

The next step will be [creating your Apify account](https://console.apify.com/sign-up). Don't worry, we have a **free tier**, so you can try things out before you buy in! Once you have that, it's time to log in with the just-installed [Apify CLI](https://docs.apify.com/cli/). You will need your personal access token, which you can find at https://console.apify.com/account#/integrations.

```sh
apify login
```

## Adjusting the code

Now that you have your account set up, you will need to adjust the code a tiny bit. We will use the [Apify SDK](https://docs.apify.com/sdk/python/), which will help us to wire the Crawlee storages (like the [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue)) to their Apify platform counterparts - otherwise Crawlee would keep things only in memory.

Open your `src/main.py` file, and wrap everything in your `main` function with the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager. Your code should look like this:

<CodeBlock className="language-python" title="src/main.py">
    {MainExample}
</CodeBlock>

The context manager will configure Crawlee to use the Apify API instead of its default memory storage interface. It also sets up a few other things, like listening to the platform events via websockets. After the body is finished, it handles graceful shutdown.

:::info Understanding `async with Actor` behavior with environment variables

The [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager works conditionally based on the environment variables, namely based on the `APIFY_IS_AT_HOME` env var, which is set to `true` on the Apify platform. This means that your project will remain working the same locally, but will use the Apify API when deployed to the Apify platform.

:::

## Initializing the project

You will also need to initialize the project for Apify. To do that, use the Apify CLI again:

```sh
apify init
```

The CLI will check the project structure and guide you through the setup process. If prompted, follow the instructions and answer the questions to configure the project correctly. For more information follow the [Apify CLI documentation](https://docs.apify.com/cli/docs).

This will create a folder called `.actor`, and an `actor.json` file inside it - this file contains the configuration relevant to the Apify platform, namely the Actor name, version, build tag, and a few other things. Check out the [relevant documentation](https://docs.apify.com/platform/actors/development/actor-definition/actor-json) to see all the different things you can set up there.

## Ship it!

And that's all, your project is now ready to be published on the Apify platform. You can use the Apify CLI once more to do that:

```sh
apify push
```

This command will create an archive from your project, upload it to the Apify platform and initiate a Docker build. Once finished, you will get a link to your new Actor on the platform.

## Learning more about web scraping

:::tip Explore Apify Academy Resources

If you want to learn more about web scraping and browser automation, check out the [Apify Academy](https://developers.apify.com/academy). It's full of courses and tutorials on the topic. From beginner to advanced. And the best thing: **It's free and open source** ❤️

{/*If you want to do one more project, checkout our tutorial on building a [HackerNews scraper using Crawlee](https://blog.apify.com/crawlee-web-scraping-tutorial/).*/}

:::

## Thank you! 🎉

That's it! Thanks for reading the whole introduction and if there's anything wrong, please 🙏 let us know on [GitHub](https://github.com/apify/crawlee-python) or in our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 👋


================================================
FILE: docs/introduction/code_examples/02_bs.py
================================================
import asyncio

# Add import of crawler and crawling context.
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storages import RequestQueue


async def main() -> None:
    """Seed a request queue with one URL and crawl it with BeautifulSoupCrawler."""
    # Open the request queue first, then add one or more requests to it.
    request_queue = await RequestQueue.open()
    await request_queue.add_request('https://crawlee.dev')

    # The prepared queue is handed to the crawler as its request manager.
    crawler = BeautifulSoupCrawler(request_manager=request_queue)

    # The decorator attaches the function below to the crawler as a request handler.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Read the <title> text with BeautifulSoup (see its documentation for
        # the API). An empty string is used when the page has no <title>.
        title_tag = context.soup.title
        title = title_tag.string if title_tag else ''
        url = context.request.url
        context.log.info(f'The title of {url} is: {title}.')

    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/02_bs_better.py
================================================
import asyncio

# You don't need to import RequestQueue anymore.
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl the Crawlee homepage and log its page title."""
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # An empty string is used when the page has no <title> element.
        title_tag = context.soup.title
        title = title_tag.string if title_tag else ''
        url = context.request.url
        context.log.info(f'The title of {url} is: {title}.')

    # The URLs to start from are passed directly to `run()`.
    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/02_request_queue.py
================================================
import asyncio

from crawlee.storages import RequestQueue


async def main() -> None:
    """Open a request queue and add a single start URL to it."""
    # Creating the request queue instance comes first ...
    request_queue = await RequestQueue.open()

    # ... after which one or more requests can be added to it.
    start_url = 'https://crawlee.dev'
    await request_queue.add_request(start_url)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/03_enqueue_strategy.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl from the Crawlee homepage, enqueueing links with an explicit strategy."""
    # `max_requests_per_crawl` caps the run at 10 requests.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Log the URL being processed and enqueue links found on the page."""
        context.log.info(f'Processing {context.request.url}.')

        # See the `EnqueueStrategy` type alias for more strategy options.
        # highlight-next-line
        await context.enqueue_links(
            # highlight-next-line
            strategy='same-domain',
            # highlight-next-line
        )

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/03_finding_new_links.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Log page titles starting from the Crawlee homepage and follow found links."""
    # The crawl is limited so that test runs stay short and safe.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # An empty string is used when the page has no <title> element.
        title_tag = context.soup.title
        title = title_tag.string if title_tag else ''
        url = context.request.url
        context.log.info(f'The title of {url} is: {title}.')

        # `enqueue_links` is one of the fields of the context. Because it is
        # context aware, it can be called without any parameters.
        await context.enqueue_links()

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/03_globs.py
================================================
import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl from the Crawlee homepage, filtering enqueued links with glob patterns."""
    # `max_requests_per_crawl` caps the run at 10 requests.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Log the URL being processed and enqueue links filtered by the globs."""
        context.log.info(f'Processing {context.request.url}.')

        # Enqueue links that match the 'include' glob pattern and
        # do not match the 'exclude' glob pattern.
        # NOTE(review): the patterns below look like illustrative placeholders;
        # replace them with patterns that fit the site you are crawling.
        # highlight-next-line
        await context.enqueue_links(
            # highlight-next-line
            include=[Glob('https://someplace.com/**/cats')],
            # highlight-next-line
            exclude=[Glob('https://**/archive/**')],
            # highlight-next-line
        )

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/03_original_code.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl the Crawlee homepage and log its page title."""
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # An empty string is used when the page has no <title> element.
        title_tag = context.soup.title
        title = title_tag.string if title_tag else ''
        url = context.request.url
        context.log.info(f'The title of {url} is: {title}.')

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/03_transform_request.py
================================================
from __future__ import annotations

import asyncio

from crawlee import HttpHeaders, RequestOptions, RequestTransformAction
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


def transform_request(
    request_options: RequestOptions,
) -> RequestOptions | RequestTransformAction:
    """Decide how a request found by `enqueue_links` should be enqueued.

    Args:
        request_options: Options of the request about to be enqueued. The
            mapping is modified in place when a transformation applies.

    Returns:
        `'skip'` for URLs ending in `.pdf`, the modified `request_options` for
        URLs containing `/docs` or `/blog`, and `'unchanged'` for all others.
    """
    url = request_options['url']

    # PDF files are not crawled at all.
    if url.endswith('.pdf'):
        return 'skip'

    # Requests to specific URLs get custom headers.
    if '/docs' in url:
        request_options['headers'] = HttpHeaders({'Custom-Header': 'value'})
        return request_options

    # Certain URLs get a label.
    if '/blog' in url:
        request_options['label'] = 'BLOG'
        return request_options

    # Everything else proceeds without any transformation.
    return 'unchanged'


async def main() -> None:
    """Crawl from the Crawlee homepage, transforming requests before enqueueing."""
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Enqueue the page's links, passing each through `transform_request`."""
        context.log.info(f'Processing {context.request.url}.')

        # Each request is transformed before it is enqueued.
        await context.enqueue_links(
            transform_request_function=transform_request,
        )

    @crawler.router.handler('BLOG')
    async def blog_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Handle requests carrying the 'BLOG' label."""
        context.log.info(f'Blog Processing {context.request.url}.')

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/04_sanity_check.py
================================================
import asyncio

# Instead of BeautifulSoupCrawler let's use Playwright to be able to render JavaScript.
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Open the store's collections page and log the text of every collection card."""
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        card_selector = '.collection-block-item'
        page = context.page

        # Wait until the collection cards have rendered, so the elements we
        # want to work with are present in the DOM.
        await page.wait_for_selector(card_selector)

        # Run a function inside the browser context that takes the collection
        # card elements and returns their text content with leading and
        # trailing whitespace trimmed.
        category_texts = await page.eval_on_selector_all(
            card_selector,
            '(els) => els.map(el => el.textContent.trim())',
        )

        # Log each extracted text under a 1-based number.
        for i, text in enumerate(category_texts):
            context.log.info(f'CATEGORY_{i + 1}: {text}')

    start_urls = ['https://warehouse-theme-metal.myshopify.com/collections']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/05_crawling_detail.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl the store from its collections page down to product detail URLs."""
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Branch on the request label: DETAIL, CATEGORY, or the unlabeled start page."""
        context.log.info(f'Processing {context.request.url}')

        # We're not processing detail pages yet, so we just pass.
        if context.request.label == 'DETAIL':
            pass

        # We are now on a category page. We can use this to paginate through and
        # enqueue all products, as well as any subsequent pages we find.
        elif context.request.label == 'CATEGORY':
            # Wait for the product items to render.
            await context.page.wait_for_selector('.product-item > a')

            # Enqueue links found within elements matching the provided selector.
            # These links will be added to the crawling queue with the label DETAIL.
            await context.enqueue_links(
                selector='.product-item > a',
                label='DETAIL',
            )

            # Find the "Next" button to paginate through the category pages.
            next_button = await context.page.query_selector('a.pagination__next')

            # If a "Next" button is found, enqueue the next page of results.
            if next_button:
                await context.enqueue_links(
                    selector='a.pagination__next',
                    label='CATEGORY',
                )

        # This indicates we're on the start page with no specific label.
        # On the start page, we want to enqueue all the category pages.
        else:
            # Wait for the collection cards to render.
            await context.page.wait_for_selector('.collection-block-item')

            # Enqueue links found within elements matching the provided selector.
            # These links will be added to the crawling queue with the label CATEGORY.
            await context.enqueue_links(
                selector='.collection-block-item',
                label='CATEGORY',
            )

    await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/05_crawling_listing.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Open the store's collections page and enqueue every category link."""
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

        card_selector = '.collection-block-item'

        # Wait until the category cards have rendered, so the elements we want
        # to work with are present in the DOM.
        await context.page.wait_for_selector(card_selector)

        # Links found within the matching elements are added to the crawling
        # queue with the label CATEGORY.
        await context.enqueue_links(selector=card_selector, label='CATEGORY')

    start_urls = ['https://warehouse-theme-metal.myshopify.com/collections']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/06_scraping.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl the store and log the data scraped from each product detail page."""
    crawler = PlaywrightCrawler(
        # Let's limit our crawls to make our tests shorter and safer.
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Scrape DETAIL pages, paginate CATEGORY pages, and seed from the start page."""
        context.log.info(f'Processing {context.request.url}')

        # We're on a product detail page, so we extract the product data from it.
        if context.request.label == 'DETAIL':
            # Split the URL and get the last part to extract the manufacturer.
            url_part = context.request.url.split('/').pop()
            manufacturer = url_part.split('-')[0]

            # Extract the title using the combined selector.
            title = await context.page.locator('.product-meta h1').text_content()

            # Extract the SKU using its selector.
            sku = await context.page.locator(
                'span.product-meta__sku-number'
            ).text_content()

            # Locate the price element that contains the '$' sign and filter out
            # the visually hidden elements.
            price_element = context.page.locator('span.price', has_text='$').first
            current_price_string = await price_element.text_content() or ''
            # Keep the text after the first '$' and drop thousands separators
            # before converting it to a number.
            raw_price = current_price_string.split('$')[1]
            price = float(raw_price.replace(',', ''))

            # Locate the element that contains the text 'In stock'
            # and filter out other elements.
            in_stock_element = context.page.locator(
                selector='span.product-form__inventory',
                has_text='In stock',
            ).first
            in_stock = await in_stock_element.count() > 0

            # Put it all together in a dictionary.
            data = {
                'manufacturer': manufacturer,
                'title': title,
                'sku': sku,
                'price': price,
                'in_stock': in_stock,
            }

            # Print the extracted data.
            context.log.info(data)

        # We are now on a category page. We can use this to paginate through and
        # enqueue all products, as well as any subsequent pages we find.
        elif context.request.label == 'CATEGORY':
            # Wait for the product items to render.
            await context.page.wait_for_selector('.product-item > a')

            # Enqueue links found within elements matching the provided selector.
            # These links will be added to the crawling queue with the label DETAIL.
            await context.enqueue_links(
                selector='.product-item > a',
                label='DETAIL',
            )

            # Find the "Next" button to paginate through the category pages.
            next_button = await context.page.query_selector('a.pagination__next')

            # If a "Next" button is found, enqueue the next page of results.
            if next_button:
                await context.enqueue_links(
                    selector='a.pagination__next',
                    label='CATEGORY',
                )

        # This indicates we're on the start page with no specific label.
        # On the start page, we want to enqueue all the category pages.
        else:
            # Wait for the collection cards to render.
            await context.page.wait_for_selector('.collection-block-item')

            # Enqueue links found within elements matching the provided selector.
            # These links will be added to the crawling queue with the label CATEGORY.
            await context.enqueue_links(
                selector='.collection-block-item',
                label='CATEGORY',
            )

    await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/07_final_code.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl the store and push the data scraped from each product page to the dataset."""
    crawler = PlaywrightCrawler(
        # Let's limit our crawls to make our tests shorter and safer.
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Scrape DETAIL pages, paginate CATEGORY pages, and seed from the start page."""
        context.log.info(f'Processing {context.request.url}')

        # We're on a product detail page, so we extract the product data from it.
        if context.request.label == 'DETAIL':
            # Split the URL and get the last part to extract the manufacturer.
            url_part = context.request.url.split('/').pop()
            manufacturer = url_part.split('-')[0]

            # Extract the title using the combined selector.
            title = await context.page.locator('.product-meta h1').text_content()

            # Extract the SKU using its selector.
            sku = await context.page.locator(
                'span.product-meta__sku-number'
            ).text_content()

            # Locate the price element that contains the '$' sign and filter out
            # the visually hidden elements.
            price_element = context.page.locator('span.price', has_text='$').first
            current_price_string = await price_element.text_content() or ''
            # Keep the text after the first '$' and drop thousands separators
            # before converting it to a number.
            raw_price = current_price_string.split('$')[1]
            price = float(raw_price.replace(',', ''))

            # Locate the element that contains the text 'In stock' and filter out
            # other elements.
            in_stock_element = context.page.locator(
                selector='span.product-form__inventory',
                has_text='In stock',
            ).first
            in_stock = await in_stock_element.count() > 0

            # Put it all together in a dictionary.
            data = {
                'manufacturer': manufacturer,
                'title': title,
                'sku': sku,
                'price': price,
                'in_stock': in_stock,
            }

            # Push the data to the dataset.
            await context.push_data(data)

        # We are now on a category page. We can use this to paginate through and
        # enqueue all products, as well as any subsequent pages we find.
        elif context.request.label == 'CATEGORY':
            # Wait for the product items to render.
            await context.page.wait_for_selector('.product-item > a')

            # Enqueue links found within elements matching the provided selector.
            # These links will be added to the crawling queue with the label DETAIL.
            await context.enqueue_links(
                selector='.product-item > a',
                label='DETAIL',
            )

            # Find the "Next" button to paginate through the category pages.
            next_button = await context.page.query_selector('a.pagination__next')

            # If a "Next" button is found, enqueue the next page of results.
            if next_button:
                await context.enqueue_links(
                    selector='a.pagination__next',
                    label='CATEGORY',
                )

        # This indicates we're on the start page with no specific label.
        # On the start page, we want to enqueue all the category pages.
        else:
            # Wait for the collection cards to render.
            await context.page.wait_for_selector('.collection-block-item')

            # Enqueue links found within elements matching the provided selector.
            # These links will be added to the crawling queue with the label CATEGORY.
            await context.enqueue_links(
                selector='.collection-block-item',
                label='CATEGORY',
            )

    await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/07_first_code.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import Dataset

# ...


async def main() -> None:
    # Skeleton only: the `...` placeholders mark code elided from this snippet.
    crawler = PlaywrightCrawler()
    # Open a `Dataset` instance explicitly through the storage class.
    dataset = await Dataset.open()

    # ...

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        ...
        # ...


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/08_main.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler

from .routes import router


async def main() -> None:
    """Crawl the demo store, routing each request through the imported router."""
    start_urls = ['https://warehouse-theme-metal.myshopify.com/collections']

    crawler = PlaywrightCrawler(
        # Provide our router instance to the crawler.
        request_handler=router,
        # Let's limit our crawls to make our tests shorter and safer.
        max_requests_per_crawl=10,
    )

    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/08_routes.py
================================================
from crawlee.crawlers import PlaywrightCrawlingContext
from crawlee.router import Router

# Router typed for `PlaywrightCrawlingContext`; the handlers below register on it.
router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
    """Fallback route; it handles the start URL, which carries no label."""
    context.log.info(f'default_handler is processing {context.request.url}')

    # The collection cards are awaited before their links are collected.
    card_selector = '.collection-block-item'
    await context.page.wait_for_selector(card_selector)

    # Links found within the cards are enqueued with the CATEGORY label.
    await context.enqueue_links(selector=card_selector, label='CATEGORY')


@router.handler('CATEGORY')
async def category_handler(context: PlaywrightCrawlingContext) -> None:
    """Handle CATEGORY requests.

    This replaces the context.request.label == CATEGORY branch of the if clause.
    """
    context.log.info(f'category_handler is processing {context.request.url}')

    product_selector = '.product-item > a'
    pagination_selector = 'a.pagination__next'

    # Wait for the product links, then enqueue them with the DETAIL label.
    await context.page.wait_for_selector(product_selector)
    await context.enqueue_links(selector=product_selector, label='DETAIL')

    # Without a "Next" link there is no further page of results to enqueue.
    if not await context.page.query_selector(pagination_selector):
        return

    await context.enqueue_links(selector=pagination_selector, label='CATEGORY')


@router.handler('DETAIL')
async def detail_handler(context: PlaywrightCrawlingContext) -> None:
    """Handle DETAIL requests.

    This replaces the context.request.label == DETAIL branch of the if clause.
    The product fields are read from the page and pushed as a single record.
    """
    context.log.info(f'detail_handler is processing {context.request.url}')

    page = context.page

    # The manufacturer is the part of the last URL segment before the first dash.
    last_segment = context.request.url.rsplit('/', 1)[-1]
    manufacturer = last_segment.partition('-')[0]

    title = await page.locator('.product-meta h1').text_content()
    sku = await page.locator('span.product-meta__sku-number').text_content()

    # Take the first price element containing "$" and parse the number after it.
    price_locator = page.locator('span.price', has_text='$').first
    price_text = await price_locator.text_content() or ''
    price = float(price_text.split('$')[1].replace(',', ''))

    # The product counts as in stock when the "In stock" element is present.
    stock_locator = page.locator(
        selector='span.product-form__inventory',
        has_text='In stock',
    ).first
    in_stock_count = await stock_locator.count()

    await context.push_data(
        {
            'manufacturer': manufacturer,
            'title': title,
            'sku': sku,
            'price': price,
            'in_stock': in_stock_count > 0,
        }
    )


================================================
FILE: docs/introduction/code_examples/09_apify_sdk.py
================================================
import asyncio

# highlight-next-line
from apify import Actor

from crawlee.crawlers import PlaywrightCrawler

from .routes import router


async def main() -> None:
    """Run the router-driven crawler inside the Apify `Actor` context."""
    start_urls = ['https://warehouse-theme-metal.myshopify.com/collections']

    # highlight-next-line
    async with Actor:
        crawler = PlaywrightCrawler(
            # Provide our router instance to the crawler.
            request_handler=router,
            # Let's limit our crawls to make our tests shorter and safer.
            max_requests_per_crawl=10,
        )

        await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/introduction/code_examples/__init__.py
================================================


================================================
FILE: docs/introduction/code_examples/routes.py
================================================
from crawlee.crawlers import PlaywrightCrawlingContext
from crawlee.router import Router

# Shared router for `PlaywrightCrawlingContext`; no handlers are registered here.
router = Router[PlaywrightCrawlingContext]()


================================================
FILE: docs/introduction/index.mdx
================================================
---
id: introduction
title: Introduction
---

import ApiLink from '@site/src/components/ApiLink';

Crawlee covers your crawling and scraping end-to-end and helps you **build reliable scrapers. Fast.**

Your crawlers will appear human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it.

## What you will learn

The goal of the introduction is to provide a step-by-step guide to the most important features of Crawlee. It will walk you through creating the simplest of crawlers that only prints text to console, all the way up to a full-featured scraper that collects links from a website and extracts data.

## 🛠 Features

Why is Crawlee the preferred choice for web scraping and crawling?

### Why use Crawlee instead of just a random HTTP library with an HTML parser?

- Unified interface for **HTTP & headless browser** crawling.
- Automatic **parallel crawling** based on available system resources.
- Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking).
- Automatic **retries** on errors or when you are getting blocked.
- Integrated **proxy rotation** and session management.
- Configurable **request routing** - direct URLs to the appropriate handlers.
- Persistent **queue for URLs** to crawl.
- Pluggable **storage** of both tabular data and files.
- Robust **error handling**.

### Why to use Crawlee rather than Scrapy?

- Crawlee has out-of-the-box support for **headless browser** crawling (Playwright).
- Crawlee has a **minimalistic & elegant interface** - Set up your scraper with fewer than 10 lines of code.
- Complete **type hint** coverage.
- Based on standard **Asyncio**.

{/* TODO:

### 👾 HTTP crawling

- ...
*/}

{/* TODO:
### 💻 Real browser crawling

- ...
*/}

## Next steps

Next, you will install Crawlee and learn how to bootstrap projects with the prepared Crawlee templates.


================================================
FILE: docs/pyproject.toml
================================================
# The line length differs from the rest of the code so that the code examples rendered on the generated
# documentation pages fit without a horizontal scrollbar, which keeps them readable.

[tool.ruff]
# Inherit all from project top configuration file.
extend = "../pyproject.toml"

# Override just line length
line-length = 90 # Maximum possible fit to the doc webpage. Longer lines need slider.


================================================
FILE: docs/quick-start/code_examples/beautifulsoup_crawler_example.py
================================================
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl https://crawlee.dev and store the URL and title of each handled page."""
    # BeautifulSoupCrawler crawls the web using HTTP requests
    # and parses HTML using the BeautifulSoup library.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    # The decorator attaches the handler below to the crawler as the default route.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # The title is None when the document has no <title> element.
        title_tag = context.soup.title
        page_title = title_tag.string if title_tag else None

        # Store the extracted data.
        await context.push_data({'url': context.request.url, 'title': page_title})

        # Extract links from the current page and add them to the crawling queue.
        await context.enqueue_links()

    # Add first URL to the queue and start the crawl.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/quick-start/code_examples/parsel_crawler_example.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    """Crawl https://crawlee.dev and store the URL and title of each handled page."""
    # ParselCrawler crawls the web using HTTP requests
    # and parses HTML using the Parsel library.
    crawler = ParselCrawler(max_requests_per_crawl=10)

    # The decorator attaches the handler below to the crawler as the default route.
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Select the text of the <title> element with an XPath query.
        title_nodes = context.selector.xpath('//title/text()')

        # Store the extracted data.
        await context.push_data(
            {
                'url': context.request.url,
                'title': title_nodes.get(),
            }
        )

        # Extract links from the current page and add them to the crawling queue.
        await context.enqueue_links()

    # Add first URL to the queue and start the crawl.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/quick-start/code_examples/playwright_crawler_example.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl https://crawlee.dev with a headless browser and store page titles.

    For every handled page the URL and the page title are pushed to the default
    data storage, and the links found on the page are added to the queue.
    """
    # PlaywrightCrawler crawls the web using a headless browser
    # controlled by the Playwright library.
    # Limit the crawl so the example finishes quickly; without a limit the
    # recursive crawl below would keep following links.
    crawler = PlaywrightCrawler(max_requests_per_crawl=10)

    # Define a request handler to process each crawled page
    # and attach it to the crawler using a decorator.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Extract relevant data from the page context.
        data = {
            'url': context.request.url,
            'title': await context.page.title(),
        }
        # Store the extracted data.
        await context.push_data(data)
        # Extract links from the current page and add them to the crawling queue.
        await context.enqueue_links()

    # Add first URL to the queue and start the crawl.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/quick-start/code_examples/playwright_crawler_headful_example.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler


async def main() -> None:
    """Create a `PlaywrightCrawler` configured for a visible Firefox window."""
    crawler = PlaywrightCrawler(
        # Run with a visible browser window.
        # highlight-next-line
        headless=False,
        # Switch to the Firefox browser.
        browser_type='firefox',
    )

    # ...


# Run the coroutine only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/quick-start/index.mdx
================================================
---
id: quick-start
title: Quick start
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CodeBlock from '@theme/CodeBlock';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import BeautifulsoupCrawlerExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_example.py';
import ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_example.py';
import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_example.py';

import PlaywrightCrawlerHeadfulExample from '!!raw-loader!./code_examples/playwright_crawler_headful_example.py';

This short tutorial will help you start scraping with Crawlee in just a minute or two. For an in-depth understanding of how Crawlee works, check out the [Introduction](../introduction/index.mdx) section, which provides a comprehensive step-by-step guide to creating your first scraper.

## Choose your crawler

Crawlee offers the following main crawler classes: <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>, and <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>. All crawlers share the same interface, providing maximum flexibility when switching between them.

:::caution Minimum Python version

Crawlee requires Python 3.10 or higher.

:::

### BeautifulSoupCrawler

The <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> is a plain HTTP crawler that parses HTML using the well-known [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library. It crawls the web using an HTTP client that mimics a browser. This crawler is very fast and efficient but cannot handle JavaScript rendering.

### ParselCrawler

The <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> is similar to the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> but uses the [Parsel](https://pypi.org/project/parsel/) library for HTML parsing. Parsel is a lightweight library that provides a CSS selector-based API for extracting data from HTML documents. If you are familiar with the [Scrapy](https://scrapy.org/) framework, you will feel right at home with Parsel. As with the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, the <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> cannot handle JavaScript rendering.

### PlaywrightCrawler

The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> uses a headless browser controlled by the [Playwright](https://playwright.dev/) library. It can manage Chromium, Firefox, WebKit, and other browsers. Playwright is the successor to the [Puppeteer](https://pptr.dev/) library and is becoming the de facto standard in headless browser automation. If you need a headless browser, choose Playwright.

## Installation

Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal.

You can install Crawlee with all features or choose only the ones you need. For installing it using the [pip](https://pip.pypa.io/en/stable/) package manager, run the following command:

```sh
python -m pip install 'crawlee[all]'
```

Verify that Crawlee is successfully installed:

```sh
python -c 'import crawlee; print(crawlee.__version__)'
```

If you plan to use the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, you'll need to install Playwright dependencies, including the browser binaries. To do this, run the following command:

```sh
playwright install
```

For detailed installation instructions, see the [Setting up](../introduction/01_setting_up.mdx) documentation page.

## Crawling

Run the following example to perform a recursive crawl of the Crawlee website using the selected crawler.

<Tabs groupId="quickStart">
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler" default>
        <RunnableCodeBlock className="language-python" language="python">
            {BeautifulsoupCrawlerExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="ParselCrawler" label="ParselCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {ParselCrawlerExample}
        </RunnableCodeBlock>
    </TabItem>
    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
        <RunnableCodeBlock className="language-python" language="python">
            {PlaywrightCrawlerExample}
        </RunnableCodeBlock>
    </TabItem>
</Tabs>

When you run the example, you will see Crawlee automating the data extraction process in your terminal.

{/* TODO: improve the logging and add here a sample */}

## Running headful browser

By default, browsers controlled by Playwright run in headless mode (without a visible window). However, you can configure the crawler to run in a headful mode, which is useful during the development phase to observe the browser's actions. You can also switch from the default Chromium browser to Firefox or WebKit.

<CodeBlock language="python">
    {PlaywrightCrawlerHeadfulExample}
</CodeBlock>

When you run the example code, you'll see an automated browser navigating through the Crawlee website.

{/* TODO: add video example */}

## Results

By default, Crawlee stores data in the `./storage` directory within your current working directory. The results of your crawl will be saved as JSON files under `./storage/datasets/default/`.

To view the results, you can use the `cat` command:

```sh
cat ./storage/datasets/default/000000001.json
```

The JSON file will contain data similar to the following:

```json
{
    "url": "https://crawlee.dev/",
    "title": "Crawlee · Build reliable crawlers. Fast. | Crawlee"
}
```

:::tip

If you want to change the storage directory, you can set the `CRAWLEE_STORAGE_DIR` environment variable to your preferred path.

:::

## Examples and further reading

For more examples showcasing various features of Crawlee, visit the [Examples](/docs/examples) section of the documentation. To get a deeper understanding of Crawlee and its components, read the step-by-step [Introduction](../introduction/index.mdx) guide.

[//]: # (TODO: add related links once they are ready)


================================================
FILE: docs/upgrading/upgrading_to_v0x.md
================================================
---
id: upgrading-to-v0x
title: Upgrading to v0.x
---

This page summarizes the breaking changes between the v0.x versions of Crawlee for Python.

## Upgrading to v0.6

This section summarizes the breaking changes between v0.5.x and v0.6.0.

### HttpCrawlerOptions

- Removed `HttpCrawlerOptions` - which contained options from `BasicCrawlerOptions` and unique options `additional_http_error_status_codes` and `ignore_http_error_status_codes`. Both of the unique options were added to `BasicCrawlerOptions` instead.

### HttpClient

- The signature of the `HttpClient` class has been updated. The constructor parameters `additional_http_error_status_codes` and `ignore_http_error_status_codes` have been removed and are now only available in `BasicCrawlerOptions`.
- The method `_raise_for_error_status_code` has been removed from `HttpClient`. Its logic has been moved to the `BasicCrawler` class.

### SessionCookies

- Replaces the `dict` used for cookie storage in `Session.cookies` with a new `SessionCookies` class. `SessionCookies` uses `CookieJar`, which enables support for multiple domains.

### PlaywrightCrawler and PlaywrightBrowserPlugin

- `PlaywrightCrawler` now uses a persistent browser context instead of the standard browser context.
- Added `user_data_dir` parameter for `PlaywrightCrawler` and `PlaywrightBrowserPlugin` to specify the directory for the persistent context. If not provided, a temporary directory will be created automatically.

### Configuration

The `Configuration` fields `chrome_executable_path`, `xvfb`, and `verbose_log` have been removed. The `chrome_executable_path` and `xvfb` fields were unused, while `verbose_log` can be replaced by setting `log_level` to `DEBUG`.

### CLI dependencies

CLI dependencies have been moved to optional dependencies. If you need the CLI, install `crawlee[cli]`.

### Abstract base classes

We decided to move away from [Hungarian notation](https://en.wikipedia.org/wiki/Hungarian_notation) and remove all the `Base` prefixes from the abstract classes. It includes the following public classes:
- `BaseStorageClient` -> `StorageClient`
- `BaseBrowserController` -> `BrowserController`
- `BaseBrowserPlugin` -> `BrowserPlugin`

### EnqueueStrategy

The `EnqueueStrategy` has been changed from an enum to a string literal type. All its values and their meaning remain unchanged.

## Upgrading to v0.5

This section summarizes the breaking changes between v0.4.x and v0.5.0.

### Crawlers & CrawlingContexts

- All crawler and crawling context classes have been consolidated into a single sub-package called `crawlers`.
- The affected classes include: `AbstractHttpCrawler`, `AbstractHttpParser`, `BasicCrawler`, `BasicCrawlerOptions`, `BasicCrawlingContext`, `BeautifulSoupCrawler`, `BeautifulSoupCrawlingContext`, `BeautifulSoupParserType`, `ContextPipeline`, `HttpCrawler`, `HttpCrawlerOptions`, `HttpCrawlingContext`, `HttpCrawlingResult`, `ParsedHttpCrawlingContext`, `ParselCrawler`, `ParselCrawlingContext`, `PlaywrightCrawler`, `PlaywrightCrawlingContext`, `PlaywrightPreNavCrawlingContext`.

Example update:
```diff
- from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+ from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
```

### Storage clients

- All storage client classes have been moved into a single sub-package called `storage_clients`.
- The affected classes include: `MemoryStorageClient`, `BaseStorageClient`.

Example update:
```diff
- from crawlee.memory_storage_client import MemoryStorageClient
+ from crawlee.storage_clients import MemoryStorageClient
```

### CurlImpersonateHttpClient

- The `CurlImpersonateHttpClient` changed its import location.

Example update:
```diff
- from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient
+ from crawlee.http_clients import CurlImpersonateHttpClient
```

### BeautifulSoupParser

- Renamed `BeautifulSoupParser` to `BeautifulSoupParserType`. Probably used only in type hints. Please replace previous usages of `BeautifulSoupParser` by `BeautifulSoupParserType`.
- `BeautifulSoupParser` is now a new class that is used in refactored class `BeautifulSoupCrawler`.

### Service locator

- The `crawlee.service_container` was completely refactored and renamed to `crawlee.service_locator`.
- You can use it to set the configuration, event manager or storage client globally. Or you can pass them to your crawler instance directly and it will use the service locator under the hood.

### Statistics

- The `crawlee.statistics.Statistics` class does not accept an event manager as an input argument anymore. It uses the default, global one.
- If you want to set your custom event manager, do it either via the service locator or pass it to the crawler.

### Request

- The properties `json_` and `order_no` were removed. They existed only for internal use by the memory storage client, so you should not need them.

### Request storages and loaders

- The `request_provider` parameter of `BasicCrawler.__init__` has been renamed to `request_manager`
- The `BasicCrawler.get_request_provider` method has been renamed to `BasicCrawler.get_request_manager` and it does not accept the `id` and `name` arguments anymore
    - If using a specific request queue is desired, pass it as the `request_manager` on `BasicCrawler` creation
- The `RequestProvider` interface has been renamed to `RequestManager` and moved to the `crawlee.request_loaders` package
- `RequestList` has been moved to the `crawlee.request_loaders` package
- `RequestList` does not support `.drop()`, `.reclaim_request()`, `.add_request()` and `.add_requests_batched()` anymore
    - It implements the new `RequestLoader` interface instead of `RequestManager`
    - `RequestManagerTandem` with a `RequestQueue` should be used to enable passing a `RequestList` (or any other `RequestLoader` implementation) as a `request_manager`, `await list.to_tandem()` can be used as a shortcut

### PlaywrightCrawler

- The `PlaywrightPreNavigationContext` was renamed to `PlaywrightPreNavCrawlingContext`.
- The input arguments in `PlaywrightCrawler.__init__` have been renamed:
    - `browser_options` is now `browser_launch_options`,
    - `page_options` is now `browser_new_context_options`.
- These argument renaming changes have also been applied to `BrowserPool`, `PlaywrightBrowserPlugin`, and `PlaywrightBrowserController`.

## Upgrading to v0.4

This section summarizes the breaking changes between v0.3.x and v0.4.0.

### Request model

- The `Request.query_params` field has been removed. Please add query parameters directly to the URL, which was possible before as well, and is now the only supported approach.
- The `Request.payload` and `Request.data` fields have been consolidated. Now, only `Request.payload` remains, and it should be used for all payload data in requests.

### Extended unique key computation

- The computation of `extended_unique_key` now includes HTTP headers. While this change impacts the behavior, the interface remains the same.

## Upgrading to v0.3

This section summarizes the breaking changes between v0.2.x and v0.3.0.

### Public and private interface declaration

In previous versions, the majority of the package was fully public, including many elements intended for internal use only. With the release of v0.3, we have clearly defined the public and private interface of the package. As a result, some imports have been updated (see below). If you are importing something now designated as private, we recommend reconsidering its use or discussing your use case with us in the discussions/issues.

Here is a list of the updated public imports:

```diff
- from crawlee.enqueue_strategy import EnqueueStrategy
+ from crawlee import EnqueueStrategy
```

```diff
- from crawlee.models import Request
+ from crawlee import Request
```

```diff
- from crawlee.basic_crawler import Router
+ from crawlee.router import Router
```

### Request queue

There were internal changes that should not affect the intended usage:

- The unused `BaseRequestQueueClient.list_requests()` method was removed
- `RequestQueue` internals were updated to match the "Request Queue V2" implementation in Crawlee for JS

### Service container

A new module, `crawlee.service_container`, was added to allow management of "global instances" - currently it contains `Configuration`, `EventManager` and `BaseStorageClient`. The module also replaces the `StorageClientManager` static class. It is likely that its interface will change in the future. If your use case requires working with it, please get in touch - we'll be glad to hear any feedback.


================================================
FILE: docs/upgrading/upgrading_to_v1.md
================================================
---
id: upgrading-to-v1
title: Upgrading to v1
---

This page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0.

## Terminology change: "browser" in different contexts

The word "browser" is now used distinctly in two contexts:

- **Playwright context** - Refers to Playwright-supported browsers (`chromium`, `firefox`, `webkit`, `edge`).
- **Fingerprinting context** - Refers to browsers supported by fingerprint generation (`chrome`, `firefox`, `safari`, `edge`).

The type of `HeaderGeneratorOptions.browsers` has changed accordingly:

**Before (v0.6):**

```python
from crawlee.fingerprint_suite import HeaderGeneratorOptions

HeaderGeneratorOptions(browsers=['chromium'])
HeaderGeneratorOptions(browsers=['webkit'])
```

**Now (v1.0):**

```python
from crawlee.fingerprint_suite import HeaderGeneratorOptions

HeaderGeneratorOptions(browsers=['chrome'])
HeaderGeneratorOptions(browsers=['safari'])
```

## New default HTTP client

Crawlee v1.0 now uses `ImpitHttpClient` (based on [impit](https://apify.github.io/impit/) library) as the **default HTTP client**, replacing `HttpxHttpClient` (based on [httpx](https://www.python-httpx.org/) library).

If you want to keep using `HttpxHttpClient`, install Crawlee with the `httpx` extra, e.g. using pip:

```bash
pip install 'crawlee[httpx]'
```

And then provide the HTTP client explicitly to the crawler:

```python
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import HttpxHttpClient

client = HttpxHttpClient()
crawler = HttpCrawler(http_client=client)
```

See the [HTTP clients guide](https://crawlee.dev/python/docs/guides/http-clients) for all options.

## Changes in storages

In Crawlee v1.0, the `Dataset`, `KeyValueStore`, and `RequestQueue` storage APIs have been updated for consistency and simplicity. Below is a detailed overview of what's new, what's changed, and what's been removed.

See the [Storages guide](https://crawlee.dev/python/docs/guides/storages) for more details.

### Dataset

The `Dataset` API now includes several new methods, such as:

- `get_metadata` - retrieves metadata information for the dataset.
- `purge` - completely clears the dataset, including all items (keeps the metadata only).
- `list_items` - returns the dataset's items in a list format.

Some older methods have been removed or replaced:

- `from_storage_object` constructor has been removed. You should now use the `open` method with either a `name` or `id` parameter.
- `get_info` method and the `storage_object` property have been replaced by the new `get_metadata` method.
- `set_metadata` method has been removed.
- `write_to_json` and `write_to_csv` methods have been removed; instead, use the `export_to` method for exporting data in different formats.

### Key-value store

The `KeyValueStore` API now includes several new methods, such as:

- `get_metadata` - retrieves metadata information for the key-value store.
- `purge` - completely clears the key-value store, removing all keys and values (keeps the metadata only).
- `delete_value` - deletes a specific key and its associated value.
- `list_keys` - lists all keys in the key-value store.

Some older methods have been removed or replaced:

- `from_storage_object` - removed; use the `open` method with either a `name` or `id` instead.
- `get_info` and `storage_object` - replaced by the new `get_metadata` method.
- `set_metadata` method has been removed.

### Request queue

The `RequestQueue` API now includes several new methods, such as:

- `get_metadata` - retrieves metadata information for the request queue.
- `purge` - completely clears the request queue, including all pending and processed requests (keeps the metadata only).
- `add_requests` - replaces the previous `add_requests_batched` method, offering the same functionality under a simpler name.

Some older methods have been removed or replaced:

- `from_storage_object` - removed; use the `open` method with either a `name` or `id` instead.
- `get_info` and `storage_object` - replaced by the new `get_metadata` method.
- `get_request` has argument `unique_key` instead of `request_id` as the `id` field was removed from the `Request`.
- `set_metadata` method has been removed.

Some changes in the related model classes:

- `resource_directory` in `RequestQueueMetadata` - removed; use the corresponding `path_to_*` property instead.
- `stats` field in `RequestQueueMetadata` - removed as it was unused.
- `RequestQueueHead` - replaced by `RequestQueueHeadWithLocks`.

## New architecture of storage clients

In v1.0, the storage client system has been completely reworked to simplify implementation and make custom storage clients easier to write.

See the [Storage clients guide](https://crawlee.dev/python/docs/guides/storage-clients) for more details.

### New dedicated storage clients

Previously, `MemoryStorageClient` handled both in-memory storage and optional file system persistence. This has now been split into two distinct storage clients:

- **`MemoryStorageClient`** - Stores all data in memory only.
- **`FileSystemStorageClient`** - Persists data on the file system, with in-memory caching for better performance.

**Before (v0.6):**

```python
from crawlee.configuration import Configuration
from crawlee.storage_clients import MemoryStorageClient

# In-memory only
configuration = Configuration(persist_storage=False)
storage_client = MemoryStorageClient.from_config(configuration)

# File-system persistence
configuration = Configuration(persist_storage=True)
storage_client = MemoryStorageClient.from_config(configuration)
```

**Now (v1.0):**

```python
from crawlee.storage_clients import MemoryStorageClient, FileSystemStorageClient

# In-memory only
storage_client = MemoryStorageClient()

# File-system persistence
storage_client = FileSystemStorageClient()
```

### Registering a storage client

The way you register a storage client remains unchanged:

```python
from crawlee import service_locator
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset

# Create custom storage client
storage_client = MemoryStorageClient()

# Then register it globally
service_locator.set_storage_client(storage_client)

# Or use it for a single crawler only
crawler = ParselCrawler(storage_client=storage_client)

# Or use it for a single storage only
dataset = await Dataset.open(
    name='my-dataset',
    storage_client=storage_client,
)
```

### Instance caching

`Dataset.open`, `KeyValueStore.open`, and `RequestQueue.open` now cache instances and return the same instance for the same arguments. Direct calls to `StorageClient.open_*` always return new instances.

### Writing custom storage clients

The interface for custom storage clients has been simplified:

- One storage client per storage type (`RequestQueue`, `KeyValueStore`, `Dataset`).
- Collection storage clients have been removed.
- The number of methods that have to be implemented has been reduced.

## ServiceLocator changes

### ServiceLocator is stricter with registering services
You can register the services just once, and you can no longer override already registered services.

**Before (v0.6):**
```python
from crawlee import service_locator
from crawlee.storage_clients import MemoryStorageClient

service_locator.set_storage_client(MemoryStorageClient())
service_locator.set_storage_client(MemoryStorageClient())
```
**Now (v1.0):**

```python
from crawlee import service_locator
from crawlee.storage_clients import MemoryStorageClient

service_locator.set_storage_client(MemoryStorageClient())
service_locator.set_storage_client(MemoryStorageClient())  # Raises an error
```

### BasicCrawler has its own instance of ServiceLocator to track its own services
Services explicitly passed to the crawler can be different from the global ones accessible in `crawlee.service_locator`. `BasicCrawler` no longer causes the global services in `service_locator` to be set to the crawler's explicitly passed services.

**Before (v0.6):**
```python
from crawlee import service_locator
from crawlee.crawlers import BasicCrawler
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    custom_storage_client = MemoryStorageClient()
    crawler = BasicCrawler(storage_client=custom_storage_client)

    assert service_locator.get_storage_client() is custom_storage_client
    assert await crawler.get_dataset() is await Dataset.open()
```
**Now (v1.0):**

```python
from crawlee import service_locator
from crawlee.crawlers import BasicCrawler
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    custom_storage_client = MemoryStorageClient()
    crawler = BasicCrawler(storage_client=custom_storage_client)

    assert service_locator.get_storage_client() is not custom_storage_client
    assert await crawler.get_dataset() is not await Dataset.open()
```

This allows running two crawlers with different services at the same time.

**Now (v1.0):**

```python
from crawlee.crawlers import BasicCrawler
from crawlee.storage_clients import MemoryStorageClient, FileSystemStorageClient
from crawlee.configuration import Configuration
from crawlee.events import LocalEventManager

custom_configuration_1 = Configuration()
custom_event_manager_1 = LocalEventManager.from_config(custom_configuration_1)
custom_storage_client_1 = MemoryStorageClient()

custom_configuration_2 = Configuration()
custom_event_manager_2 = LocalEventManager.from_config(custom_configuration_2)
custom_storage_client_2 = FileSystemStorageClient()

crawler_1 = BasicCrawler(
    configuration=custom_configuration_1,
    event_manager=custom_event_manager_1,
    storage_client=custom_storage_client_1,
)

crawler_2 = BasicCrawler(
    configuration=custom_configuration_2,
    event_manager=custom_event_manager_2,
    storage_client=custom_storage_client_2,
  )

# use crawlers without runtime crash...
```

## Other smaller updates

This release also includes several other smaller updates.

### Python version support

We drop support for Python 3.9. The minimum supported version is now Python 3.10.

### Changes in Configuration

The fields `persist_storage` and `persist_metadata` have been removed from the `Configuration`. Persistence is now determined only by which storage client class you use.

### Changes in Request

`Request` objects no longer have an `id` field; all of its usages have been replaced by the `unique_key` field.

### Changes in HttpResponse

The method `HttpResponse.read` is now asynchronous. This affects all HTTP-based crawlers.

**Before (v0.6):**

```python
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

async def main() -> None:
    crawler = ParselCrawler()

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        # highlight-next-line
        content = context.http_response.read()
        # ...

    await crawler.run(['https://crawlee.dev/'])
```

**Now (v1.0):**

```python
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

async def main() -> None:
    crawler = ParselCrawler()

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        # highlight-next-line
        content = await context.http_response.read()
        # ...

    await crawler.run(['https://crawlee.dev/'])
```

### New storage naming restrictions

We've introduced naming restrictions for storages to ensure compatibility with Apify Platform requirements and prevent potential conflicts. Storage names may include only letters (a–z, A–Z), digits (0–9), and hyphens (-), with hyphens allowed only in the middle of the name (for example, my-storage-1).


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "crawlee"
version = "1.6.0"
description = "Crawlee for Python"
authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }]
license = { file = "LICENSE" }
readme = "README.md"
requires-python = ">=3.10"
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Environment :: Console",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
    "Topic :: Software Development :: Libraries",
]
keywords = [
    "apify",
    "automation",
    "chrome",
    "crawlee",
    "crawler",
    "headless",
    "scraper",
    "scraping",
]
dependencies = [
    "async-timeout>=5.0.1",
    "cachetools>=5.5.0",
    "colorama>=0.4.0",
    "impit>=0.8.0",
    "more-itertools>=10.2.0",
    "protego>=0.5.0",
    "psutil>=6.0.0",
    "pydantic-settings>=2.12.0",
    "pydantic>=2.11.0",
    "pyee>=9.0.0",
    "tldextract>=5.1.0",
    "typing-extensions>=4.1.0",
    "yarl>=1.18.0",
]

[project.optional-dependencies]
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,redis]"]
adaptive-crawler = [
    "jaro-winkler>=2.0.3",
    "playwright>=1.27.0",
    "scikit-learn>=1.6.0",
    "apify_fingerprint_datapoints>=0.0.3",
    "browserforge>=1.2.4"
]
beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
curl-impersonate = ["curl-cffi>=0.9.0"]
httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
parsel = ["parsel>=1.10.0"]
playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
otel = [
    "opentelemetry-api>=1.34.1",
    "opentelemetry-distro[otlp]>=0.54",
    "opentelemetry-instrumentation>=0.54",
    "opentelemetry-instrumentation-httpx>=0.54",
    "opentelemetry-sdk>=1.34.1",
    "opentelemetry-semantic-conventions>=0.54",
    "wrapt>=1.17.0",
]
sql_postgres = [
    "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
    "asyncpg>=0.24.0"
]
sql_sqlite = [
    "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
    "aiosqlite>=0.21.0",
]
sql_mysql = [
    "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
    "aiomysql>=0.3.2",
    "cryptography>=46.0.5",
]
redis = ["redis[hiredis] >= 7.0.0"]

[project.scripts]
crawlee = "crawlee._cli:cli"

[project.urls]
"Apify Homepage" = "https://apify.com"
"Changelog" = "https://crawlee.dev/python/docs/changelog"
"Discord" = "https://discord.com/invite/jyEM2PRvMU"
"Documentation" = "https://crawlee.dev/python/docs/quick-start"
"Homepage" = "https://crawlee.dev/python"
"Issue Tracker" = "https://github.com/apify/crawlee-python/issues"
"Release Notes" = "https://crawlee.dev/python/docs/upgrading"
"Source Code" = "https://github.com/apify/crawlee-python"

[dependency-groups]
dev = [
    # TODO: Remove this constraint once pydoc-markdown updates its dependencies.
    # Package pydoc-markdown is unmaintained and pins old docspec-python with vulnerable black.
    # See https://github.com/apify/apify-client-python/pull/582/ for more details.
    # We explicitly constrain black>=24.3.0 to override the transitive dependency.
    "black>=24.3.0",
    "anyio<5.0.0",
    "apify_client", # For e2e tests.
    "build<2.0.0", # For e2e tests.
    "dycw-pytest-only<3.0.0",
    "fakeredis[probabilistic,json,lua]<3.0.0",
    "poethepoet<1.0.0",
    "pre-commit<5.0.0",
    "proxy-py<3.0.0",
    "pydoc-markdown<5.0.0",
    "pytest-asyncio<2.0.0",
    "pytest-cov<8.0.0",
    "pytest-rerunfailures<17.0.0",
    "pytest-timeout<3.0.0",
    "pytest-xdist<4.0.0",
    "pytest<10.0.0",
    "ruff~=0.15.0",
    "setuptools", # setuptools are used by pytest, but not explicitly required
    "ty~=0.0.0",
    "types-beautifulsoup4<5.0.0",
    "types-cachetools<7.0.0",
    "types-colorama<1.0.0",
    "types-psutil<8.0.0",
    "types-python-dateutil<3.0.0",
    "uvicorn[standard]<1.0.0",
]

[tool.hatch.build.targets.wheel]
packages = ["src/crawlee"]

[tool.ruff]
line-length = 120
include = ["src/**/*.py", "tests/**/*.py", "docs/**/*.py", "website/**/*.py"]
extend-exclude = ["src/crawlee/project_template"]

[tool.ruff.lint]
select = ["ALL"]
ignore = [
    "ANN401",   # Dynamically typed expressions (typing.Any) are disallowed in {filename}
    "ASYNC109", # Async function definition with a `timeout` parameter
    "BLE001",   # Do not catch blind exception
    "C901",     # `{name}` is too complex
    "COM812",   # This rule may cause conflicts when used with the formatter
    "D100",     # Missing docstring in public module
    "D104",     # Missing docstring in public package
    "D107",     # Missing docstring in `__init__`
    "D203",     # One blank line required before class docstring
    "D213",     # Multi-line docstring summary should start at the second line
    "D413",     # Missing blank line after last section
    "EM",       # flake8-errmsg
    "G004",     # Logging statement uses f-string
    "ISC001",   # This rule may cause conflicts when used with the formatter
    "FIX",      # flake8-fixme
    "PLR0911",  # Too many return statements
    "PLR0912",  # Too many branches
    "PLR0913",  # Too many arguments in function definition
    "PLR0915",  # Too many statements
    "PYI034",   # `__aenter__` methods in classes like `{name}` usually return `self` at runtime
    "PYI036",   # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None`
    "S102",     # Use of `exec` detected
    "S105",     # Possible hardcoded password assigned to
    "S106",     # Possible hardcoded password assigned to argument: "{name}"
    "S301",     # `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue
    "S303",     # Use of insecure MD2, MD4, MD5, or SHA1 hash function
    "S311",     # Standard pseudo-random generators are not suitable for cryptographic purposes
    "TD002",    # Missing author in TODO; try: `# TODO(<author_name>): ...` or `# TODO @<author_name>: ...
    "TRY003",   # Avoid specifying long messages outside the exception class
]

[tool.ruff.format]
quote-style = "single"
indent-style = "space"

[tool.ruff.lint.per-file-ignores]
"**/__init__.py" = [
    "F401", # Unused imports
]
"**/{tests}/*" = [
    "ASYNC230", # Async functions should not open files with blocking methods like `open`
    "D",       # Everything from the pydocstyle
    "INP001",  # File {filename} is part of an implicit namespace package, add an __init__.py
    "PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable
    "S101",    # Use of assert detected
    "SLF001",  # Private member accessed: `{name}`
    "T20",     # flake8-print
    "TRY301",  # Abstract `raise` to an inner function
]
"**/{docs,website}/**" = [
    "D",      # Everything from the pydocstyle
    "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py
    "F841",   # Local variable {variable} is assigned to but never used
    "N999",   # Invalid module name
    "T201",   # `print` found
]
"**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [
    "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code.
]
"**/docs/guides/code_examples/running_in_web_server/server.py" = [
    "TC002", # ruff false positive. Import actually needed during runtime.
]
"**/docs/guides/code_examples/creating_web_archive/*.*" = [
    "ASYNC230", # Ignore for simplicity of the example.
]

[tool.ruff.lint.flake8-quotes]
docstring-quotes = "double"
inline-quotes = "single"

[tool.ruff.lint.flake8-type-checking]
runtime-evaluated-base-classes = [
    "pydantic.BaseModel",
    "pydantic_settings.BaseSettings",
]

[tool.ruff.lint.flake8-builtins]
builtins-ignorelist = ["id"]

[tool.ruff.lint.isort]
known-first-party = ["crawlee"]

[tool.pytest.ini_options]
addopts = "-r a --verbose"
asyncio_default_fixture_loop_scope = "function"
asyncio_mode = "auto"
timeout = 1800
markers = [
    "run_alone: marks tests that must run in isolation",
]
# Ignore DeprecationWarnings coming from Uvicorn's internal imports. Uvicorn relies on deprecated
# modules from `websockets`, which triggers warnings during tests. These are safe to ignore until
# Uvicorn updates its internals.
filterwarnings = [
    "ignore:websockets.legacy is deprecated:DeprecationWarning",
    "ignore:websockets.server.WebSocketServerProtocol is deprecated:DeprecationWarning",
]

[tool.ty.environment]
python-version = "3.10"

[tool.ty.src]
include = ["src", "tests", "scripts", "docs", "website"]
exclude = [
    "src/crawlee/project_template",
    "docs/guides/code_examples/storage_clients/custom_storage_client_example.py",
]

[[tool.ty.overrides]]
include = [
    "docs/**/*.py",
    "website/**/*.py",
]

[tool.ty.overrides.rules]
unresolved-import = "ignore"

[tool.coverage.report]
exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:", "assert_never()"]

[tool.ipdb]
context = 7

# Run tasks with: uv run poe <task>
[tool.poe.tasks]
clean = "rm -rf .coverage .pytest_cache .ruff_cache .ty_cache .uv-cache build coverage-unit.xml dist htmlcov website/.docusaurus website/.yarn website/module_shortcuts.json website/node_modules "
install-sync = "uv sync --all-extras"
build = "uv build --verbose"
publish-to-pypi = "uv publish --verbose --token ${APIFY_PYPI_TOKEN_CRAWLEE}"
type-check = "uv run ty check"
check-code = ["lint", "type-check", "unit-tests"]

[tool.poe.tasks.install-dev]
shell = "uv sync --all-extras && uv run pre-commit install && uv run playwright install"

[tool.poe.tasks.lint]
shell = "uv run ruff format --check && uv run ruff check"

[tool.poe.tasks.format]
shell = "uv run ruff check --fix && uv run ruff format"

[tool.poe.tasks.unit-tests]
shell = """
uv run pytest \
    --numprocesses=1 \
    -m "run_alone" \
    tests/unit && \
uv run pytest \
    --numprocesses=${TESTS_CONCURRENCY:-auto} \
    -m "not run_alone" \
    tests/unit
"""

[tool.poe.tasks.unit-tests-cov]
shell = """
uv run pytest \
    --numprocesses=1 \
    -m "run_alone" \
    --cov=src/crawlee \
    --cov-report=xml:coverage-unit.xml \
    tests/unit && \
uv run pytest \
    --numprocesses=${TESTS_CONCURRENCY:-auto} \
    -m "not run_alone" \
    --cov=src/crawlee \
    --cov-report=xml:coverage-unit.xml \
    --cov-append \
    tests/unit
"""

[tool.poe.tasks.e2e-templates-tests]
cmd = """
uv run pytest \
    --numprocesses=${TESTS_CONCURRENCY:-auto} \
    tests/e2e/project_template
"""

[tool.poe.tasks.build-docs]
shell = "./build_api_reference.sh && corepack enable && yarn && yarn build"
cwd = "website"

[tool.poe.tasks.run-docs]
shell = "./build_api_reference.sh && corepack enable && yarn && yarn start"
cwd = "website"


================================================
FILE: renovate.json
================================================
{
    "extends": ["config:base", ":semanticCommitTypeAll(chore)"],
    "ignorePaths": ["docs/**", "src/crawlee/project_template/**"],
    "pinVersions": false,
    "separateMajorMinor": false,
    "dependencyDashboard": false,
    "semanticCommits": "enabled",
    "lockFileMaintenance": {
        "enabled": true,
        "automerge": true,
        "automergeType": "branch"
    },
    "packageRules": [
        {
            "matchPaths": ["pyproject.toml"],
            "matchDepTypes": ["devDependencies"],
            "matchUpdateTypes": ["major", "minor"],
            "groupName": "major/minor dev dependencies",
            "groupSlug": "dev-dependencies",
            "automerge": true,
            "automergeType": "branch"
        }
    ],
    "schedule": ["before 7am every weekday"],
    "ignoreDeps": ["crawlee", "docusaurus-plugin-typedoc-api"]
}


================================================
FILE: src/crawlee/__init__.py
================================================
from importlib import metadata

from ._request import Request, RequestOptions, RequestState
from ._service_locator import service_locator
from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
from ._utils.globs import Glob

# Version of the installed `crawlee` distribution, read from its package metadata.
__version__ = metadata.version('crawlee')

# Names exported by `from crawlee import *`.
__all__ = [
    'ConcurrencySettings',
    'EnqueueStrategy',
    'Glob',
    'HttpHeaders',
    'Request',
    'RequestOptions',
    'RequestState',
    'RequestTransformAction',
    'SkippedReason',
    'service_locator',
]


================================================
FILE: src/crawlee/_autoscaling/__init__.py
================================================
from .autoscaled_pool import AutoscaledPool
from .snapshotter import Snapshotter
from .system_status import SystemStatus

# Names exported by `from crawlee._autoscaling import *`.
__all__ = ['AutoscaledPool', 'Snapshotter', 'SystemStatus']


================================================
FILE: src/crawlee/_autoscaling/_types.py
================================================
from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Annotated

from pydantic import Field
from pydantic.dataclasses import dataclass as pydantic_dataclass

if TYPE_CHECKING:
    from crawlee._utils.byte_size import ByteSize

# System-wide memory utilization (used / total) above which `MemorySnapshot.is_overloaded` reports an overload,
# regardless of the memory usage of the current process.
SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD = 0.97


@dataclass
class LoadRatioInfo:
    """Load ratio of a single resource together with the limit it is compared against."""

    limit_ratio: float
    """Highest tolerated ratio of overloaded and non-overloaded samples. Once the actual ratio goes above
    this value, the resource counts as overloaded."""

    actual_ratio: float
    """Measured ratio of overloaded and non-overloaded samples."""

    @property
    def is_overloaded(self) -> bool:
        """Tell whether the measured ratio is strictly above the allowed limit."""
        return self.limit_ratio < self.actual_ratio


@dataclass
class SystemInfo:
    """Aggregated load status of the system across all tracked resources."""

    cpu_info: LoadRatioInfo
    """Load ratio of the CPU."""

    memory_info: LoadRatioInfo
    """Load ratio of the memory."""

    event_loop_info: LoadRatioInfo
    """Load ratio of the event loop."""

    client_info: LoadRatioInfo
    """Load ratio of the client."""

    created_at: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc))
    """Timestamp (UTC) at which the system load information was measured."""

    @property
    def is_system_idle(self) -> bool:
        """Tell whether none of the tracked resources is overloaded."""
        # Checked in the same order as the fields are declared; stops at the first overloaded resource.
        tracked = (self.cpu_info, self.memory_info, self.event_loop_info, self.client_info)
        return not any(resource.is_overloaded for resource in tracked)

    def __str__(self) -> str:
        """Render the actual ratio of every resource as `name = ratio` pairs separated by `; `."""
        labelled = (
            ('cpu', self.cpu_info),
            ('mem', self.memory_info),
            ('event_loop', self.event_loop_info),
            ('client_info', self.client_info),
        )
        return '; '.join(f'{label} = {info.actual_ratio}' for label, info in labelled)


@dataclass
class CpuSnapshot:
    """CPU usage captured at a single point in time."""

    used_ratio: float
    """Ratio of the CPU that is in use."""

    max_used_ratio: float
    """Highest CPU usage ratio that is still considered acceptable."""

    created_at: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc))
    """Timestamp (UTC) at which the system load information was measured."""

    @property
    def is_overloaded(self) -> bool:
        """Tell whether the CPU usage is strictly above the acceptable maximum."""
        return self.max_used_ratio < self.used_ratio


@dataclass
class MemorySnapshot:
    """Memory usage captured at a single point in time."""

    current_size: ByteSize
    """Memory used by the current Python process and its children."""

    system_wide_used_size: ByteSize | None
    """Memory used by all processes, system-wide."""

    max_memory_size: ByteSize
    """Upper bound of memory that `AutoscaledPool` may use."""

    system_wide_memory_size: ByteSize | None
    """Total amount of memory available in the whole system."""

    max_used_memory_ratio: float
    """Highest acceptable ratio of `current_size` to `max_memory_size`."""

    created_at: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc))
    """Timestamp (UTC) at which the system load information was measured."""

    @property
    def is_overloaded(self) -> bool:
        """Tell whether the memory is considered as overloaded.

        The system-wide utilization is checked first (only when both system-wide values are known); otherwise
        the verdict is based on the process usage relative to `max_memory_size`.
        """
        total = self.system_wide_memory_size
        used = self.system_wide_used_size

        if total is not None and used is not None and used / total > SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD:
            return True

        process_utilization = self.current_size / self.max_memory_size
        return process_utilization > self.max_used_memory_ratio


@dataclass
class EventLoopSnapshot:
    """Event loop state captured at a single point in time."""

    delay: timedelta
    """Current delay of the event loop."""

    max_delay: timedelta
    """Longest delay that is still considered acceptable."""

    created_at: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc))
    """Timestamp (UTC) at which the system load information was measured."""

    @property
    def max_delay_exceeded(self) -> timedelta:
        """Amount of time by which the delay exceeds the maximum delay (never negative)."""
        overshoot = self.delay - self.max_delay
        return max(overshoot, timedelta())

    @property
    def is_overloaded(self) -> bool:
        """Tell whether the delay is strictly above the acceptable maximum."""
        return self.max_delay < self.delay


@dataclass
class ClientSnapshot:
    """Client state captured at a single point in time."""

    error_count: int
    """Number of errors (HTTP 429) that occurred."""

    new_error_count: int
    """Number of new errors (HTTP 429) that occurred since the last snapshot."""

    max_error_count: int
    """Highest number of errors that is still considered acceptable."""

    created_at: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc))
    """Timestamp (UTC) at which the system load information was measured."""

    @property
    def is_overloaded(self) -> bool:
        """Tell whether the number of new errors is strictly above the acceptable maximum."""
        return self.max_error_count < self.new_error_count


# Union of the snapshot dataclasses defined in this module.
Snapshot = MemorySnapshot | CpuSnapshot | EventLoopSnapshot | ClientSnapshot


@pydantic_dataclass
class Ratio:
    """Represents ratio of memory.

    The wrapped value is constrained to the interval (0.0, 1.0].
    """

    # Constrained float: strictly greater than 0.0 and at most 1.0.
    value: Annotated[float, Field(gt=0.0, le=1.0)]


================================================
FILE: src/crawlee/_autoscaling/autoscaled_pool.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/autoscaling/autoscaled_pool.ts

from __future__ import annotations

import asyncio
import math
from contextlib import suppress
from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING

from crawlee._types import ConcurrencySettings
from crawlee._utils.docs import docs_group
from crawlee._utils.recurring_task import RecurringTask

if TYPE_CHECKING:
    from collections.abc import Awaitable, Callable

    from crawlee._autoscaling import SystemStatus

logger = getLogger(__name__)


class AbortError(Exception):
    """Raised when an `AutoscaledPool` run is aborted.

    Not intended for direct use.
    """


class _AutoscaledPoolRun:
    def __init__(self) -> None:
        self.worker_tasks = list[asyncio.Task]()
        """A list of worker tasks currently in progress"""

        self.worker_tasks_updated = asyncio.Event()
        self.cleanup_done = asyncio.Event()
        self.result: asyncio.Future = asyncio.Future()


@docs_group('Autoscaling')
class AutoscaledPool:
    """Manages a pool of asynchronous resource-intensive tasks that are executed in parallel.

    The pool only starts new tasks if there is enough free CPU and memory available. If an exception is thrown in
    any of the tasks, it is propagated and the pool is stopped.
    """

    _AUTOSCALE_INTERVAL = timedelta(seconds=10)
    """Interval at which the autoscaled pool adjusts the desired concurrency based on the latest system status."""

    _LOGGING_INTERVAL = timedelta(minutes=1)
    """Interval at which the autoscaled pool logs its current state."""

    _DESIRED_CONCURRENCY_RATIO = 0.9
    """Minimum ratio of desired concurrency that must be reached before allowing further scale-up."""

    _SCALE_UP_STEP_RATIO = 0.05
    """Fraction of desired concurrency to add during each scale-up operation."""

    _SCALE_DOWN_STEP_RATIO = 0.05
    """Fraction of desired concurrency to remove during each scale-down operation."""

    _TASK_TIMEOUT: timedelta | None = None
    """Timeout within which the `run_task_function` must complete."""

    def __init__(
        self,
        *,
        system_status: SystemStatus,
        concurrency_settings: ConcurrencySettings | None = None,
        run_task_function: Callable[[], Awaitable],
        is_task_ready_function: Callable[[], Awaitable[bool]],
        is_finished_function: Callable[[], Awaitable[bool]],
    ) -> None:
        """Initialize a new instance.

        Args:
            system_status: Source of information about the system utilization (load).
            concurrency_settings: Minimum, maximum and desired concurrency together with the task rate limit.
                A default `ConcurrencySettings` instance is used when omitted.
            run_task_function: Coroutine function that performs one asynchronous resource-intensive task.
            is_task_ready_function: Coroutine function consulted whenever there is free capacity for another
                task. A new task is started only if it resolves to `True`, which also makes it usable for
                throttling.
            is_finished_function: Coroutine function that decides whether the run is over. The pool's run
                finishes once it resolves to `True`. To stop a run prematurely, use the `abort` method.
        """
        settings = concurrency_settings or ConcurrencySettings()

        # Run state - nothing is running and nothing is paused right after construction.
        self._current_run: _AutoscaledPoolRun | None = None
        self._is_paused = False

        # Collaborators and callbacks supplied by the caller.
        self._system_status = system_status
        self._run_task_function = run_task_function
        self._is_task_ready_function = is_task_ready_function
        self._is_finished_function = is_finished_function

        # Concurrency limits; the desired concurrency is later adjusted by `_autoscale`.
        self._min_concurrency = settings.min_concurrency
        self._max_concurrency = settings.max_concurrency
        self._desired_concurrency = settings.desired_concurrency
        self._max_tasks_per_minute = settings.max_tasks_per_minute

        # Periodic background jobs; they are started and stopped by `run`.
        self._autoscale_task = RecurringTask(self._autoscale, self._AUTOSCALE_INTERVAL)
        self._log_system_status_task = RecurringTask(self._log_system_status, self._LOGGING_INTERVAL)

    async def run(self) -> None:
        """Start the autoscaled pool and return when all tasks are completed and `is_finished_function` returns True.

        If there is an exception in one of the tasks, it will be re-raised. An abort requested through `abort`
        is handled internally: the orchestrator and the unfinished worker tasks are cancelled and the method
        returns without raising.

        Raises:
            RuntimeError: If the pool is already running.
        """
        if self._current_run is not None:
            raise RuntimeError('The pool is already running')

        run = _AutoscaledPoolRun()
        self._current_run = run

        logger.debug('Starting the pool')

        self._autoscale_task.start()
        self._log_system_status_task.start()

        orchestrator = asyncio.create_task(
            self._worker_task_orchestrator(run), name='autoscaled pool worker task orchestrator'
        )

        try:
            # Settled by the orchestrator (normal finish), by `_reap_worker_task` (worker failure)
            # or by `abort` (with `AbortError`).
            await run.result
        except AbortError:
            orchestrator.cancel()
            for task in run.worker_tasks:
                if not task.done():
                    task.cancel()
        finally:
            with suppress(asyncio.CancelledError):
                await self._autoscale_task.stop()
            with suppress(asyncio.CancelledError):
                await self._log_system_status_task.stop()

            if not orchestrator.done():
                orchestrator.cancel()
            elif not orchestrator.cancelled() and orchestrator.exception() is not None:
                logger.error('Exception in worker task orchestrator', exc_info=orchestrator.exception())

            logger.info('Waiting for remaining tasks to finish')

            # Iterate over a snapshot of the list. `_reap_worker_task` removes finished tasks from
            # `run.worker_tasks` while this coroutine is suspended in `await task`; iterating the live list
            # would then skip some of the remaining tasks and report the cleanup as done too early.
            for task in list(run.worker_tasks):
                if not task.done():
                    with suppress(BaseException):
                        await task

            run.cleanup_done.set()
            self._current_run = None

            logger.debug('Pool cleanup finished')

    async def abort(self) -> None:
        """Interrupt the autoscaled pool and all the tasks in progress."""
        if not self._current_run:
            raise RuntimeError('The pool is not running')

        self._current_run.result.set_exception(AbortError())
        await self._current_run.cleanup_done.wait()

    def pause(self) -> None:
        """Pause the autoscaled pool so that it does not start new tasks.

        Only a flag is set here; the worker task orchestrator checks it before scheduling another task.
        """
        self._is_paused = True

    def resume(self) -> None:
        """Resume a paused autoscaled pool so that it continues starting new tasks.

        Clears the flag set by `pause`.
        """
        self._is_paused = False

    @property
    def desired_concurrency(self) -> int:
        """The current desired concurrency, possibly updated by the pool according to system load.

        It starts at the value taken from the concurrency settings and is changed by `_autoscale`.
        """
        return self._desired_concurrency

    @property
    def current_concurrency(self) -> int:
        """The number of concurrent tasks in progress."""
        if self._current_run is None:
            return 0

        return len(self._current_run.worker_tasks)

    def _autoscale(self) -> None:
        """Inspect system load status and adjust desired concurrency if necessary. Do not call directly."""
        status = self._system_status.get_historical_system_info()

        min_current_concurrency = math.floor(self._DESIRED_CONCURRENCY_RATIO * self.desired_concurrency)
        should_scale_up = (
            status.is_system_idle
            and self._desired_concurrency < self._max_concurrency
            and self.current_concurrency >= min_current_concurrency
        )

        should_scale_down = not status.is_system_idle and self._desired_concurrency > self._min_concurrency

        if should_scale_up:
            step = math.ceil(self._SCALE_UP_STEP_RATIO * self._desired_concurrency)
            self._desired_concurrency = min(self._max_concurrency, self._desired_concurrency + step)
        elif should_scale_down:
            step = math.ceil(self._SCALE_DOWN_STEP_RATIO * self._desired_concurrency)
            self._desired_concurrency = max(self._min_concurrency, self._desired_concurrency - step)

    def _log_system_status(self) -> None:
        """Log the current and desired concurrency together with the historical system info."""
        historical_info = self._system_status.get_historical_system_info()
        message = (
            f'current_concurrency = {self.current_concurrency}; '
            f'desired_concurrency = {self.desired_concurrency}; '
            f'{historical_info!s}'
        )
        logger.info(message)

    async def _worker_task_orchestrator(self, run: _AutoscaledPoolRun) -> None:
        """Launch worker tasks whenever there is free capacity and a task is ready.

        The loop ends when `is_finished_function` returns True or when the result of the run is settled
        (a worker task failed or the run was aborted). On the way out it waits for the worker tasks that
        are still in progress and, if nobody else did so, settles the result of the run.

        Args:
            run: State of the run this orchestrator belongs to.
        """
        finished = False

        try:
            while not (finished := await self._is_finished_function()) and not run.result.done():
                run.worker_tasks_updated.clear()

                current_status = self._system_status.get_current_system_info()
                if not current_status.is_system_idle:
                    logger.debug('Not scheduling new tasks - system is overloaded')
                elif self._is_paused:
                    logger.debug('Not scheduling new tasks - the autoscaled pool is paused')
                elif self.current_concurrency >= self.desired_concurrency:
                    logger.debug('Not scheduling new tasks - already running at desired concurrency')
                elif not await self._is_task_ready_function():
                    logger.debug('Not scheduling new task - no task is ready')
                else:
                    logger.debug('Scheduling a new task')
                    worker_task = asyncio.create_task(self._worker_task(), name='autoscaled pool worker task')
                    worker_task.add_done_callback(lambda task: self._reap_worker_task(task, run))
                    run.worker_tasks.append(worker_task)

                    # Spread task starts evenly to honor the configured rate limit.
                    if math.isfinite(self._max_tasks_per_minute):
                        await asyncio.sleep(60 / self._max_tasks_per_minute)

                    continue

                # Nothing was scheduled - wait until a worker task finishes, but re-check at least every 0.5 s.
                with suppress(asyncio.TimeoutError):
                    await asyncio.wait_for(run.worker_tasks_updated.wait(), timeout=0.5)
        finally:
            if finished:
                logger.debug('`is_finished_function` reports that we are finished')
            # `Future.exception()` raises `CancelledError` on a cancelled future, which would abort this
            # cleanup before the worker tasks are awaited - hence the explicit `cancelled()` check.
            elif run.result.done() and not run.result.cancelled() and run.result.exception() is not None:
                logger.debug('Unhandled exception in `run_task_function`')

            if run.worker_tasks:
                logger.debug('Terminating - waiting for tasks to complete')
                await asyncio.wait(run.worker_tasks, return_when=asyncio.ALL_COMPLETED)
                logger.debug('Worker tasks finished')
            else:
                logger.debug('Terminating - no running tasks to wait for')

            # Unblock `run`, which is awaiting the result; the value itself carries no meaning.
            if not run.result.done():
                run.result.set_result(object())

    def _reap_worker_task(self, task: asyncio.Task, run: _AutoscaledPoolRun) -> None:
        """Handle cleanup and tracking of a completed worker task.

        - Interrupt the run if the task encountered an exception.
        - Update the list of tasks in progress.
        - Notify the orchestrator about the task completion.
        """
        run.worker_tasks_updated.set()
        run.worker_tasks.remove(task)

        if not task.cancelled() and (exception := task.exception()) and not run.result.done():
            run.result.set_exception(exception)

    async def _worker_task(self) -> None:
        """Run `run_task_function` once, bounded by `_TASK_TIMEOUT` when it is set.

        A timeout is only logged as a warning; any other exception propagates to the caller.
        """
        try:
            time_limit = self._TASK_TIMEOUT
            pending_work = self._run_task_function()
            await asyncio.wait_for(
                pending_work,
                timeout=None if time_limit is None else time_limit.total_seconds(),
            )
        except asyncio.TimeoutError:
            if self._TASK_TIMEOUT is None:
                timeout_str = '*not set*'
            else:
                timeout_str = self._TASK_TIMEOUT.total_seconds()
            logger.warning(f'Task timed out after {timeout_str} seconds')
        finally:
            logger.debug('Worker task finished')


================================================
FILE: src/crawlee/_autoscaling/py.typed
================================================


================================================
FILE: src/crawlee/_autoscaling/snapshotter.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/autoscaling/snapshotter.ts

from __future__ import annotations

import functools
from bisect import insort
from datetime import datetime, timedelta, timezone
from logging import getLogger
from typing import TYPE_CHECKING, TypeVar, cast

from crawlee import service_locator
from crawlee._autoscaling._types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Ratio, Snapshot
from crawlee._utils.byte_size import ByteSize
from crawlee._utils.context import ensure_context
from crawlee._utils.docs import docs_group
from crawlee._utils.recurring_task import RecurringTask
from crawlee._utils.system import MemoryInfo, MemoryUsageInfo, get_memory_info
from crawlee.events._types import Event, EventSystemInfoData

if TYPE_CHECKING:
    from types import TracebackType

    from crawlee.configuration import Configuration

logger = getLogger(__name__)

T = TypeVar('T', bound=Snapshot)


@functools.lru_cache
def _warn_once(warning_message: str) -> None:
    """Log a warning message only once.

    The deduplication comes from `functools.lru_cache`: a repeated call with the same message is answered
    from the cache and the body is not executed again (as long as the entry stays in the cache).

    Args:
        warning_message: The text to log; it also serves as the cache key.
    """
    logger.warning(warning_message)


class SortedSnapshotList(list[T]):
    """A `list` of snapshots that stays ordered by `created_at` when items are inserted through `add`."""

    def add(self, item: T) -> None:
        """Insert `item` at the position, found by bisection, that preserves the ordering by `created_at`."""

        def creation_time(snapshot: T) -> datetime:
            return snapshot.created_at

        insort(self, item, key=creation_time)


@docs_group('Autoscaling')
class Snapshotter:
    """Monitors and logs system resource usage at predefined intervals for performance optimization.

    The class monitors and records the state of various system resources (CPU, memory, event loop, and client API)
    at predefined intervals. This continuous monitoring helps in identifying resource overloads and ensuring optimal
    performance of the application. It is utilized in the `AutoscaledPool` module to adjust task allocation
    dynamically based on the current demand and system load.
    """

    _EVENT_LOOP_SNAPSHOT_INTERVAL = timedelta(milliseconds=500)
    """The interval at which the event loop is sampled."""

    _CLIENT_SNAPSHOT_INTERVAL = timedelta(milliseconds=1000)
    """The interval at which the client is sampled."""

    _SNAPSHOT_HISTORY = timedelta(seconds=30)
    """The time interval for which the snapshots are kept."""

    _RESERVE_MEMORY_RATIO = 0.5
    """Fraction of memory kept in reserve. Used to calculate critical memory overload threshold."""

    _MEMORY_WARNING_COOLDOWN_PERIOD = timedelta(milliseconds=10000)
    """Minimum time interval between logging successive critical memory overload warnings."""

    _CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT = 2
    """Number of retries for a client request before considering it a failure due to rate limiting."""

    def __init__(
        self,
        *,
        max_used_cpu_ratio: float,
        max_used_memory_ratio: float,
        max_event_loop_delay: timedelta,
        max_client_errors: int,
        max_memory_size: ByteSize | Ratio,
    ) -> None:
        """Initialize a new instance.

        Prefer the `from_config` constructor, which derives all arguments from a `Configuration`.

        Args:
            max_used_cpu_ratio: The highest acceptable CPU usage ratio. The CPU is considered overloaded when
                its usage exceeds it.
            max_used_memory_ratio: The highest acceptable share of `max_memory_size` in use. The memory is
                considered overloaded when the usage exceeds it.
            max_event_loop_delay: The highest acceptable event loop delay. The event loop is considered
                overloaded when the delay exceeds it.
            max_client_errors: The highest acceptable number of client errors (HTTP 429). The client is
                considered overloaded when the number of errors exceeds it.
            max_memory_size: The amount of system memory available to the `AutoscaledPool`. A `ByteSize` is
                used as a fixed limit; a `Ratio` makes the limit a fraction of the total system memory.
        """
        # Flag to indicate the context state.
        self._active = False

        # Overload thresholds.
        self._max_memory_size = max_memory_size
        self._max_used_memory_ratio = max_used_memory_ratio
        self._max_used_cpu_ratio = max_used_cpu_ratio
        self._max_event_loop_delay = max_event_loop_delay
        self._max_client_errors = max_client_errors

        # Start one hour in the past so that the very first critical memory warning is never suppressed.
        self._timestamp_of_last_memory_warning: datetime = datetime.now(timezone.utc) - timedelta(hours=1)

        # Snapshot histories, each kept ordered by `created_at`.
        self._cpu_snapshots = self._get_sorted_list_by_created_at(list[CpuSnapshot]())
        self._event_loop_snapshots = self._get_sorted_list_by_created_at(list[EventLoopSnapshot]())
        self._memory_snapshots = self._get_sorted_list_by_created_at(list[MemorySnapshot]())
        self._client_snapshots = self._get_sorted_list_by_created_at(list[ClientSnapshot]())

        # Event loop and client snapshots are taken periodically; CPU and memory ones arrive through events.
        self._snapshot_event_loop_task = RecurringTask(self._snapshot_event_loop, self._EVENT_LOOP_SNAPSHOT_INTERVAL)
        self._snapshot_client_task = RecurringTask(self._snapshot_client, self._CLIENT_SNAPSHOT_INTERVAL)

    @classmethod
    def from_config(cls, config: Configuration | None = None) -> Snapshotter:
        """Build a `Snapshotter` whose thresholds come from a `Configuration`.

        Args:
            config: The `Configuration` instance. Uses the global (default) one if not provided.
        """
        resolved_config = config or service_locator.get_configuration()

        # An explicitly configured `memory_mbytes` acts as a fixed limit. Without it the limit is expressed
        # as `available_memory_ratio`, i.e. a fraction of the total system memory.
        max_memory_size: ByteSize | Ratio
        if resolved_config.memory_mbytes:
            max_memory_size = ByteSize.from_mb(resolved_config.memory_mbytes)
        else:
            max_memory_size = Ratio(value=resolved_config.available_memory_ratio)

        return cls(
            max_used_cpu_ratio=resolved_config.max_used_cpu_ratio,
            max_used_memory_ratio=resolved_config.max_used_memory_ratio,
            max_event_loop_delay=resolved_config.max_event_loop_delay,
            max_client_errors=resolved_config.max_client_errors,
            max_memory_size=max_memory_size,
        )

    @staticmethod
    def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedSnapshotList[T]:
        """Create a sorted list from the input list.

        The items are ordered by `created_at` before they are stored, because `SortedSnapshotList.add`
        locates the insertion point by bisection and therefore needs the existing content to be sorted.
        The sort is stable, so an input that is already ordered keeps its order.

        Args:
            input_list: Snapshots to start with, in any order. The list itself is not modified.

        Returns:
            A custom list that maintains sorted order by `created_at` when items are added.
        """
        result = SortedSnapshotList[T]()
        result.extend(sorted(input_list, key=lambda snapshot: snapshot.created_at))
        return result

    @property
    def active(self) -> bool:
        """Indicate whether the context is active.

        The flag is switched on in `__aenter__` and off at the end of `__aexit__`.
        """
        return self._active

    async def __aenter__(self) -> Snapshotter:
        """Begin collecting snapshots.

        The CPU and memory listeners are subscribed to `SYSTEM_INFO` events and the recurring event loop
        and client snapshot tasks are started.

        Raises:
            RuntimeError: If the context manager is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        self._active = True

        event_manager = service_locator.get_event_manager()
        for listener in (self._snapshot_cpu, self._snapshot_memory):
            event_manager.on(event=Event.SYSTEM_INFO, listener=listener)

        for recurring_task in (self._snapshot_event_loop_task, self._snapshot_client_task):
            recurring_task.start()

        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Stop collecting snapshots.

        The CPU and memory listeners are unsubscribed from `SYSTEM_INFO` events, the recurring event loop
        and client snapshot tasks are stopped and the context is marked as inactive.

        Raises:
            RuntimeError: If the context manager is not active.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        event_manager = service_locator.get_event_manager()
        for listener in (self._snapshot_cpu, self._snapshot_memory):
            event_manager.off(event=Event.SYSTEM_INFO, listener=listener)

        for recurring_task in (self._snapshot_event_loop_task, self._snapshot_client_task):
            await recurring_task.stop()

        self._active = False

    @ensure_context
    def get_memory_sample(self, duration: timedelta | None = None) -> list[Snapshot]:
        """Provide the most recent memory snapshots, optionally limited to a trailing time window.

        Args:
            duration: Length of the window, measured back from the latest snapshot. The full history is
                returned when it is omitted.

        Returns:
            The selected memory snapshots.
        """
        return self._get_sample(cast('list[Snapshot]', self._memory_snapshots), duration)

    @ensure_context
    def get_event_loop_sample(self, duration: timedelta | None = None) -> list[Snapshot]:
        """Provide the most recent event loop snapshots, optionally limited to a trailing time window.

        Args:
            duration: Length of the window, measured back from the latest snapshot. The full history is
                returned when it is omitted.

        Returns:
            The selected event loop snapshots.
        """
        return self._get_sample(cast('list[Snapshot]', self._event_loop_snapshots), duration)

    @ensure_context
    def get_cpu_sample(self, duration: timedelta | None = None) -> list[Snapshot]:
        """Provide the most recent CPU snapshots, optionally limited to a trailing time window.

        Args:
            duration: Length of the window, measured back from the latest snapshot. The full history is
                returned when it is omitted.

        Returns:
            The selected CPU snapshots.
        """
        return self._get_sample(cast('list[Snapshot]', self._cpu_snapshots), duration)

    @ensure_context
    def get_client_sample(self, duration: timedelta | None = None) -> list[Snapshot]:
        """Provide the most recent client snapshots, optionally limited to a trailing time window.

        Args:
            duration: Length of the window, measured back from the latest snapshot. The full history is
                returned when it is omitted.

        Returns:
            The selected client snapshots.
        """
        return self._get_sample(cast('list[Snapshot]', self._client_snapshots), duration)

    @staticmethod
    def _get_sample(snapshots: list[Snapshot], duration: timedelta | None = None) -> list[Snapshot]:
        """Return a time-limited sample from snapshots or full history if duration is None."""
        if not duration:
            return snapshots

        if not snapshots:
            return []

        latest_time = snapshots[-1].created_at
        return [snapshot for snapshot in snapshots if latest_time - snapshot.created_at <= duration]

    async def _snapshot_cpu(self, event_data: EventSystemInfoData) -> None:
        """Record a CPU snapshot from the data delivered with a system info event.

        Nothing is measured here; the usage is only read from `event_data`, which is expected to be supplied
        by the event manager. The method must be `async` so that the event manager does not run it in
        a separate thread, which could cause race conditions while the snapshots are sorted and pruned.

        Args:
            event_data: System info data from which CPU usage is read.
        """
        cpu_info = event_data.cpu_info
        self._cpu_snapshots.add(
            CpuSnapshot(
                used_ratio=cpu_info.used_ratio,
                max_used_ratio=self._max_used_cpu_ratio,
                created_at=cpu_info.created_at,
            )
        )

        # Prune relative to the newest stored snapshot, which need not be the one just added.
        newest_created_at = self._cpu_snapshots[-1].created_at
        self._prune_snapshots(cast('list[Snapshot]', self._cpu_snapshots), newest_created_at)

    async def _snapshot_memory(self, event_data: EventSystemInfoData) -> None:
        """Capture a snapshot of the current memory usage.

        This method does not perform memory usage measurement. Instead, it just reads the data received through
        the `event_data` parameter, which is expected to be supplied by the event manager.
        Must be `async` to ensure it is not scheduled to be run in own thread by the event manager, which could cause
        race conditions in snapshots manipulation(sorting and pruning).

        Args:
            event_data: System info data from which memory usage is read.
        """
        match event_data.memory_info, self._max_memory_size:
            case MemoryInfo() as memory_info, Ratio() as ratio:
                max_memory_size = memory_info.total_size * ratio.value
                system_wide_used_size = memory_info.system_wide_used_size
                system_wide_memory_size = memory_info.total_size

            case MemoryUsageInfo(), Ratio() as ratio:
                # This is just hypothetical case, that will most likely not happen in practice.
                # `LocalEventManager` should always provide `MemoryInfo` in the event data.
                # When running on Apify, `self._max_memory_size` is always `ByteSize`, not `Ratio`.
                _warn_once(
                    'It is recommended that a custom implementation of `LocalEventManager` emits `SYSTEM_INFO` events '
                    'with `MemoryInfo` and not just `MemoryUsageInfo`.'
                )
                max_memory_size = get_memory_info().total_size * ratio.value
                system_wide_used_size = None
                system_wide_memory_size = None

            case MemoryInfo() as memory_info, ByteSize() as byte_size:
                max_memory_size = byte_size
                system_wide_used_size = memory_info.system_wide_used_size
                system_wide_memory_size = memory_info.total_size

            case MemoryUsageInfo(), ByteSize() as byte_size:
                max_memory_size = byte_size
                system_wide_used_size = None
                system_wide_memory_size = None

            case _, _:
                raise NotImplementedError('Unsupported combination of memory info and max memory size types.')

        snapshot = MemorySnapshot(
            current_size=event_data.memory_info.current_size,
            max_memory_size=max_memory_size,
            max_used_memory_ratio=self._max_used_memory_ratio,
            created_at=event_data.memory_info.created_at,
            system_wide_used_size=system_wide_used_size,
            system_wide_memory_size=system_wide_memory_size,
        )

        snapshots = cast('list[Snapshot]', self._memory_snapshots)
        self._memory_snapshots.add(snapshot)
        self._prune_snapshots(snapshots, self._memory_snapshots[-1].created_at)

        self._evaluate_memory_load(
            event_data.memory_info.current_size,
            event_data.memory_info.created_at,
            max_memory_size=max_memory_size,
        )

    async def _snapshot_event_loop(self) -> None:
        """Record a snapshot of the event loop delay.

        The delay is the time elapsed since the previous snapshot minus the expected interval between
        snapshots, i.e. how late the event loop got to running this method. The first snapshot, which has
        no predecessor, is recorded with a zero delay. The method must be `async` so that it is not run in
        a separate thread, which could cause race conditions while the snapshots are sorted and pruned.
        """
        history = self._event_loop_snapshots

        snapshot = EventLoopSnapshot(max_delay=self._max_event_loop_delay, delay=timedelta(seconds=0))
        previous_snapshot = history[-1] if history else None

        if previous_snapshot:
            elapsed = snapshot.created_at - previous_snapshot.created_at
            snapshot.delay = elapsed - self._EVENT_LOOP_SNAPSHOT_INTERVAL

        history.add(snapshot)
        self._prune_snapshots(cast('list[Snapshot]', history), history[-1].created_at)

    async def _snapshot_client(self) -> None:
        """Record a snapshot of the storage client's rate limit errors (HTTP 429).

        Only the errors counted under the retry number `_CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT` are taken into
        account, since earlier errors may be caused by a random spike in the number of requests and do not
        necessarily signify API overloading. The method must be `async` so that it is not run in a separate
        thread, which could cause race conditions while the snapshots are sorted and pruned.
        """
        history = self._client_snapshots

        rate_limit_errors: dict[int, int] = service_locator.get_storage_client().get_rate_limit_errors()
        error_count = rate_limit_errors.get(self._CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT, 0)

        # The counter is cumulative, so the new errors are the difference against the latest snapshot.
        previous_error_count = history[-1].error_count if history else 0

        history.add(
            ClientSnapshot(
                error_count=error_count,
                new_error_count=error_count - previous_error_count,
                max_error_count=self._max_client_errors,
            )
        )
        self._prune_snapshots(cast('list[Snapshot]', history), history[-1].created_at)

    def _prune_snapshots(self, snapshots: list[Snapshot], now: datetime) -> None:
        """Remove snapshots that are older than the `self._snapshot_history`.

        This method modifies the list of snapshots in place, removing all snapshots that are older than the defined
        snapshot history relative to the `now` parameter.

        Args:
            snapshots: List of snapshots to be pruned in place.
            now: The current date and time, used as the reference for pruning.
        """
        # Find the index where snapshots start to be within the allowed history window.
        # We'll keep snapshots from this index onwards.
        keep_from_index = None
        for i, snapshot in enumerate(snapshots):
            if now - snapshot.created_at <= self._SNAPSHOT_HISTORY:
                keep_from_index = i
                break

        # If all snapshots are old, keep_from_index will remain None, so we clear the list.
        # Otherwise, we keep only the recent snapshots.
        if keep_from_index is not None:
            del snapshots[:keep_from_index]
        else:
            snapshots.clear()

    def _evaluate_memory_load(
        self, current_memory_usage_size: ByteSize, snapshot_timestamp: datetime, max_memory_size: ByteSize
    ) -> None:
        """Evaluate and logs critical memory load conditions based on the system information.

        Args:
            current_memory_usage_size: The current memory usage.
            snapshot_timestamp: The time at which the memory snapshot was taken.
            max_memory_size: The maximum memory size to be used for evaluation.
        """
        # Check if the warning has been logged recently to avoid spamming
        if snapshot_timestamp < self._timestamp_of_last_memory_warning + self._MEMORY_WARNING_COOLDOWN_PERIOD:
            return

        threshold_memory_size = self._max_used_memory_ratio * max_memory_size
        buffer_memory_size = max_memory_size * (1 - self._max_used_memory_ratio) * self._RESERVE_MEMORY_RATIO
        overload_memory_threshold_size = threshold_memory_size + buffer_memory_size

        # Log a warning if current memory usage exceeds the critical overload threshold
        if current_memory_usage_size > overload_memory_threshold_size:
            memory_usage_percentage = round((current_memory_usage_size.bytes / max_memory_size.bytes) * 100)
            logger.warning(
                f'Memory is critically overloaded. Using {current_memory_usage_size} of '
                f'{max_memory_size} ({memory_usage_percentage}%). '
                'Consider increasing available memory.'
            )
            self._timestamp_of_last_memory_warning = snapshot_timestamp


================================================
FILE: src/crawlee/_autoscaling/system_status.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/autoscaling/system_status.ts

from __future__ import annotations

from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING

from more_itertools import pairwise

from crawlee._autoscaling._types import LoadRatioInfo, Snapshot, SystemInfo
from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from crawlee._autoscaling import Snapshotter

logger = getLogger(__name__)


@docs_group('Autoscaling')
class SystemStatus:
    """Evaluate system resource load from the snapshots collected by a `Snapshotter`.

    Four resources are evaluated independently: CPU, memory, event loop and client API. For each of them the
    overloaded ratio is a time-weighted share of overloaded snapshots, where the weight of a snapshot is the time
    elapsed since the snapshot preceding it. Every ratio is reported together with its configured threshold,
    and the system is considered overloaded whenever at least one resource is overloaded.

    Two views are exposed:

    - `get_current_system_info` returns a `SystemInfo` describing the current status of the system. Only
      snapshots that are at most `max_snapshot_age` old are taken into account.
    - `get_historical_system_info` returns a `SystemInfo` describing the long-term status of the system. It uses
      the full snapshot history available in the `Snapshotter` instance.
    """

    def __init__(
        self,
        snapshotter: Snapshotter,
        *,
        max_snapshot_age: timedelta = timedelta(seconds=5),
        cpu_overload_threshold: float = 0.4,
        memory_overload_threshold: float = 0.2,
        event_loop_overload_threshold: float = 0.6,
        client_overload_threshold: float = 0.3,
    ) -> None:
        """Initialize a new instance.

        Args:
            snapshotter: The `Snapshotter` instance that supplies the snapshots.
            max_snapshot_age: Maximum age of the snapshots used by `SystemStatus.get_current_system_info`.
            cpu_overload_threshold: Limit ratio of overloaded snapshots in the CPU sample. If the sample
                exceeds this threshold, the system will be considered overloaded.
            memory_overload_threshold: Limit ratio of overloaded snapshots in the memory sample. If the sample
                exceeds this threshold, the system will be considered overloaded.
            event_loop_overload_threshold: Limit ratio of overloaded snapshots in the event loop sample. If the
                sample exceeds this threshold, the system will be considered overloaded.
            client_overload_threshold: Limit ratio of overloaded snapshots in the client sample. If the sample
                exceeds this threshold, the system will be considered overloaded.
        """
        self._snapshotter = snapshotter
        self._max_snapshot_age = max_snapshot_age

        # One limit ratio per tracked resource.
        self._cpu_overload_threshold = cpu_overload_threshold
        self._memory_overload_threshold = memory_overload_threshold
        self._event_loop_overload_threshold = event_loop_overload_threshold
        self._client_overload_threshold = client_overload_threshold

    def get_current_system_info(self) -> SystemInfo:
        """Evaluate the current status of system resources.

        Only snapshots within the `_max_snapshot_age` timeframe are considered.

        Returns:
            An object representing the current system status.
        """
        recent_window = self._max_snapshot_age
        return self._get_system_info(sample_duration=recent_window)

    def get_historical_system_info(self) -> SystemInfo:
        """Evaluate the historical status of system resources.

        The entire snapshot history available in the `Snapshotter` is considered.

        Returns:
            An object representing the historical system status.
        """
        return self._get_system_info()

    def _get_system_info(self, *, sample_duration: timedelta | None = None) -> SystemInfo:
        """Collect the load ratio of every tracked resource into a single `SystemInfo`.

        Args:
            sample_duration: Duration for which to evaluate the system status. If None, the entire history
                available in the snapshotter is evaluated.

        Returns:
            Aggregated system status built from the per-resource load ratios.
        """
        # Keyword arguments are evaluated top to bottom: memory, event loop, CPU, client.
        return SystemInfo(
            memory_info=self._is_memory_overloaded(sample_duration),
            event_loop_info=self._is_event_loop_overloaded(sample_duration),
            cpu_info=self._is_cpu_overloaded(sample_duration),
            client_info=self._is_client_overloaded(sample_duration),
        )

    def _is_cpu_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:
        """Evaluate the CPU snapshots against the CPU overload threshold.

        Args:
            sample_duration: Duration within which to analyze CPU snapshots. If None, the entire history
                available in the snapshotter is evaluated.

        Returns:
            CPU load ratio information.
        """
        cpu_snapshots = self._snapshotter.get_cpu_sample(sample_duration)
        return self._is_sample_overloaded(cpu_snapshots, self._cpu_overload_threshold)

    def _is_memory_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:
        """Evaluate the memory snapshots against the memory overload threshold.

        Args:
            sample_duration: Duration within which to analyze memory snapshots. If None, the entire history
                available in the snapshotter is evaluated.

        Returns:
            Memory load ratio information.
        """
        memory_snapshots = self._snapshotter.get_memory_sample(sample_duration)
        return self._is_sample_overloaded(memory_snapshots, self._memory_overload_threshold)

    def _is_event_loop_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:
        """Evaluate the event loop snapshots against the event loop overload threshold.

        Args:
            sample_duration: Duration within which to analyze event loop snapshots. If None, the entire history
                available in the snapshotter is evaluated.

        Returns:
            Event loop load ratio information.
        """
        event_loop_snapshots = self._snapshotter.get_event_loop_sample(sample_duration)
        return self._is_sample_overloaded(event_loop_snapshots, self._event_loop_overload_threshold)

    def _is_client_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:
        """Evaluate the client snapshots against the client overload threshold.

        Args:
            sample_duration: Duration within which to analyze client snapshots. If None, the entire history
                available in the snapshotter is evaluated.

        Returns:
            Client load ratio information.
        """
        client_snapshots = self._snapshotter.get_client_sample(sample_duration)
        return self._is_sample_overloaded(client_snapshots, self._client_overload_threshold)

    def _is_sample_overloaded(self, sample: list[Snapshot], threshold: float) -> LoadRatioInfo:
        """Compute the time-weighted overloaded ratio of a snapshot sample.

        Args:
            sample: Snapshots to analyze, expected to be sorted by creation time.
            threshold: The limit ratio reported alongside the computed ratio.

        Returns:
            A `LoadRatioInfo` carrying `threshold` as `limit_ratio` and the computed overloaded ratio (rounded
            to 3 decimal places when derived from two or more snapshots) as `actual_ratio`.

        Raises:
            ValueError: If a snapshot was created earlier than the snapshot preceding it in the sample.
        """
        snapshot_count = len(sample)

        # No data at all: report a zero ratio.
        if snapshot_count == 0:
            return LoadRatioInfo(limit_ratio=threshold, actual_ratio=0)

        # A single snapshot has no interval to weight it by, so its flag alone decides (0.0 or 1.0).
        if snapshot_count == 1:
            only_snapshot = sample[0]
            return LoadRatioInfo(limit_ratio=threshold, actual_ratio=float(only_snapshot.is_overloaded))

        busy_seconds = 0.0
        calm_seconds = 0.0

        # Each interval between two neighbours is attributed to the later snapshot of the pair.
        for earlier, later in pairwise(sample):
            elapsed = (later.created_at - earlier.created_at).total_seconds()
            if elapsed < 0:
                raise ValueError('Negative time. Code assumptions are not valid. Expected time sorted samples.')

            if later.is_overloaded:
                busy_seconds += elapsed
            else:
                calm_seconds += elapsed

        covered_seconds = busy_seconds + calm_seconds

        # All snapshots sharing one timestamp would otherwise cause a division by zero.
        overloaded_ratio = busy_seconds / covered_seconds if covered_seconds != 0 else 0.0

        return LoadRatioInfo(limit_ratio=threshold, actual_ratio=round(overloaded_ratio, 3))


================================================
FILE: src/crawlee/_cli.py
================================================
# ruff: noqa: FBT002
from __future__ import annotations

import importlib.resources
import json
import sys
from pathlib import Path
from typing import Annotated, cast

from click import Choice

try:
    import inquirer
    import typer
    from cookiecutter.main import cookiecutter
    from inquirer.render.console import ConsoleRender
    from rich.progress import Progress, SpinnerColumn, TextColumn
except ModuleNotFoundError as exc:
    raise ImportError(
        "Missing required dependencies for the Crawlee CLI. It looks like you're running 'crawlee' "
        "without the CLI extra. Try using 'crawlee[cli]' instead."
    ) from exc

# Root Typer application object; the callback and command defined below register themselves on it.
cli = typer.Typer(no_args_is_help=True)

# The cookiecutter project template is looked up inside the installed `crawlee` package, and its
# `cookiecutter.json` is parsed once, at import time.
template_directory = importlib.resources.files('crawlee') / 'project_template'
with (template_directory / 'cookiecutter.json').open() as f:
    cookiecutter_json = json.load(f)

# Values taken from the template's `cookiecutter.json`. The `*_choices` values are offered as choices for the
# `create` command's options and prompts, and the `default_*` values are used as prompt defaults.
crawler_choices = cookiecutter_json['crawler_type']
http_client_choices = cookiecutter_json['http_client']
package_manager_choices = cookiecutter_json['package_manager']
default_start_url = cookiecutter_json['start_url']
default_enable_apify_integration = cookiecutter_json['enable_apify_integration']
default_install_project = cookiecutter_json['install_project']


@cli.callback(invoke_without_command=True)
def callback(
    version: Annotated[
        bool,
        typer.Option(
            '-V',
            '--version',
            help='Print Crawlee version',
        ),
    ] = False,
) -> None:
    """Crawlee is a web scraping and browser automation library."""
    # Without the version flag the callback has nothing to do.
    if not version:
        return

    # The version is imported only when it is actually requested.
    from crawlee import __version__ as crawlee_version  # noqa: PLC0415

    typer.echo(crawlee_version)


def _prompt_for_project_name(initial_project_name: str | None) -> str:
    """Return a usable project name, prompting the user until one is obtained.

    A name is accepted when it contains at least one non-whitespace character and nothing with that name
    exists in the current working directory yet.

    Args:
        initial_project_name: Name supplied on the command line, or `None` to prompt straight away. It is
            tried first and has to pass the same checks as a prompted name.

    Returns:
        The accepted project name, exactly as it was entered.
    """
    while True:
        if initial_project_name is not None:
            # Use the command-line value only once; if it is rejected, later iterations prompt instead.
            project_name = initial_project_name
            initial_project_name = None
        else:
            project_name = ConsoleRender().render(
                inquirer.Text(
                    name='project_name',
                    message='Name of the new project folder',
                    validate=lambda _, value: bool(value.strip()),
                ),
            )

        # The prompt's validator already refuses blank input, but a name passed on the command line bypasses
        # it, so whitespace-only names have to be rejected here as well.
        if not project_name or not project_name.strip():
            typer.echo('Project name is required.', err=True)
            continue

        project_path = Path.cwd() / project_name

        if project_path.exists():
            typer.echo(f'Folder {project_path} already exists. Please choose another name.', err=True)
            continue

        return project_name


def _prompt_text(message: str, default: str) -> str:
    """Prompt the user for a line of text; input that is empty after stripping whitespace is not accepted."""
    question = inquirer.Text(
        name='text',
        message=message,
        default=default,
        validate=lambda _, value: bool(value.strip()),
    )
    answer = ConsoleRender().render(question)
    return cast('str', answer)


def _prompt_choice(message: str, choices: list[str]) -> str:
    """Prompt the user to pick one from a list of choices."""
    # Each entry is a `(label, value)` pair: the label has its first character upper-cased, the value is the
    # choice unchanged.
    labelled_choices = []
    for choice in choices:
        label = choice[0].upper() + choice[1:]
        labelled_choices.append((label, choice))

    question = inquirer.List(
        name='choice',
        message=message,
        choices=labelled_choices,
    )
    picked = ConsoleRender().render(question)
    return cast('str', picked)


def _prompt_bool(message: str, *, default: bool) -> bool:
    """Ask the user a yes/no question, offering `default` as the preselected answer."""
    question = inquirer.Confirm(
        name='confirm',
        message=message,
        default=default,
    )
    confirmed = ConsoleRender().render(question)
    return cast('bool', confirmed)


@cli.command()
def create(
    project_name: str | None = typer.Argument(
        default=None,
        show_default=False,
        help='The name of the project and the directory that will be created to contain it. '
        'If none is given, you will be prompted.',
    ),
    crawler_type: str | None = typer.Option(
        None,
        '--crawler-type',
        '--template',
        show_default=False,
        click_type=Choice(crawler_choices),
        help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',
    ),
    http_client: str | None = typer.Option(
        None,
        show_default=False,
        click_type=Choice(http_client_choices),
        help='The library that will be used to make HTTP requests in your crawler. '
        'If none is given, you will be prompted.',
    ),
    package_manager: str | None = typer.Option(
        default=None,
        show_default=False,
        click_type=Choice(package_manager_choices),
        help='Package manager to be used in the new project. If none is given, you will be prompted.',
    ),
    start_url: str | None = typer.Option(
        default=None,
        show_default=False,
        metavar='[START_URL]',
        help='The URL where crawling should start. If none is given, you will be prompted.',
    ),
    *,
    enable_apify_integration: bool | None = typer.Option(
        None,
        '--apify/--no-apify',
        show_default=False,
        help='Should Apify integration be set up for you? If not given, you will be prompted.',
    ),
    install_project: bool | None = typer.Option(
        None,
        '--install/--no-install',
        show_default=False,
        help='Should the project be installed now? If not given, you will be prompted.',
    ),
) -> None:
    """Bootstrap a new Crawlee project."""
    try:
        # The project name always goes through the checks, whether it came from the command line or not.
        project_name = _prompt_for_project_name(project_name)

        # Everything left unset on the command line is asked for interactively, always in this order.
        if crawler_type is None:
            crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)

        if http_client is None:
            http_client = _prompt_choice('Please select the HTTP client', http_client_choices)

        if package_manager is None:
            package_manager = _prompt_choice('Please select the package manager', package_manager_choices)

        if start_url is None:
            start_url = _prompt_text('Please specify the start URL', default=default_start_url)

        if enable_apify_integration is None:
            enable_apify_integration = _prompt_bool(
                'Should Apify integration be set up for you?', default=default_enable_apify_integration
            )

        if install_project is None:
            install_project = _prompt_bool('Should the project be installed now?', default=default_install_project)

        # Without a complete set of answers nothing is generated.
        answers_complete = all(
            [
                project_name,
                crawler_type,
                http_client,
                package_manager,
                start_url,
                enable_apify_integration is not None,
                install_project is not None,
            ]
        )
        if not answers_complete:
            return

        package_name = project_name.replace('-', '_')

        template_context = {
            'project_name': project_name,
            'package_manager': package_manager,
            'crawler_type': crawler_type,
            'http_client': http_client,
            'enable_apify_integration': enable_apify_integration,
            'start_url': start_url,
            'install_project': install_project,
        }

        # Generate the project while a transient spinner is shown.
        with Progress(
            SpinnerColumn(),
            TextColumn('[progress.description]{task.description}'),
            transient=True,
        ) as progress:
            bootstrap_task = progress.add_task(description='Bootstrapping...', total=None)

            try:
                cookiecutter(
                    template=str(template_directory),
                    no_input=True,
                    extra_context=template_context,
                )
            except Exception as exc:
                # Hide the spinner before anything is written to the terminal.
                progress.update(bootstrap_task, visible=False)
                progress.refresh()

                # A failed hook script has already reported its own error, so only point the user to it.
                if 'Hook script failed' in str(exc):
                    typer.echo('Project creation failed. Check the error message above.', err=True)
                else:
                    typer.echo(f'Project creation failed: {exc!s}', err=True)

                sys.exit(1)

        typer.echo(f'Your project "{project_name}" was created.')

        # The run instructions depend on the package manager and on whether the project was installed.
        navigate_hint = f'To run it, navigate to the directory: "cd {project_name}", '

        if package_manager == 'pip':
            if install_project:
                run_hint = (
                    'activate the virtual environment in ".venv" ("source .venv/bin/activate") '
                    f'and run your project using "python -m {package_name}".'
                )
            else:
                run_hint = (
                    'install the dependencies listed in "requirements.txt" '
                    f'and run it using "python -m {package_name}".'
                )
        elif install_project:
            run_hint = f'and run it using "{package_manager} run python -m {package_name}".'
        else:
            install_command = 'sync' if package_manager == 'uv' else 'install'
            run_hint = (
                f'install the project using "{package_manager} {install_command}", '
                f'and run it using "{package_manager} run python -m {package_name}".'
            )

        typer.echo(navigate_hint + run_hint)
        typer.echo(f'See the "{project_name}/README.md" for more information.')

    except KeyboardInterrupt:
        typer.echo('Operation cancelled by user.')


================================================
FILE: src/crawlee/_consts.py
================================================
from __future__ import annotations

METADATA_FILENAME = '__metadata__.json'
"""The name of the metadata file for storage clients."""


================================================
FILE: src/crawlee/_log_config.py
================================================
from __future__ import annotations

import json
import logging
import sys
import textwrap
from typing import TYPE_CHECKING, Any

from colorama import Fore, Style, just_fix_windows_console
from typing_extensions import assert_never

from crawlee import service_locator

if TYPE_CHECKING:
    from crawlee._types import LogLevel

# colorama helper, called once at import time.
# NOTE(review): presumably makes ANSI color codes work in the Windows console - confirm against colorama docs.
just_fix_windows_console()


# Color used for the `[logger name]` prefix of a log line.
_LOG_NAME_COLOR = Fore.LIGHTBLACK_EX

# Color of the level label, keyed by the numeric logging level. Levels missing here are printed uncolored.
_LOG_LEVEL_COLOR = {
    logging.DEBUG: Fore.BLUE,
    logging.INFO: Fore.GREEN,
    logging.WARNING: Fore.YELLOW,
    logging.ERROR: Fore.RED,
    logging.CRITICAL: Fore.RED,
}

# Five-character level labels, keyed by the numeric logging level. Levels missing here (including
# `logging.CRITICAL`) fall back to `record.levelname` in `CrawleeLogFormatter.format`.
_LOG_LEVEL_SHORT_ALIAS = {
    logging.DEBUG: 'DEBUG',
    logging.INFO: 'INFO ',
    logging.WARNING: 'WARN ',
    logging.ERROR: 'ERROR',
}

# So that all the log messages have the same alignment
_LOG_MESSAGE_INDENT = ' ' * 6


def string_to_log_level(level: LogLevel) -> int:
    """Convert a string representation of a log level to an integer log level."""
    if level == 'DEBUG':
        return logging.DEBUG
    if level == 'INFO':
        return logging.INFO
    if level == 'WARNING':
        return logging.WARNING
    if level == 'ERROR':
        return logging.ERROR
    if level == 'CRITICAL':
        return logging.CRITICAL

    assert_never(level)


def get_configured_log_level() -> int:
    """Resolve the numeric log level to use.

    An explicitly set `log_level` of the configuration obtained from the service locator wins. Otherwise the
    level is `logging.DEBUG` when the interpreter runs in dev mode and `logging.INFO` in all other cases.
    """
    configuration = service_locator.get_configuration()

    # `model_fields_set` only lists the fields that were set explicitly, so defaults do not count here.
    level_was_set = 'log_level' in configuration.model_fields_set
    if level_was_set:
        return string_to_log_level(configuration.log_level)

    return logging.DEBUG if sys.flags.dev_mode else logging.INFO


def configure_logger(logger: logging.Logger, *, remove_old_handlers: bool = False) -> None:
    """Attach a Crawlee-formatted stream handler to `logger` and set its level.

    Args:
        logger: The logger to configure.
        remove_old_handlers: Whether to detach all handlers currently attached to `logger` first.
    """
    crawlee_handler = logging.StreamHandler()
    crawlee_handler.setFormatter(CrawleeLogFormatter())

    if remove_old_handlers:
        # Walk over a copy, because `removeHandler` mutates `logger.handlers`.
        for stale_handler in list(logger.handlers):
            logger.removeHandler(stale_handler)

    logger.addHandler(crawlee_handler)
    logger.setLevel(get_configured_log_level())

    # Parent loggers must not receive the records too, otherwise the messages would be printed twice.
    logger.propagate = False


class CrawleeLogFormatter(logging.Formatter):
    """Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields.

    It formats the log records so that they:
        - start with the level (colorized, and padded to 5 chars so that it is nicely aligned)
        - then have the actual log message, if it's multiline then it's nicely indented
        - then have the stringified extra log fields
        - then, if an exception is a part of the log record, prints the formatted exception.
    """

    # The fields that are added to the log record with `logger.log(..., extra={...})` are just merged in the log record
    # with the other log record properties, and you can't get them in some nice, isolated way. So, to get the extra
    # fields, we just compare all the properties present in the log record with properties present in an empty log
    # record, and extract all the extra ones not present in the empty log record.
    empty_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None)

    # Attributes that `logging.Formatter.format` stores on a record while formatting it. A never-formatted record
    # such as `empty_record` does not have them, but a record that has already passed through another handler's
    # formatter does. The `logging` module refuses both names as keys of `extra`, so they are never user data.
    _formatter_populated_fields = frozenset({'message', 'asctime'})

    def __init__(
        self,
        include_logger_name: bool = True,  # noqa: FBT001, FBT002
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Initialize a new instance.

        Args:
            include_logger_name: Include logger name at the beginning of the log line.
            args: Arguments passed to the parent class.
            kwargs: Keyword arguments passed to the parent class.
        """
        super().__init__(*args, **kwargs)
        self.include_logger_name = include_logger_name

    def _get_extra_fields(self, record: logging.LogRecord) -> dict[str, Any]:
        """Return the fields that were attached to `record` through the `extra` argument of a logging call.

        Args:
            record: The log record to inspect.

        Returns:
            The attributes of `record` that are neither standard `LogRecord` attributes nor attributes added
            by a formatter (`message`, `asctime`).
        """
        standard_fields = self.empty_record.__dict__
        return {
            key: value
            for key, value in record.__dict__.items()
            if key not in standard_fields and key not in self._formatter_populated_fields
        }

    def format(self, record: logging.LogRecord) -> str:
        """Format the log record nicely.

        This formats the log record so that it:
            - starts with the level (colorized, and padded to 5 chars so that it is nicely aligned)
            - then has the actual log message, if it's multiline then it's nicely indented
            - then has the stringified extra log fields
            - then, if an exception is a part of the log record, prints the formatted exception.
        """
        logger_name_string = f'{_LOG_NAME_COLOR}[{record.name}]{Style.RESET_ALL} '

        # Colorize the log level and use its short alias. Levels without an alias keep their full name.
        level_color_code = _LOG_LEVEL_COLOR.get(record.levelno, '')
        level_short_alias = _LOG_LEVEL_SHORT_ALIAS.get(record.levelno, record.levelname)
        level_string = f'{level_color_code}{level_short_alias}{Style.RESET_ALL} '

        # Format the extra log record fields, if there were some
        # Just stringify them to JSON and color them gray
        extra_string = ''
        extra = self._get_extra_fields(record)
        if extra:
            extra_string = (
                f' {Fore.LIGHTBLACK_EX}({json.dumps(extra, ensure_ascii=False, default=str)}){Style.RESET_ALL}'
            )

        # Call the parent method so that it populates missing fields in the record (`message`, `exc_text`).
        # Its return value is not needed.
        super().format(record)

        # Format the actual log message
        log_string = self.formatMessage(record)

        # Format the exception, if there is some
        # Basically just print the traceback and indent it a bit
        exception_string = ''
        if record.exc_text:
            exception_string = '\n' + textwrap.indent(record.exc_text.rstrip(), _LOG_MESSAGE_INDENT)

        if self.include_logger_name:
            # Include logger name at the beginning of the log line
            return f'{logger_name_string}{level_string}{log_string}{extra_string}{exception_string}'

        return f'{level_string}{log_string}{extra_string}{exception_string}'


================================================
FILE: src/crawlee/_request.py
================================================
from __future__ import annotations

from collections.abc import Iterator, MutableMapping
from datetime import datetime
from enum import IntEnum
from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast

from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, PlainValidator, TypeAdapter
from yarl import URL

from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.docs import docs_group
from crawlee._utils.requests import compute_unique_key
from crawlee._utils.urls import validate_http_url

if TYPE_CHECKING:
    from typing_extensions import NotRequired, Required, Self


class RequestState(IntEnum):
    """Crawlee-specific request handling state.

    The state is stored in `CrawleeRequestData.state`, where it defaults to `UNPROCESSED`.
    """

    # Every member has an explicit integer value, and being an `IntEnum`, members compare equal to those integers.
    # NOTE(review): the member names suggest the phases a request passes through while being handled - the exact
    # meaning of each state is defined by the code that sets it, which is not part of this file section.
    UNPROCESSED = 0
    BEFORE_NAV = 1
    AFTER_NAV = 2
    REQUEST_HANDLER = 3
    DONE = 4
    ERROR_HANDLER = 5
    ERROR = 6
    SKIPPED = 7


class CrawleeRequestData(BaseModel):
    """Crawlee-specific configuration stored in the `user_data`.

    It is the type of the `crawlee_data` field of `UserData`, which uses the `__crawlee` alias. Multi-word
    fields declare camelCase aliases, except for `session_id`, which has none.
    """

    # `frozen=True`: the field is declared immutable.
    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
    """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
    `BasicCrawler`."""

    enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
    """The strategy that was used for enqueuing the request."""

    state: RequestState = RequestState.UNPROCESSED
    """Describes the request's current lifecycle state."""

    session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
    """The number of finished session rotations for this request."""

    # NOTE(review): undocumented in the source; presumably tells the crawler to skip the navigation step for this
    # request - confirm against the code that reads it.
    skip_navigation: Annotated[bool, Field(alias='skipNavigation')] = False

    last_proxy_tier: Annotated[int | None, Field(alias='lastProxyTier')] = None
    """The last proxy tier used to process the request."""

    forefront: Annotated[bool, Field()] = False
    """Indicate whether the request should be enqueued at the front of the queue."""

    crawl_depth: Annotated[int, Field(alias='crawlDepth')] = 0
    """The depth of the request in the crawl tree."""

    session_id: Annotated[str | None, Field()] = None
    """ID of a session to which the request is bound."""


class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
    """Represents the `user_data` part of a Request.

    Apart from the well-known attributes (`label` and `__crawlee`), it can also contain arbitrary JSON-compatible
    values.

    The mapping interface (`[]` access, iteration, `len`) works on the extra values kept in `__pydantic_extra__`,
    not on the declared fields.
    """

    # `extra='allow'` makes pydantic keep undeclared keys in `__pydantic_extra__` instead of rejecting them.
    model_config = ConfigDict(extra='allow')
    __pydantic_extra__: dict[str, JsonSerializable] = Field(init=False)

    crawlee_data: Annotated[CrawleeRequestData | None, Field(alias='__crawlee')] = None
    """Crawlee-specific configuration stored in the `user_data`."""

    label: Annotated[str | None, Field()] = None
    """Label used for request routing."""

    def __getitem__(self, key: str) -> JsonSerializable:
        """Return the extra value stored under `key`; raise `KeyError` when there is none."""
        return self.__pydantic_extra__[key]

    def __setitem__(self, key: str, value: JsonSerializable) -> None:
        """Store `value` under `key` among the extra values.

        The `label` key is also written to the declared `label` field, after checking its type.

        Raises:
            ValueError: If `key` is `'label'` and `value` is neither a `str` nor `None`.
        """
        if key == 'label':
            if value is not None and not isinstance(value, str):
                raise ValueError('`label` must be str or None')

            self.label = value

        # Runs for every key, including `label`, so that the value can be read back through `__getitem__`.
        self.__pydantic_extra__[key] = value

    def __delitem__(self, key: str) -> None:
        """Remove the extra value stored under `key`; declared fields are left untouched."""
        del self.__pydantic_extra__[key]

    def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]
        """Iterate over the keys of the extra values."""
        yield from self.__pydantic_extra__

    def __len__(self) -> int:
        """Return the number of extra values."""
        return len(self.__pydantic_extra__)

    def __eq__(self, other: object) -> bool:
        """Compare with another pydantic model (parent-class comparison) or with a plain `dict` (by model dump)."""
        if isinstance(other, BaseModel):
            return super().__eq__(other)

        if isinstance(other, dict):
            return self.model_dump() == other

        # Any other type: let Python try the reflected comparison.
        return NotImplemented

    def __hash__(self) -> int:
        """Return hash based on the model fields."""
        # NOTE(review): `hash` raises `TypeError` when any dumped value is unhashable (for example a nested dict
        # or list) - confirm whether hashing is expected to work for such instances.
        data = self.model_dump()
        return hash(tuple(sorted(data.items())))


# Module-level pydantic `TypeAdapter` for `UserData`, created once at import time.
user_data_adapter = TypeAdapter(UserData)


@docs_group('Other')
class RequestOptions(TypedDict):
    """Options that can be used to customize request creation.

    This type exactly matches the parameters of `Request.from_url` method.

    Only `url` is a required key; every other key may be omitted.
    """

    # The only mandatory key.
    url: Required[str]

    # Optional keys. Their meaning is documented on the `Request.from_url` parameters of the same names.
    method: NotRequired[HttpMethod]
    headers: NotRequired[HttpHeaders | dict[str, str] | None]
    payload: NotRequired[HttpPayload | str | None]
    label: NotRequired[str | None]
    session_id: NotRequired[str | None]
    unique_key: NotRequired[str | None]
    id: NotRequired[str | None]
    keep_url_fragment: NotRequired[bool]
    use_extended_unique_key: NotRequired[bool]
    always_enqueue: NotRequired[bool]
    user_data: NotRequired[dict[str, JsonSerializable]]
    no_retry: NotRequired[bool]
    enqueue_strategy: NotRequired[EnqueueStrategy]
    max_retries: NotRequired[int | None]


@docs_group('Storage data')
class Request(BaseModel):
    """Represents a request in the Crawlee framework, containing the necessary information for crawling operations.

    The `Request` class is one of the core components in Crawlee, utilized by various components such as request
    providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,
    including the URL, HTTP method, headers, payload, and user data. The user data allows custom information
    to be stored and persisted throughout the request lifecycle, including its retries.

    Key functionalities include managing the unique key (`unique_key`) that is used for request deduplication,
    controlling retries, handling state management, and enabling configuration for session rotation and proxy
    handling.

    The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically
    generates a unique key based on the URL and request parameters.

    ### Usage

    ```python
    from crawlee import Request

    request = Request.from_url('https://crawlee.dev')
    ```
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
    """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
    to the same URL.

    If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
    For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
    of `http://www.example.com/something`.

    Pass an arbitrary non-empty text value to the `unique_key` property to override the default behavior
    and specify which URLs shall be considered equal.
    """

    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
    """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
    and fragments."""

    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
    """HTTP request method."""

    payload: Annotated[
        HttpPayload | None,
        BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
        PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
        Field(frozen=True),
    ] = None
    """HTTP request payload."""

    # Workaround for Pydantic and type checkers when using Annotated with default_factory
    if TYPE_CHECKING:
        headers: HttpHeaders = HttpHeaders()
        """HTTP request headers."""

        user_data: dict[str, JsonSerializable] = {}
        """Custom user data assigned to the request. Use this to save any request related data to the
        request's scope, keeping them accessible on retries, failures etc.
        """

    else:
        headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
        """HTTP request headers."""

        user_data: Annotated[
            dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
            Field(alias='userData', default_factory=UserData),
            PlainValidator(user_data_adapter.validate_python),
            PlainSerializer(
                lambda instance: user_data_adapter.dump_python(
                    instance,
                    by_alias=True,
                    exclude_none=False,
                    exclude_unset=True,
                    exclude_defaults=True,
                )
            ),
        ]
        """Custom user data assigned to the request. Use this to save any request related data to the
        request's scope, keeping them accessible on retries, failures etc.
        """

    retry_count: Annotated[int, Field(alias='retryCount')] = 0
    """Number of times the request has been retried."""

    no_retry: Annotated[bool, Field(alias='noRetry')] = False
    """If set to `True`, the request will not be retried in case of failure."""

    loaded_url: Annotated[str | None, BeforeValidator(validate_http_url), Field(alias='loadedUrl')] = None
    """URL of the web page that was loaded. This can differ from the original URL in case of redirects."""

    handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None
    """Timestamp when the request was handled."""

    @classmethod
    def from_url(
        cls,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | str | None = None,
        label: str | None = None,
        session_id: str | None = None,
        unique_key: str | None = None,
        keep_url_fragment: bool = False,
        use_extended_unique_key: bool = False,
        always_enqueue: bool = False,
        enqueue_strategy: EnqueueStrategy | None = None,
        max_retries: int | None = None,
        **kwargs: Any,
    ) -> Self:
        """Create a new `Request` instance from a URL.

        This is the recommended constructor for creating new `Request` instances. It generates a `Request` object
        from a given URL with additional options to customize HTTP method, payload, unique key, and other request
        properties. If no `unique_key` is provided, it is computed automatically based on the URL, method and
        payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.

        The `user_data` dictionary passed in `kwargs` (and a nested `__crawlee` dictionary, if any) is copied before
        Crawlee-specific data is merged into it, so the caller's objects are never modified.

        Args:
            url: The URL of the request.
            method: The HTTP method of the request.
            headers: The HTTP headers of the request.
            payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.
            label: A custom label to differentiate between request types. This is stored in `user_data`, and it is
                used for request routing (different requests go to different handlers).
            session_id: ID of a specific `Session` to which the request will be strictly bound.
                If the session becomes unavailable when the request is processed, a `RequestCollisionError` will be
                raised.
            unique_key: A unique key identifying the request. If not provided, it is automatically computed based on
                the URL and other parameters. Requests with the same `unique_key` are treated as identical.
            keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in
                the `unique_key` computation. This is only relevant when `unique_key` is not provided.
            use_extended_unique_key: Determines whether to include the HTTP method, session ID and payload in the
                `unique_key` computation. This is only relevant when `unique_key` is not provided.
            always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
            enqueue_strategy: The strategy that will be used for enqueuing the request.
            max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
                option of `BasicCrawler`.
            **kwargs: Additional request properties.

        Raises:
            ValueError: If `always_enqueue` is used together with a custom `unique_key`.
        """
        if unique_key is not None and always_enqueue:
            raise ValueError('`always_enqueue` cannot be used with a custom `unique_key`')

        if isinstance(headers, dict) or headers is None:
            headers = HttpHeaders(headers or {})

        if isinstance(payload, str):
            payload = payload.encode()

        unique_key = unique_key or compute_unique_key(
            url,
            method=method,
            headers=headers,
            payload=payload,
            session_id=session_id,
            keep_url_fragment=keep_url_fragment,
            use_extended_unique_key=use_extended_unique_key,
        )

        if always_enqueue:
            # A random prefix makes the key unique, so deduplication never drops the request.
            unique_key = f'{crypto_random_object_id()}|{unique_key}'

        # Work on shallow copies so that neither the caller's `user_data` dict nor its nested `__crawlee` dict is
        # mutated below. Only plain dicts are copied; any other object is passed through untouched.
        user_data_dict = kwargs.pop('user_data', {}) or {}
        if isinstance(user_data_dict, dict):
            user_data_dict = dict(user_data_dict)

        crawlee_data_dict = user_data_dict.get('__crawlee', {})
        if isinstance(crawlee_data_dict, dict):
            crawlee_data_dict = dict(crawlee_data_dict)

        if max_retries is not None:
            crawlee_data_dict['maxRetries'] = max_retries

        if enqueue_strategy is not None:
            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy

        crawlee_data = CrawleeRequestData(**crawlee_data_dict)

        if crawlee_data:
            user_data_dict['__crawlee'] = crawlee_data

        request = cls(
            url=url,
            unique_key=unique_key,
            method=method,
            headers=headers,
            payload=payload,
            user_data=user_data_dict,
            **kwargs,
        )

        if label is not None:
            request.user_data['label'] = label

        if session_id is not None:
            request.crawlee_data.session_id = session_id

        return request

    def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:
        """Get the value of a specific query parameter from the URL, or `default` when it is not present."""
        query_params = URL(self.url).query
        return query_params.get(param, default)

    @property
    def label(self) -> str | None:
        """A string used to differentiate between arbitrary request types."""
        return cast('UserData', self.user_data).label

    @property
    def session_id(self) -> str | None:
        """The ID of the bound session, if there is any."""
        return self.crawlee_data.session_id

    @property
    def crawlee_data(self) -> CrawleeRequestData:
        """Crawlee-specific configuration stored in the `user_data`.

        It is created lazily: the first access stores an empty `CrawleeRequestData` in `user_data`.
        """
        user_data = cast('UserData', self.user_data)
        if user_data.crawlee_data is None:
            user_data.crawlee_data = CrawleeRequestData()

        return user_data.crawlee_data

    @property
    def crawl_depth(self) -> int:
        """The depth of the request in the crawl tree."""
        return self.crawlee_data.crawl_depth

    @crawl_depth.setter
    def crawl_depth(self, new_value: int) -> None:
        self.crawlee_data.crawl_depth = new_value

    @property
    def state(self) -> RequestState:
        """Crawlee-specific request handling state."""
        return self.crawlee_data.state

    @state.setter
    def state(self, new_state: RequestState) -> None:
        self.crawlee_data.state = new_state

    @property
    def max_retries(self) -> int | None:
        """Crawlee-specific limit on the number of retries of the request."""
        return self.crawlee_data.max_retries

    @property
    def session_rotation_count(self) -> int | None:
        """Crawlee-specific number of finished session rotations for the request."""
        return self.crawlee_data.session_rotation_count

    @session_rotation_count.setter
    def session_rotation_count(self, new_session_rotation_count: int) -> None:
        self.crawlee_data.session_rotation_count = new_session_rotation_count

    @property
    def enqueue_strategy(self) -> EnqueueStrategy:
        """The strategy that was used for enqueuing the request. Falls back to `'all'` when none is stored."""
        return self.crawlee_data.enqueue_strategy or 'all'

    @enqueue_strategy.setter
    def enqueue_strategy(self, new_enqueue_strategy: EnqueueStrategy) -> None:
        self.crawlee_data.enqueue_strategy = new_enqueue_strategy

    @property
    def last_proxy_tier(self) -> int | None:
        """The last proxy tier used to process the request."""
        return self.crawlee_data.last_proxy_tier

    @last_proxy_tier.setter
    def last_proxy_tier(self, new_value: int) -> None:
        self.crawlee_data.last_proxy_tier = new_value

    @property
    def forefront(self) -> bool:
        """Indicate whether the request should be enqueued at the front of the queue."""
        return self.crawlee_data.forefront

    @forefront.setter
    def forefront(self, new_value: bool) -> None:
        self.crawlee_data.forefront = new_value

    @property
    def was_already_handled(self) -> bool:
        """Indicates whether the request was handled."""
        return self.handled_at is not None


class RequestWithLock(Request):
    """A crawling request with information about locks.

    Extends `Request` with a single mandatory field holding the lock expiration time (`lockExpiresAt` alias).
    """

    lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')]
    """The timestamp when the lock expires."""


================================================
FILE: src/crawlee/_service_locator.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group
from crawlee.configuration import Configuration
from crawlee.errors import ServiceConflictError
from crawlee.events import EventManager, LocalEventManager
from crawlee.storage_clients import FileSystemStorageClient, StorageClient

if TYPE_CHECKING:
    from crawlee.storages._storage_instance_manager import StorageInstanceManager

from logging import getLogger

logger = getLogger(__name__)


@docs_group('Configuration')
class ServiceLocator:
    """Service locator for managing the services used by Crawlee.

    All services are lazily initialized to their default values on first retrieval. Once a service has been set or
    implicitly created, replacing it with a different instance raises `ServiceConflictError`.
    """

    # Shared across all `ServiceLocator` instances; created on first access of `storage_instance_manager`.
    global_storage_instance_manager: StorageInstanceManager | None = None

    def __init__(
        self,
        configuration: Configuration | None = None,
        event_manager: EventManager | None = None,
        storage_client: StorageClient | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            configuration: The configuration to use. If not provided, a default one is created on first retrieval.
            event_manager: The event manager to use. If not provided, a default one is created on first retrieval.
            storage_client: The storage client to use. If not provided, a default one is created on first retrieval.
        """
        self._configuration = configuration
        self._event_manager = event_manager
        self._storage_client = storage_client

    def get_configuration(self) -> Configuration:
        """Get the configuration, creating a default `Configuration` if none has been set yet."""
        if self._configuration is None:
            logger.debug('No configuration set, implicitly creating and using default Configuration.')
            self._configuration = Configuration()

        return self._configuration

    def set_configuration(self, configuration: Configuration) -> None:
        """Set the configuration.

        Args:
            configuration: The configuration to set.

        Raises:
            ServiceConflictError: If a different configuration has already been set or retrieved before.
        """
        if self._configuration is configuration:
            # Same instance, no need to do anything.
            return
        if self._configuration is not None:
            raise ServiceConflictError(Configuration, configuration, self._configuration)

        self._configuration = configuration

    def get_event_manager(self) -> EventManager:
        """Get the event manager, creating a default `LocalEventManager` if none has been set yet."""
        if self._event_manager is None:
            logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
            if self._configuration is None:
                logger.warning(
                    'Implicit creation of event manager will implicitly set configuration as side effect. '
                    'It is advised to explicitly first set the configuration instead.'
                )
            # Call the factory on the class itself; there is no need to build a throwaway instance first.
            self._event_manager = LocalEventManager.from_config(config=self._configuration)

        return self._event_manager

    def set_event_manager(self, event_manager: EventManager) -> None:
        """Set the event manager.

        Args:
            event_manager: The event manager to set.

        Raises:
            ServiceConflictError: If a different event manager has already been set or retrieved before.
        """
        if self._event_manager is event_manager:
            # Same instance, no need to do anything.
            return
        if self._event_manager is not None:
            raise ServiceConflictError(EventManager, event_manager, self._event_manager)

        self._event_manager = event_manager

    def get_storage_client(self) -> StorageClient:
        """Get the storage client, creating a default `FileSystemStorageClient` if none has been set yet."""
        if self._storage_client is None:
            logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
            if self._configuration is None:
                logger.warning(
                    'Implicit creation of storage client will implicitly set configuration as side effect. '
                    'It is advised to explicitly first set the configuration instead.'
                )
            self._storage_client = FileSystemStorageClient()

        return self._storage_client

    def set_storage_client(self, storage_client: StorageClient) -> None:
        """Set the storage client.

        Args:
            storage_client: The storage client to set.

        Raises:
            ServiceConflictError: If a different storage client has already been set or retrieved before.
        """
        if self._storage_client is storage_client:
            # Same instance, no need to do anything.
            return
        if self._storage_client is not None:
            raise ServiceConflictError(StorageClient, storage_client, self._storage_client)

        self._storage_client = storage_client

    @property
    def storage_instance_manager(self) -> StorageInstanceManager:
        """Get the storage instance manager. It is a global manager shared by all instances of ServiceLocator."""
        if ServiceLocator.global_storage_instance_manager is None:
            # Import here to avoid circular imports.
            from crawlee.storages._storage_instance_manager import StorageInstanceManager  # noqa: PLC0415

            ServiceLocator.global_storage_instance_manager = StorageInstanceManager()

        return ServiceLocator.global_storage_instance_manager


# Module-level `ServiceLocator` instance created at import time with no services preset; each service is created
# lazily on first retrieval.
service_locator = ServiceLocator()


================================================
FILE: src/crawlee/_types.py
================================================
from __future__ import annotations

import dataclasses
from collections.abc import Callable, Iterator, Mapping
from copy import deepcopy
from dataclasses import dataclass
from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload

from pydantic import ConfigDict, Field, PlainValidator, RootModel

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    import json
    import logging
    import re
    from collections.abc import Callable, Coroutine, Sequence

    from typing_extensions import NotRequired, Required, Self, Unpack

    from crawlee import Glob, Request
    from crawlee._request import RequestOptions
    from crawlee.configuration import Configuration
    from crawlee.http_clients import HttpResponse
    from crawlee.proxy_configuration import ProxyInfo
    from crawlee.sessions import Session
    from crawlee.storage_clients import StorageClient
    from crawlee.storages import KeyValueStore

    # Workaround for https://github.com/pydantic/pydantic/issues/9445
    J = TypeVar('J', bound='JsonSerializable')
    JsonSerializable = list[J] | dict[str, J] | str | bool | int | float | None
else:
    from pydantic import JsonValue as JsonSerializable

# Generic type variable used by the typed helpers in this module (e.g. the `default_value` of `get_value`).
T = TypeVar('T')

# Names of the HTTP request methods accepted for a request.
HttpMethod = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']

# Request bodies are represented as raw bytes.
HttpPayload = bytes

# Literal values a `transform_request_function` may return instead of new request options.
# NOTE(review): the exact meaning of 'skip' / 'unchanged' is defined by the callers - confirm there.
RequestTransformAction = Literal['skip', 'unchanged']

EnqueueStrategy = Literal['all', 'same-domain', 'same-hostname', 'same-origin']
"""Enqueue strategy to be used for determining which links to extract and enqueue."""

# NOTE(review): presumably the reasons for which a request can be skipped; usage is not visible here - confirm.
SkippedReason = Literal['robots_txt']

# Names of the supported log levels.
LogLevel = Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
    """Convert all header keys to lowercase, strips whitespace, and returns them sorted by key."""
    normalized_headers = {k.lower().strip(): v.strip() for k, v in headers.items()}
    sorted_headers = sorted(normalized_headers.items())
    return dict(sorted_headers)


@docs_group('Other')
class HttpHeaders(RootModel, Mapping[str, str]):
    """A read-only, dictionary-like container of HTTP headers.

    Header names are normalized by `_normalize_headers` on validation, and lookups lowercase the requested name.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # Workaround for Pydantic and type checkers when using Annotated with default_factory
    if TYPE_CHECKING:
        root: dict[str, str] = {}
    else:
        root: Annotated[
            dict[str, str],
            PlainValidator(_normalize_headers),
            Field(default_factory=dict),
        ]

    def __getitem__(self, key: str) -> str:
        """Return the value of the header whose lowercased name is `key.lower()`."""
        lowered = key.lower()
        return self.root[lowered]

    def __setitem__(self, key: str, value: str) -> None:
        """Always raise `TypeError`; item assignment is not supported."""
        raise TypeError(f'{self.__class__.__name__} is immutable')

    def __delitem__(self, key: str) -> None:
        """Always raise `TypeError`; item deletion is not supported."""
        raise TypeError(f'{self.__class__.__name__} is immutable')

    def __or__(self, other: HttpHeaders) -> HttpHeaders:
        """Return a new `HttpHeaders` built from this instance's entries overlaid with those of `other`."""
        return HttpHeaders({**self.root, **other})

    def __ror__(self, other: HttpHeaders) -> HttpHeaders:
        """Support the reversed `other | self` operation: entries of `other` overlaid with this instance's."""
        return HttpHeaders({**other, **self.root})

    def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]
        """Iterate over the stored header names."""
        yield from self.root

    def __len__(self) -> int:
        """Return the number of stored headers."""
        return len(self.root)


@docs_group('Configuration')
class ConcurrencySettings:
    """Concurrency settings for AutoscaledPool."""

    def __init__(
        self,
        min_concurrency: int = 1,
        max_concurrency: int = 100,
        max_tasks_per_minute: float = float('inf'),
        desired_concurrency: int = 10,
    ) -> None:
        """Initialize a new instance.

        Args:
            min_concurrency: The minimum number of tasks running in parallel. If you set this value too high
                with respect to the available system memory and CPU, your code might run extremely slow or crash.
            max_concurrency: The maximum number of tasks running in parallel.
            max_tasks_per_minute: The maximum number of tasks per minute the pool can run. By default, this is set
                to infinity, but you can pass any positive, non-zero number.
            desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                if there is a large enough supply of them. Defaults to 10 and must lie between `min_concurrency`
                and `max_concurrency` (inclusive), so it has to be passed explicitly whenever 10 falls outside
                of that range.

        Raises:
            ValueError: If any of the values is out of its allowed range.
        """
        if min_concurrency < 1:
            raise ValueError('min_concurrency must be 1 or larger')

        if max_concurrency < min_concurrency:
            raise ValueError('max_concurrency cannot be less than min_concurrency')

        if desired_concurrency < min_concurrency:
            raise ValueError('desired_concurrency cannot be less than min_concurrency')

        if desired_concurrency > max_concurrency:
            raise ValueError('desired_concurrency cannot be greater than max_concurrency')

        # Written as a negated `> 0` so that NaN, for which every comparison is false, is rejected as well.
        if not max_tasks_per_minute > 0:
            raise ValueError('max_tasks_per_minute must be positive')

        self.min_concurrency = min_concurrency
        self.max_concurrency = max_concurrency
        self.desired_concurrency = desired_concurrency
        self.max_tasks_per_minute = max_tasks_per_minute


class EnqueueLinksKwargs(TypedDict):
    """Keyword arguments for the `enqueue_links` methods.

    Every key is optional (`NotRequired`).
    """

    limit: NotRequired[int]
    """Maximum number of requests to be enqueued."""

    base_url: NotRequired[str]
    """Base URL to be used for relative URLs."""

    strategy: NotRequired[EnqueueStrategy]
    """Enqueue strategy to be used for determining which links to extract and enqueue.

    Options:
        all: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all
            links, including those leading to external websites, are followed.
        same-domain: Enqueue links that share the same domain name as the current page, including any subdomains.
            This strategy is ideal for crawling within the same top-level domain while still allowing for subdomain
            exploration.
        same-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default
            behavior and restricts the crawl to the current hostname, excluding subdomains.
        same-origin: Enqueue links that share the same origin as the current page. The origin is defined by the
            combination of protocol, domain, and port, ensuring a strict scope for the crawl.
    """

    include: NotRequired[Sequence[re.Pattern | Glob]]
    """List of regular expressions or globs that URLs must match to be enqueued."""

    exclude: NotRequired[Sequence[re.Pattern | Glob]]
    """List of regular expressions or globs that URLs must not match to be enqueued."""


class AddRequestsKwargs(EnqueueLinksKwargs):
    """Keyword arguments for the `add_requests` methods.

    Extends `EnqueueLinksKwargs` with the requests to add and the identification of the target `RequestQueue`.
    Unlike the inherited keys, the keys declared here are not marked as `NotRequired`.
    """

    requests: Sequence[str | Request]
    """Requests to be added to the `RequestManager`."""

    rq_id: str | None
    """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""

    rq_name: str | None
    """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
    """

    rq_alias: str | None
    """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
    """


class PushDataKwargs(TypedDict):
    """Keyword arguments for dataset's `push_data` method.

    Currently declares no keys of its own; it serves as the base of `PushDataFunctionCall`.
    """


class PushDataFunctionCall(PushDataKwargs):
    """Arguments of a single `push_data` call, as recorded by `RequestHandlerRunResult.push_data`."""

    # The item or list of items passed to `push_data`.
    data: list[dict[str, Any]] | dict[str, Any]
    # Target dataset selectors exactly as passed by the caller; each may be `None`.
    dataset_id: str | None
    dataset_name: str | None
    dataset_alias: str | None


class KeyValueStoreInterface(Protocol):
    """The (limited) part of the `KeyValueStore` interface that should be accessible from a request handler."""

    # Typing overloads of `get_value`: without a default the result is `Any`; with a default of type `T` the result
    # is typed as `T` (or `T | None` when the default itself may be `None`).
    @overload
    async def get_value(self, key: str) -> Any: ...

    @overload
    async def get_value(self, key: str, default_value: T) -> T: ...

    @overload
    async def get_value(self, key: str, default_value: T | None = None) -> T | None: ...

    # Read the value stored under `key`, with an optional `default_value`.
    async def get_value(self, key: str, default_value: T | None = None) -> T | None: ...

    # Store `value` under `key`, optionally with an explicit `content_type`.
    async def set_value(
        self,
        key: str,
        value: Any,
        content_type: str | None = None,
    ) -> None: ...


@dataclass()
class KeyValueStoreValue:
    content: Any
    content_type: str | None


class KeyValueStoreChangeRecords:
    """In-memory record of pending writes layered over an actual `KeyValueStore`.

    Writes are only stored in `updates`; reads return a pending write when there is one for the key and otherwise
    fall through to the wrapped store.
    """

    def __init__(self, actual_key_value_store: KeyValueStore) -> None:
        self._actual_key_value_store = actual_key_value_store
        self.updates: dict[str, KeyValueStoreValue] = {}

    async def set_value(
        self,
        key: str,
        value: Any,
        content_type: str | None = None,
    ) -> None:
        """Record `value` (with its `content_type`) under `key` without touching the wrapped store."""
        pending = KeyValueStoreValue(value, content_type)
        self.updates[key] = pending

    @overload
    async def get_value(self, key: str) -> Any: ...

    @overload
    async def get_value(self, key: str, default_value: T) -> T: ...

    @overload
    async def get_value(self, key: str, default_value: T | None = None) -> T | None: ...

    async def get_value(self, key: str, default_value: T | None = None) -> T | None:
        """Return the pending value for `key` if one was recorded, otherwise ask the wrapped store."""
        if key not in self.updates:
            return await self._actual_key_value_store.get_value(key, default_value)

        return cast('T', self.updates[key].content)


class RequestHandlerRunResult:
    """Record of calls to storage-related context helpers."""

    def __init__(
        self,
        *,
        key_value_store_getter: GetKeyValueStoreFunction,
        request: Request,
    ) -> None:
        self._key_value_store_getter = key_value_store_getter

        # Recorded helper calls, in the order in which they were made.
        self.add_requests_calls: list[AddRequestsKwargs] = []
        self.push_data_calls: list[PushDataFunctionCall] = []

        # Pending key-value store writes, keyed by the `(id, name, alias)` triple used to open the store.
        self.key_value_store_changes: dict[
            tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords
        ] = {}

        # The handler works on its own deep copy of the request.
        self._request = deepcopy(request)

    @property
    def request(self) -> Request:
        """The deep copy of the request that was made when this record was created."""
        return self._request

    async def add_requests(
        self,
        requests: Sequence[str | Request],
        rq_id: str | None = None,
        rq_name: str | None = None,
        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> None:
        """Track a call to the `add_requests` context helper."""
        provided_selectors = [selector for selector in (rq_id, rq_name, rq_alias) if selector is not None]
        if len(provided_selectors) > 1:
            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')

        call = AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
        self.add_requests_calls.append(call)

    async def push_data(
        self,
        data: list[dict[str, Any]] | dict[str, Any],
        dataset_id: str | None = None,
        dataset_name: str | None = None,
        dataset_alias: str | None = None,
        **kwargs: Unpack[PushDataKwargs],
    ) -> None:
        """Track a call to the `push_data` context helper."""
        call = PushDataFunctionCall(
            data=data,
            dataset_id=dataset_id,
            dataset_name=dataset_name,
            dataset_alias=dataset_alias,
            **kwargs,
        )
        self.push_data_calls.append(call)

    async def get_key_value_store(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
    ) -> KeyValueStoreInterface:
        """Return the change-recording wrapper for the given store, creating it on first use."""
        store_key = (id, name, alias)

        if store_key not in self.key_value_store_changes:
            actual_store = await self._key_value_store_getter(id=id, name=name, alias=alias)
            self.key_value_store_changes[store_key] = KeyValueStoreChangeRecords(actual_store)

        return self.key_value_store_changes[store_key]

    def apply_request_changes(self, target: Request) -> None:
        """Apply tracked changes from handler copy to original request."""
        handler_copy = self.request

        if handler_copy.user_data != target.user_data:
            target.user_data = handler_copy.user_data

        if handler_copy.headers != target.headers:
            target.headers = handler_copy.headers


@docs_group('Functions')
class AddRequestsFunction(Protocol):
    """Function for adding requests to the `RequestManager`, with optional filtering.

    It simplifies the process of adding requests to the `RequestManager`. It automatically opens
    the specified one and adds the provided requests.
    """

    def __call__(
        self,
        requests: Sequence[str | Request],
        rq_id: str | None = None,
        rq_name: str | None = None,
        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> Coroutine[None, None, None]:
        """Add the given requests to the selected `RequestManager`.

        Args:
            requests: Requests to be added to the `RequestManager`.
            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
                provided.
            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
                can be provided.
            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
                can be provided.
            **kwargs: Additional keyword arguments (see `EnqueueLinksKwargs`).

        Returns:
            A coroutine that has to be awaited.
        """


@docs_group('Functions')
class EnqueueLinksFunction(Protocol):
    """A function for enqueueing new URLs to crawl based on elements selected by a given selector or explicit requests.

    It adds explicitly passed `requests` to the `RequestManager` or it extracts URLs from the current page and enqueues
    them for further crawling. It allows filtering through selectors and other options. You can also specify labels and
    user data to be associated with the newly created `Request` objects.

    It should not be called with `selector`, `label`, `user_data` or `transform_request_function` arguments together
    with `requests` argument. The two `@overload` signatures below encode these two mutually exclusive call shapes.

    For even more control over the enqueued links you can use combination of `ExtractLinksFunction` and
    `AddRequestsFunction`.
    """

    # Call shape 1: extract links from the current page (no `requests` argument).
    @overload
    def __call__(
        self,
        *,
        selector: str | None = None,
        attribute: str | None = None,
        label: str | None = None,
        user_data: dict[str, Any] | None = None,
        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
        rq_id: str | None = None,
        rq_name: str | None = None,
        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> Coroutine[None, None, None]: ...

    # Call shape 2: enqueue explicitly provided `requests` (no extraction-related arguments).
    @overload
    def __call__(
        self,
        *,
        requests: Sequence[str | Request] | None = None,
        rq_id: str | None = None,
        rq_name: str | None = None,
        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> Coroutine[None, None, None]: ...

    # Implementation signature: the union of both overloads; every argument is keyword-only.
    def __call__(
        self,
        *,
        selector: str | None = None,
        attribute: str | None = None,
        label: str | None = None,
        user_data: dict[str, Any] | None = None,
        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
        requests: Sequence[str | Request] | None = None,
        rq_id: str | None = None,
        rq_name: str | None = None,
        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> Coroutine[None, None, None]:
        """Call enqueue links function.

        All arguments are keyword-only and default to `None`.

        Args:
            selector: A selector used to find the elements containing the links. The behaviour differs based
                on the crawler used:
                - `PlaywrightCrawler` supports CSS and XPath selectors.
                - `ParselCrawler` supports CSS selectors.
                - `BeautifulSoupCrawler` supports CSS selectors.
            attribute: Which node attribute to extract the links from.
            label: Label for the newly created `Request` objects, used for request routing.
            user_data: User data to be provided to the newly created `Request` objects.
            transform_request_function: A function that takes `RequestOptions` and returns either:
                - Modified `RequestOptions` to update the request configuration,
                - `'skip'` to exclude the request from being enqueued,
                - `'unchanged'` to use the original request options without modification.
            requests: Requests to be added to the `RequestManager`.
            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
                provided.
            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
                can be provided.
            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
                can be provided.
            **kwargs: Additional keyword arguments, typed as `EnqueueLinksKwargs`.

        Returns:
            A coroutine that yields `None` once awaited.
        """


@docs_group('Functions')
class ExtractLinksFunction(Protocol):
    """A function for extracting URLs to crawl based on elements selected by a given selector.

    It extracts URLs from the current page and allows filtering through selectors and other options. You can also
    specify labels and user data to be associated with the newly created `Request` objects.

    The extracted requests are returned to the caller as a list of `Request` objects.
    """

    def __call__(
        self,
        *,
        selector: str = 'a',
        attribute: str = 'href',
        label: str | None = None,
        user_data: dict[str, Any] | None = None,
        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> Coroutine[None, None, list[Request]]:
        """Call extract links function.

        All arguments are keyword-only.

        Args:
            selector: A selector used to find the elements containing the links. Defaults to `'a'`. The behaviour
                differs based on the crawler used:
                - `PlaywrightCrawler` supports CSS and XPath selectors.
                - `ParselCrawler` supports CSS selectors.
                - `BeautifulSoupCrawler` supports CSS selectors.
            attribute: Which node attribute to extract the links from. Defaults to `'href'`.
            label: Label for the newly created `Request` objects, used for request routing.
            user_data: User data to be provided to the newly created `Request` objects.
            transform_request_function: A function that takes `RequestOptions` and returns either:
                - Modified `RequestOptions` to update the request configuration,
                - `'skip'` to exclude the request from being enqueued,
                - `'unchanged'` to use the original request options without modification.
            **kwargs: Additional keyword arguments, typed as `EnqueueLinksKwargs`.

        Returns:
            A coroutine that yields the list of extracted `Request` objects once awaited.
        """


@docs_group('Functions')
class GetKeyValueStoreFunction(Protocol):
    """A function for accessing a `KeyValueStore`.

    It retrieves an instance of a `KeyValueStore` based on its ID, name or alias.
    """

    def __call__(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
    ) -> Coroutine[None, None, KeyValueStore]:
        """Get a `KeyValueStore` selected by keyword-only `id`, `name` or `alias`.

        Args:
            id: The ID of the `KeyValueStore` to get.
            name: The name of the `KeyValueStore` to get (global scope, named storage).
            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).

        Returns:
            A coroutine that yields the `KeyValueStore` once awaited.
        """


class GetKeyValueStoreFromRequestHandlerFunction(Protocol):
    """A function for accessing a `KeyValueStore` from within a request handler.

    It takes the same `id`, `name` and `alias` arguments as `GetKeyValueStoreFunction`, but its result is typed as
    `KeyValueStoreInterface` instead of `KeyValueStore`.
    """

    def __call__(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
    ) -> Coroutine[None, None, KeyValueStoreInterface]:
        """Get a key-value store selected by keyword-only `id`, `name` or `alias`.

        Args:
            id: The ID of the `KeyValueStore` to get.
            name: The name of the `KeyValueStore` to get (global scope, named storage).
            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).

        Returns:
            A coroutine that yields a `KeyValueStoreInterface` once awaited.
        """


@docs_group('Functions')
class PushDataFunction(Protocol):
    """A function for pushing data to a `Dataset`.

    It simplifies the process of adding data to a `Dataset`. It opens the specified one and pushes
    the provided data to it.

    The target `Dataset` may be selected through `dataset_id`, `dataset_name` or `dataset_alias`; all three default
    to `None`.
    """

    def __call__(
        self,
        data: list[dict[str, Any]] | dict[str, Any],
        dataset_id: str | None = None,
        dataset_name: str | None = None,
        dataset_alias: str | None = None,
        **kwargs: Unpack[PushDataKwargs],
    ) -> Coroutine[None, None, None]:
        """Push the given data to a `Dataset`.

        Args:
            data: The data to push to the `Dataset`; either a single dictionary or a list of dictionaries.
            dataset_id: The ID of the `Dataset` to push the data to.
            dataset_name: The name of the `Dataset` to push the data to (global scope, named storage).
            dataset_alias: The alias of the `Dataset` to push the data to (run scope, unnamed storage).
            **kwargs: Additional keyword arguments, typed as `PushDataKwargs`.

        Returns:
            A coroutine that yields `None` once awaited.
        """


@docs_group('Functions')
class SendRequestFunction(Protocol):
    """A function for sending HTTP requests.

    It simplifies the process of sending HTTP requests. It is implemented by the crawling context and is used
    within request handlers to send additional HTTP requests to target URLs.
    """

    def __call__(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        payload: HttpPayload | None = None,
        headers: HttpHeaders | dict[str, str] | None = None,
    ) -> Coroutine[None, None, HttpResponse]:
        """Call send request function.

        Only `url` may be passed positionally; the remaining arguments are keyword-only.

        Args:
            url: The URL to send the request to.
            method: The HTTP method to use. Defaults to `'GET'`.
            payload: The payload to include in the request.
            headers: The headers to include in the request, either as `HttpHeaders` or as a plain `dict`.

        Returns:
            The HTTP response received from the server.
        """


@docs_group('Other')
@dataclasses.dataclass
class PageSnapshot:
    """Snapshot of a crawled page.

    Both parts are optional. The snapshot is truthy only when at least one of them holds non-empty content.
    """

    screenshot: bytes | None = None
    """Screenshot of the page as bytes, if available."""

    html: str | None = None
    """HTML content of the page."""

    def __bool__(self) -> bool:
        """Return `True` when the snapshot carries a non-empty screenshot or non-empty HTML."""
        if self.screenshot:
            return True
        return bool(self.html)


@docs_group('Functions')
class UseStateFunction(Protocol):
    """A function for managing state within the crawling context.

    It allows the use of persistent state across multiple crawls.

    Warning:
        This is an experimental feature. The behavior and interface may change in future versions.
    """

    def __call__(
        self,
        default_value: dict[str, JsonSerializable] | None = None,
    ) -> Coroutine[None, None, dict[str, JsonSerializable]]:
        """Get the current state, optionally supplying a default for initialization.

        Args:
            default_value: The default value to initialize the state if it is not already set. Defaults to `None`.

        Returns:
            A coroutine that yields the current state (a dictionary with string keys) once awaited.
        """


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class BasicCrawlingContext:
    """Basic crawling context.

    This is the fundamental context handed out by the `BasicCrawler`. More specific crawlers extend it
    with additional fields and helpers.
    """

    request: Request
    """Request object for the current page being processed."""

    session: Session | None
    """Session object for the current page being processed."""

    proxy_info: ProxyInfo | None
    """Proxy information for the current page being processed."""

    send_request: SendRequestFunction
    """Send request crawling context helper function."""

    add_requests: AddRequestsFunction
    """Add requests crawling context helper function."""

    push_data: PushDataFunction
    """Push data crawling context helper function."""

    use_state: UseStateFunction
    """Use state crawling context helper function."""

    get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction
    """Get key-value store crawling context helper function."""

    log: logging.Logger
    """Logger instance."""

    async def get_snapshot(self) -> PageSnapshot:
        """Get snapshot of crawled page.

        The basic context has nothing to capture, so an empty `PageSnapshot` is returned.
        """
        return PageSnapshot()

    def __hash__(self) -> int:
        """Return an identity-based hash, so that every context instance is treated as unique."""
        return id(self)

    def create_modified_copy(
        self,
        push_data: PushDataFunction | None = None,
        add_requests: AddRequestsFunction | None = None,
        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
    ) -> Self:
        """Create a copy of the crawling context with the given helper functions swapped in.

        Args:
            push_data: Replacement for the `push_data` helper; `None` keeps the current one.
            add_requests: Replacement for the `add_requests` helper; `None` keeps the current one.
            get_key_value_store: Replacement for the `get_key_value_store` helper; `None` keeps the current one.

        Returns:
            A new context produced by `dataclasses.replace` with the non-`None` replacements applied.
        """
        candidates: dict[str, Any] = {
            'push_data': push_data,
            'add_requests': add_requests,
            'get_key_value_store': get_key_value_store,
        }
        # Only helpers that were actually supplied are overridden in the copy.
        overrides = {field_name: helper for field_name, helper in candidates.items() if helper is not None}

        return dataclasses.replace(self, **overrides)


class GetDataKwargs(TypedDict):
    """Keyword arguments for dataset's `get_data` method.

    Every key is declared `NotRequired`, so any subset of them (including none) may be supplied.
    """

    offset: NotRequired[int]
    """Skips the specified number of items at the start."""

    limit: NotRequired[int | None]
    """The maximum number of items to retrieve. Unlimited if None."""

    clean: NotRequired[bool]
    """Return only non-empty items and excludes hidden fields. Shortcut for `skip_hidden` and `skip_empty`."""

    desc: NotRequired[bool]
    """Set to True to sort results in descending order."""

    fields: NotRequired[list[str]]
    """Fields to include in each item. Sorts fields as specified if provided."""

    omit: NotRequired[list[str]]
    """Fields to exclude from each item."""

    unwind: NotRequired[list[str]]
    """Unwinds items by a specified array field, turning each element into a separate item."""

    skip_empty: NotRequired[bool]
    """Excludes empty items from the results if True."""

    skip_hidden: NotRequired[bool]
    """Excludes fields starting with '#' if True."""

    flatten: NotRequired[list[str]]
    """Fields to be flattened in returned items."""

    view: NotRequired[str]
    """Specifies the dataset view to be used."""


class ExportToKwargs(TypedDict):
    """Keyword arguments for dataset's `export_to` method.

    `key` is the only required entry; all `to_kvs_*` entries and `content_type` are optional.
    """

    key: Required[str]
    """The key under which to save the data."""

    content_type: NotRequired[Literal['json', 'csv']]
    """The format in which to export the data. Either 'json' or 'csv'."""

    to_kvs_id: NotRequired[str]
    """ID of the key-value store to save the exported file."""

    to_kvs_name: NotRequired[str]
    """Name of the key-value store to save the exported file."""

    to_kvs_storage_client: NotRequired[StorageClient]
    """The storage client to use for saving the exported file."""

    to_kvs_configuration: NotRequired[Configuration]
    """The configuration to use for saving the exported file."""


class ExportDataJsonKwargs(TypedDict):
    """Keyword arguments for dataset's `export_data_json` method.

    Every key is optional. The key names match keyword parameters of the standard library `json.dump` function.
    """

    skipkeys: NotRequired[bool]
    """If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped
    instead of raising a `TypeError`."""

    ensure_ascii: NotRequired[bool]
    """Determines if non-ASCII characters should be escaped in the output JSON string."""

    check_circular: NotRequired[bool]
    """If False (default: True), skips the circular reference check for container types. A circular reference will
    result in a `RecursionError` or worse if unchecked."""

    allow_nan: NotRequired[bool]
    """If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply
    with the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)."""

    cls: NotRequired[type[json.JSONEncoder]]
    """Allows specifying a custom JSON encoder."""

    indent: NotRequired[int]
    """Specifies the number of spaces for indentation in the pretty-printed JSON output."""

    separators: NotRequired[tuple[str, str]]
    """A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')
    otherwise."""

    default: NotRequired[Callable]
    """A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version
    of the object or raise a `TypeError`."""

    sort_keys: NotRequired[bool]
    """Specifies whether the output JSON object should have keys sorted alphabetically."""


class ExportDataCsvKwargs(TypedDict):
    """Keyword arguments for dataset's `export_data_csv` method.

    Every key is optional. The key names match the dialect and formatting parameters of the standard library
    `csv` module.
    """

    dialect: NotRequired[str]
    """Specifies a dialect to be used in CSV parsing and writing."""

    delimiter: NotRequired[str]
    """A one-character string used to separate fields. Defaults to ','."""

    doublequote: NotRequired[bool]
    """Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;
    when False, the `escapechar` is used as a prefix. Defaults to True."""

    escapechar: NotRequired[str]
    """A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`
    if `doublequote` is False. Defaults to None, disabling escaping."""

    lineterminator: NotRequired[str]
    """The string used to terminate lines produced by the writer. Defaults to '\\r\\n'."""

    quotechar: NotRequired[str]
    """A one-character string used to quote fields containing special characters, like the delimiter or quotechar,
    or fields containing new-line characters. Defaults to '\"'."""

    quoting: NotRequired[int]
    """Controls when quotes should be generated by the writer and recognized by the reader. Can take any of
    the `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`."""

    skipinitialspace: NotRequired[bool]
    """When True, spaces immediately following the delimiter are ignored. Defaults to False."""

    strict: NotRequired[bool]
    """When True, raises an exception on bad CSV input. Defaults to False."""


================================================
FILE: src/crawlee/_utils/__init__.py
================================================


================================================
FILE: src/crawlee/_utils/blocked.py
================================================
from __future__ import annotations

# Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/utils/src/internals/blocked.ts

# Selector for an iframe inside `#turnstile-wrapper` whose `src` starts with the Cloudflare challenges URL.
CLOUDFLARE_RETRY_CSS_SELECTORS = [
    '#turnstile-wrapper iframe[src^="https://challenges.cloudflare.com"]',
]

# Superset of the Cloudflare selectors: it additionally matches a link to Google's terms of service inside
# `div#infoDiv0` and an iframe whose `src` contains `_Incapsula_Resource`.
RETRY_CSS_SELECTORS = [
    *CLOUDFLARE_RETRY_CSS_SELECTORS,
    'div#infoDiv0 a[href*="//www.google.com/policies/terms/"]',
    'iframe[src*="_Incapsula_Resource"]',
]
"""
CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked.
"""

# NOTE(review): these look like fragments to be searched for inside error messages (substring match) rather than
# complete messages - confirm against the code that consumes this list.
ROTATE_PROXY_ERRORS = [
    'ECONNRESET',
    'ECONNREFUSED',
    'ERR_PROXY_CONNECTION_FAILED',
    'ERR_TUNNEL_CONNECTION_FAILED',
    'Proxy responded with',
    'unsuccessful tunnel',
    'TunnelUnsuccessful',
]
"""
Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning.
"""


================================================
FILE: src/crawlee/_utils/byte_size.py
================================================
from __future__ import annotations

from dataclasses import dataclass
from typing import Any

_BYTES_PER_KB = 1024
_BYTES_PER_MB = _BYTES_PER_KB**2
_BYTES_PER_GB = _BYTES_PER_KB**3
_BYTES_PER_TB = _BYTES_PER_KB**4


@dataclass(frozen=True)
class ByteSize:
    """Immutable, non-negative amount of bytes with binary-unit conversions (1 KB = 1024 B).

    Instances support comparison with other `ByteSize` objects, addition and subtraction of `ByteSize` objects,
    multiplication by a number and division by another `ByteSize` (yielding a plain ratio).
    """

    bytes: int

    def __post_init__(self) -> None:
        """Reject negative sizes."""
        if self.bytes < 0:
            raise ValueError('ByteSize cannot be negative')

    @classmethod
    def validate(cls, value: Any) -> ByteSize:
        """Coerce `value` into a `ByteSize`.

        An existing `ByteSize` is returned as is; an `int` or `float` is truncated to an integer number of bytes.

        Raises:
            TypeError: If `value` is neither a `ByteSize` nor a number.
        """
        if isinstance(value, ByteSize):
            return value

        if isinstance(value, (float, int)):
            return cls(int(value))

        raise TypeError('Value must be numeric')

    @classmethod
    def from_kb(cls, kb: float) -> ByteSize:
        """Build a `ByteSize` from a number of kilobytes (truncated to whole bytes)."""
        byte_count = int(kb * _BYTES_PER_KB)
        return cls(byte_count)

    @classmethod
    def from_mb(cls, mb: float) -> ByteSize:
        """Build a `ByteSize` from a number of megabytes (truncated to whole bytes)."""
        byte_count = int(mb * _BYTES_PER_MB)
        return cls(byte_count)

    @classmethod
    def from_gb(cls, gb: float) -> ByteSize:
        """Build a `ByteSize` from a number of gigabytes (truncated to whole bytes)."""
        byte_count = int(gb * _BYTES_PER_GB)
        return cls(byte_count)

    @classmethod
    def from_tb(cls, tb: float) -> ByteSize:
        """Build a `ByteSize` from a number of terabytes (truncated to whole bytes)."""
        byte_count = int(tb * _BYTES_PER_TB)
        return cls(byte_count)

    def to_kb(self) -> float:
        """Return the size expressed in kilobytes."""
        return self.bytes / _BYTES_PER_KB

    def to_mb(self) -> float:
        """Return the size expressed in megabytes."""
        return self.bytes / _BYTES_PER_MB

    def to_gb(self) -> float:
        """Return the size expressed in gigabytes."""
        return self.bytes / _BYTES_PER_GB

    def to_tb(self) -> float:
        """Return the size expressed in terabytes."""
        return self.bytes / _BYTES_PER_TB

    def __str__(self) -> str:
        """Format the size using the largest unit that does not exceed it, with two decimal places."""
        # Ordered from the largest unit down so the first match is the most compact representation.
        unit_table = (
            (_BYTES_PER_TB, self.to_tb, 'TB'),
            (_BYTES_PER_GB, self.to_gb, 'GB'),
            (_BYTES_PER_MB, self.to_mb, 'MB'),
            (_BYTES_PER_KB, self.to_kb, 'KB'),
        )
        for threshold, convert, label in unit_table:
            if self.bytes >= threshold:
                return f'{convert():.2f} {label}'
        return f'{self.bytes} B'

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ByteSize):
            return NotImplemented
        return self.bytes == other.bytes

    def __hash__(self) -> int:
        """Return hash based on the bytes value."""
        return hash(self.bytes)

    def __lt__(self, other: object) -> bool:
        if not isinstance(other, ByteSize):
            return NotImplemented
        return self.bytes < other.bytes

    def __le__(self, other: object) -> bool:
        if not isinstance(other, ByteSize):
            return NotImplemented
        return self.bytes <= other.bytes

    def __gt__(self, other: object) -> bool:
        if not isinstance(other, ByteSize):
            return NotImplemented
        return self.bytes > other.bytes

    def __ge__(self, other: object) -> bool:
        if not isinstance(other, ByteSize):
            return NotImplemented
        return self.bytes >= other.bytes

    def __add__(self, other: object) -> ByteSize:
        if not isinstance(other, ByteSize):
            return NotImplemented
        return ByteSize(self.bytes + other.bytes)

    def __sub__(self, other: object) -> ByteSize:
        if not isinstance(other, ByteSize):
            return NotImplemented

        difference = self.bytes - other.bytes
        if difference < 0:
            raise ValueError('Resulting ByteSize cannot be negative')
        return ByteSize(difference)

    def __mul__(self, other: object) -> ByteSize:
        if not isinstance(other, (int, float)):
            return NotImplemented
        # The product is truncated to a whole number of bytes.
        return ByteSize(int(self.bytes * other))

    def __truediv__(self, other: object) -> float:
        if not isinstance(other, ByteSize):
            return NotImplemented

        if other.bytes == 0:
            raise ZeroDivisionError('Cannot divide by zero')
        return self.bytes / other.bytes

    def __rmul__(self, other: object) -> ByteSize:
        return self.__mul__(other)


================================================
FILE: src/crawlee/_utils/console.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Sequence

BORDER = {'TL': '┌', 'TR': '┐', 'BL': '└', 'BR': '┘', 'H': '─', 'V': '│', 'TM': '┬', 'BM': '┴'}


def make_table(rows: Sequence[Sequence[str]], width: int = 100) -> str:
    """Create a text table using Unicode characters.

    Rows shorter than the widest row are padded with empty cells. Cell values are converted with `str()` before
    measuring and rendering, so non-string values (e.g. numbers) are rendered instead of raising an error.

    Args:
        rows: A list of tuples/lists to be displayed in the table.
        width: Maximum width of the table. When the natural table width exceeds it, all columns get the same
            width (at least 3 characters) and longer cells are cut and suffixed with `...`.

    Returns:
        The rendered table, or an empty string when there are no rows or no columns.
    """
    if not rows:
        return ''

    num_cols = max(len(row) for row in rows)

    if num_cols == 0:
        return ''

    # Normalize the row size by filling missing columns with empty values. Cells are coerced to `str` here, once,
    # so that the width computation and the rendering below always operate on the same text.
    normalized_rows = [[str(cell) for cell in row] + [''] * (num_cols - len(row)) for row in rows]
    col_widths = [max(len(row[i]) for row in normalized_rows) for i in range(num_cols)]

    # Each column contributes its content plus one space of padding on both sides and one vertical border;
    # the extra 1 accounts for the closing border.
    total_width = sum(col_widths) + (3 * num_cols) + 1

    # If the table size is larger than `width`, set all columns to the same length
    if total_width > width:
        col_widths = [max(3, (width - (3 * num_cols) - 1) // num_cols)] * num_cols

    horizontal_segments = [BORDER['H'] * (col_width + 2) for col_width in col_widths]
    top_border = BORDER['TL'] + BORDER['TM'].join(horizontal_segments) + BORDER['TR']
    bottom_border = BORDER['BL'] + BORDER['BM'].join(horizontal_segments) + BORDER['BR']

    result = [top_border]

    for row in normalized_rows:
        cells = []

        for cell, col_width in zip(row, col_widths):
            # Trim the content if the length exceeds the widths of the column
            if len(cell) > col_width:
                cells.append(f'{cell[: col_width - 3]}...')
            else:
                cells.append(cell.ljust(col_width))

        # row: │ cell1 │ cell2 │ ...
        result.append(BORDER['V'] + ''.join(f' {cell} {BORDER["V"]}' for cell in cells))

    result.append(bottom_border)

    return '\n'.join(result)


================================================
FILE: src/crawlee/_utils/context.py
================================================
from __future__ import annotations

import inspect
from collections.abc import Callable
from functools import wraps
from typing import Any, TypeVar, cast

T = TypeVar('T', bound=Callable[..., Any])


def ensure_context(method: T) -> T:
    """Guard a method so that it only runs while its instance's (async) context manager is active.

    The returned wrapper inspects the `active` attribute of the instance the method is called on. A missing
    attribute or a falsy value results in a `RuntimeError`; otherwise the call is forwarded to `method`.
    Coroutine functions get an `async` wrapper, everything else gets a plain one.

    Args:
        method: The method to wrap.

    Returns:
        The wrapped method with context checking applied.

    Raises:
        RuntimeError: If the instance lacks an `active` attribute or is not active.
    """
    if inspect.iscoroutinefunction(method):

        @wraps(method)
        async def async_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
            if not hasattr(self, 'active'):
                raise RuntimeError(f'The {self.__class__.__name__} does not have the "active" attribute.')

            if not self.active:
                raise RuntimeError(f'The {self.__class__.__name__} is not active. Use it within the async context.')

            return await method(self, *args, **kwargs)

        return cast('T', async_wrapper)

    @wraps(method)
    def sync_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
        if not hasattr(self, 'active'):
            raise RuntimeError(f'The {self.__class__.__name__} does not have the "active" attribute.')

        if not self.active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active. Use it within the context.')

        return method(self, *args, **kwargs)

    return cast('T', sync_wrapper)


================================================
FILE: src/crawlee/_utils/crypto.py
================================================
from __future__ import annotations

import secrets
from hashlib import sha256


def compute_short_hash(data: bytes, *, length: int = 8) -> str:
    """Return the leading characters of the hexadecimal SHA-256 digest of `data`.

    Args:
        data: The binary data to be hashed.
        length: How many leading hexadecimal characters of the digest to keep.

    Returns:
        The first `length` characters of the hexadecimal digest.
    """
    full_digest = sha256(data).hexdigest()
    short_digest = full_digest[:length]
    return short_digest


def crypto_random_object_id(length: int = 17) -> str:
    """Generate a random object ID.

    The ID consists of `length` characters, each drawn independently with `secrets.choice` from ASCII letters
    and digits.

    Args:
        length: Number of characters in the generated ID.

    Returns:
        The generated ID.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyzABCEDFGHIJKLMNOPQRSTUVWXYZ0123456789'
    picked = [secrets.choice(alphabet) for _ in range(length)]
    return ''.join(picked)


================================================
FILE: src/crawlee/_utils/docs.py
================================================
from __future__ import annotations

from collections.abc import Callable
from typing import Any, Literal, TypeVar

# The order of the rendered API groups is defined in the website/docusaurus.config.js file.
# The order of the rendered API groups is defined in the website/docusaurus.config.js file.
GroupName = Literal[
    'Autoscaling',
    'Browser management',
    'Configuration',
    'Crawlers',
    'Crawling contexts',
    'Errors',
    'Event data',
    'Event managers',
    'Functions',
    'HTTP clients',
    'HTTP parsers',
    'Request loaders',
    'Session management',
    'Statistics',
    'Storage clients',
    'Storage data',
    'Storages',
    'Other',
]

T = TypeVar('T', bound=Callable[..., Any])


def docs_group(group_name: GroupName) -> Callable[[T], T]:  # noqa: ARG001
    """Tag a symbol with the documentation group it should be rendered under.

    The decorator exists purely as a marker for the documentation tooling: `group_name` is not used at runtime
    and the decorated object is handed back untouched.

    Args:
        group_name: The documentation group to which the symbol belongs.

    Returns:
        A decorator that returns the object it receives without modification.
    """

    def passthrough(func: T) -> T:
        # Identity: the decorated symbol itself is returned, not a wrapper around it.
        return func

    return passthrough


================================================
FILE: src/crawlee/_utils/file.py
================================================
from __future__ import annotations

import asyncio
import csv
import json
import os
import sys
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, overload

if TYPE_CHECKING:
    from collections.abc import AsyncIterator
    from typing import Any, TextIO

    from typing_extensions import Unpack

    from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs

if sys.platform == 'win32':

    def _write_file(path: Path, data: str | bytes) -> None:
        """Windows-specific file write implementation.

        This implementation writes directly to the file without using a temporary file, because
        they are problematic due to permissions issues on Windows.
        """
        if isinstance(data, bytes):
            path.write_bytes(data)
        elif isinstance(data, str):
            path.write_text(data, encoding='utf-8')
        else:
            raise TypeError(f'Unsupported data type: {type(data)}. Expected str or bytes.')
else:

    def _write_file(path: Path, data: str | bytes) -> None:
        """Linux/Unix-specific file write implementation using temporary files."""
        dir_path = path.parent
        fd, tmp_path = tempfile.mkstemp(
            suffix=f'{path.suffix}.tmp',
            prefix=f'{path.name}.',
            dir=str(dir_path),
        )

        if not isinstance(data, (str, bytes)):
            raise TypeError(f'Unsupported data type: {type(data)}. Expected str or bytes.')

        try:
            if isinstance(data, bytes):
                with os.fdopen(fd, 'wb') as tmp_file:
                    tmp_file.write(data)
            else:
                with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file:
                    tmp_file.write(data)

            # Atomically replace the destination file with the temporary file
            Path(tmp_path).replace(path)
        except Exception:
            Path(tmp_path).unlink(missing_ok=True)
            raise


def infer_mime_type(value: Any) -> str:
    """Pick a MIME content type that matches the Python type of `value`.

    Args:
        value: The value to infer the content type from.

    Returns:
        `application/octet-stream` for `bytes`/`bytearray`, JSON for `dict`/`list`, plain text for
        `str`/`int`/`float`/`bool`, and `application/octet-stream` for anything else.
    """
    binary_mime = 'application/octet-stream'

    # Checked in order; the first group containing the value's type decides the result.
    mime_by_types = (
        ((bytes, bytearray), binary_mime),
        ((dict, list), 'application/json; charset=utf-8'),
        ((str, int, float, bool), 'text/plain; charset=utf-8'),
    )

    for accepted_types, mime in mime_by_types:
        if isinstance(value, accepted_types):
            return mime

    # Unknown types fall back to the generic binary content type.
    return binary_mime


async def json_dumps(obj: Any) -> str:
    """Serialize an object to a pretty-printed JSON string in a worker thread.

    The serialization keeps non-ASCII characters unescaped, indents with two spaces and falls back to `str()`
    for values that JSON cannot represent natively.

    Args:
        obj: The object to serialize.

    Returns:
        A string containing the JSON representation of the input object.
    """

    def serialize() -> str:
        return json.dumps(obj, ensure_ascii=False, indent=2, default=str)

    # Run the serialization off the event loop thread.
    return await asyncio.to_thread(serialize)


@overload
async def atomic_write(
    path: Path,
    data: str,
    *,
    retry_count: int = 0,
) -> None: ...


@overload
async def atomic_write(
    path: Path,
    data: bytes,
    *,
    retry_count: int = 0,
) -> None: ...


async def atomic_write(
    path: Path,
    data: str | bytes,
    *,
    retry_count: int = 0,
) -> None:
    """Write text or binary data to a file, retrying a few times on transient file-system errors.

    The actual write is delegated to the platform-specific `_write_file` helper, executed in a worker thread.
    Whether the file is written in text or binary mode follows from the type of `data` (`str` = text,
    `bytes` = binary). On non-Windows platforms the helper writes to a temporary file and then replaces
    the destination, so an interrupted write cannot leave a partially written target; on Windows the helper
    writes to the destination directly.

    A `FileNotFoundError` or `PermissionError` triggers another attempt until `retry_count` reaches 3, after
    which the error propagates to the caller.

    Args:
        path: The path to the destination file.
        data: The data to write to the file (string or bytes).
        retry_count: Internal parameter to track the number of retry attempts (default: 0).
    """
    max_retries = 3

    try:
        # Use the platform-specific write function resolved at import time.
        await asyncio.to_thread(_write_file, path, data)
    except (FileNotFoundError, PermissionError):
        # If we reach the maximum number of retries, raise the exception.
        if retry_count >= max_retries:
            raise

        await atomic_write(
            path,
            data,
            retry_count=retry_count + 1,
        )


async def export_json_to_stream(
    iterator: AsyncIterator[dict[str, Any]],
    dst: TextIO,
    **kwargs: Unpack[ExportDataJsonKwargs],
) -> None:
    """Drain an async iterator of items and write them to a stream as a single JSON array.

    Args:
        iterator: Asynchronous source of the items to export.
        dst: Text stream that receives the JSON output.
        **kwargs: Extra keyword arguments forwarded to `json.dump`.
    """
    # All items are collected first, because they are emitted as one JSON array.
    collected: list[dict[str, Any]] = []
    async for record in iterator:
        collected.append(record)

    json.dump(collected, dst, **kwargs)


async def export_csv_to_stream(
    iterator: AsyncIterator[dict[str, Any]],
    dst: TextIO,
    **kwargs: Unpack[ExportDataCsvKwargs],
) -> None:
    """Stream items from an async iterator into a text stream as CSV rows.

    The keys of the first non-empty item are written as the header row; every non-empty item then
    contributes one row made of its values. Empty items are skipped.

    Args:
        iterator: Asynchronous source of the items to export.
        dst: Text stream that receives the CSV output.
        **kwargs: Extra keyword arguments forwarded to `csv.writer`.
    """
    # Unless the caller chose a line terminator, use '\n'. The csv.writer default is '\r\n', which a text-mode
    # file on Windows would translate into '\r\r\n'. With '\n' the platform performs the conversion itself:
    # it stays '\n' on Unix and becomes '\r\n' on Windows.
    kwargs.setdefault('lineterminator', '\n')

    csv_writer = csv.writer(dst, **kwargs)
    header_written = False

    async for row in iterator:
        if not row:
            continue

        # The header is derived from the first non-empty item only.
        if not header_written:
            csv_writer.writerow(row.keys())
            header_written = True

        csv_writer.writerow(row.values())


================================================
FILE: src/crawlee/_utils/globs.py
================================================
from __future__ import annotations

import os
import re
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Sequence


class Glob:
    """A glob pattern (supporting the `*`, `**`, `?` wildcards) together with its compiled regular expression."""

    def __init__(self, glob: str) -> None:
        # Keep the original pattern text next to the regular expression derived from it.
        self.glob = glob
        translated = _translate(glob, recursive=True)
        self.regexp = re.compile(translated)


def _translate(
    pat: str, *, recursive: bool = False, include_hidden: bool = False, seps: Sequence[str] | None = None
) -> str:
    """Translate a pathname with shell wildcards to a regular expression.

    If `recursive` is true, the pattern segment '**' will match any number of
    path segments.

    If `include_hidden` is true, wildcards can match path segments beginning
    with a dot ('.').

    If a sequence of separator characters is given to `seps`, they will be
    used to split the pattern into segments and match path separators. If not
    given, os.path.sep and os.path.altsep (where available) are used.

    HACK: This function is copied from CPython stdlib source. It will be released in Python 3.13 as `glob.translate`
    """
    _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps

    escaped_seps = ''.join(map(re.escape, _seps))
    any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps
    not_sep = f'[^{escaped_seps}]'

    if include_hidden:
        one_last_segment = f'{not_sep}+'
        one_segment = f'{one_last_segment}{any_sep}'
        any_segments = f'(?:.+{any_sep})?'
        any_last_segments = '.*'
    else:
        one_last_segment = f'[^{escaped_seps}.]{not_sep}*'
        one_segment = f'{one_last_segment}{any_sep}'
        any_segments = f'(?:{one_segment})*'
        any_last_segments = f'{any_segments}(?:{one_last_segment})?'

    results = []
    parts = re.split(any_sep, pat)
    last_part_idx = len(parts) - 1
    for idx, part in enumerate(parts):
        if part == '*':
            results.append(one_segment if idx < last_part_idx else one_last_segment)
        elif recursive and part == '**':
            if idx < last_part_idx:
                if parts[idx + 1] != '**':
                    results.append(any_segments)
            else:
                results.append(any_last_segments)
        else:
            if part:
                if not include_hidden and part[0] in '*?':
                    results.append(r'(?!\.)')
                results.extend(_fnmatch_translate(part, f'{not_sep}*', not_sep))
            if idx < last_part_idx:
                results.append(any_sep)
    res = ''.join(results)
    return rf'(?s:{res})\Z'


def _fnmatch_translate(pat: str, star: str, question_mark: str) -> list[str]:
    """Translate a single shell-style pattern into a list of regular expression fragments.

    Copy of fnmatch._translate from Python 3.13.

    Args:
        pat: The pattern to translate. `*`, `?` and `[...]` are treated specially, every other
            character is matched literally.
        star: The regular expression fragment emitted for `*`.
        question_mark: The regular expression fragment emitted for `?`.

    Returns:
        The regular expression fragments, in pattern order. They are not joined here.
    """
    res = list[str]()
    add = res.append
    i, n = 0, len(pat)
    while i < n:
        c = pat[i]
        i = i + 1
        if c == '*':
            # compress consecutive `*` into one
            # (the check is by identity, so only a fragment appended by this very branch counts)
            if (not res) or res[-1] is not star:
                add(star)
        elif c == '?':
            add(question_mark)
        elif c == '[':
            # Look for the `]` closing the bracket expression. A leading `!` (negation) is skipped, and
            # a `]` placed directly after it (or directly after `[`) is not taken as the closing bracket.
            j = i
            if j < n and pat[j] == '!':
                j = j + 1
            if j < n and pat[j] == ']':
                j = j + 1
            while j < n and pat[j] != ']':
                j = j + 1
            if j >= n:
                # No closing bracket: the `[` is matched literally.
                add('\\[')
            else:
                # The text between the brackets.
                stuff = pat[i:j]
                if '-' not in stuff:
                    stuff = stuff.replace('\\', r'\\')
                else:
                    # Split the bracket body at the hyphens that act as range operators. The search starts
                    # after the first member character, so a hyphen in the leading position is not a range.
                    chunks = []
                    k = i + 2 if pat[i] == '!' else i + 1
                    while True:
                        k = pat.find('-', k, j)
                        if k < 0:
                            break
                        chunks.append(pat[i:k])
                        i = k + 1
                        # The next range hyphen cannot come before the end of this range plus one character.
                        k = k + 3
                    chunk = pat[i:j]
                    if chunk:
                        chunks.append(chunk)
                    else:
                        # The body ends with a hyphen: keep it as part of the last chunk.
                        chunks[-1] += '-'
                    # Remove empty ranges -- invalid in RE.
                    # (a range is empty when its start character sorts after its end character)
                    for k in range(len(chunks) - 1, 0, -1):
                        if chunks[k - 1][-1] > chunks[k][0]:
                            chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
                            del chunks[k]
                    # Escape backslashes and hyphens for set difference (--).
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') for s in chunks)
                # Escape set operations (&&, ~~ and ||).
                stuff = re.sub(r'([&~|])', r'\\\1', stuff)
                # Continue scanning right after the closing bracket.
                i = j + 1
                if not stuff:
                    # Empty range: never match.
                    add('(?!)')
                elif stuff == '!':
                    # Negated empty range: match any character.
                    add('.')
                else:
                    # Shell negation `!` becomes regex negation `^`; a literal leading `^` or `[` is escaped.
                    if stuff[0] == '!':
                        stuff = '^' + stuff[1:]
                    elif stuff[0] in ('^', '['):
                        stuff = '\\' + stuff
                    add(f'[{stuff}]')
        else:
            # Any other character is matched literally.
            add(re.escape(c))
    return res


================================================
FILE: src/crawlee/_utils/html_to_text.py
================================================
# This file contains shared constants used by different implementations of html_to_text function.
from __future__ import annotations

import re

# Tags based on Javascript implementation of htmlToText from:
# https://github.com/apify/crawlee/blob/master/packages/utils/src/internals/cheerio.ts#L11
# Originally added here: https://github.com/apify/apify-ts/commit/4c0e5e3e7377536a449bb7b205132348ad3b0fe9
# NOTE(review): presumably the elements whose content is left out of the extracted text - confirm against the
# html_to_text implementations that import this set.
SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}
# NOTE(review): presumably the elements treated as block-level (rendered on their own line) - confirm against the
# html_to_text implementations that import this set.
BLOCK_TAGS = {
    'p',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'ol',
    'ul',
    'li',
    'pre',
    'address',
    'blockquote',
    'dl',
    'div',
    'fieldset',
    'form',
    'table',
    'tr',
    'select',
    'option',
}

# Found in a text that is empty or whose last character is a whitespace character.
_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$')
# Found in a text that is empty or whose last character is a newline.
_EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$')
# Matches every run of one or more whitespace characters.
_ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+')


================================================
FILE: src/crawlee/_utils/models.py
================================================
from __future__ import annotations

from contextlib import suppress
from datetime import timedelta
from typing import TYPE_CHECKING, Annotated, Any

from pydantic import PlainSerializer, TypeAdapter, ValidationError, WrapValidator

if TYPE_CHECKING:
    from collections.abc import Callable

"""Utility types for Pydantic models."""


def _timedelta_to_ms(td: timedelta | None) -> float | None:
    if td == timedelta.max:
        return float('inf')
    if td is None:
        return td
    return round(td.total_seconds() * 1000)


def _timedelta_to_secs(td: timedelta | None) -> float | None:
    if td == timedelta.max:
        return float('inf')
    if td is None:
        return td
    return td.total_seconds()


# Shared pydantic adapter for `float`; the validators below use it to decode string-encoded numbers.
_number_parser = TypeAdapter(float)


def _timedelta_from_ms(value: float | timedelta | Any | None, handler: Callable[[Any], timedelta]) -> timedelta | None:
    if value == float('inf'):
        return timedelta.max

    # If the value is a string-encoded number, decode it
    if isinstance(value, str):
        with suppress(ValidationError):
            value = _number_parser.validate_python(value)

    if not isinstance(value, (int, float)):
        return handler(value)

    return timedelta(milliseconds=value)


def _timedelta_from_secs(
    value: float | timedelta | Any | None,
    handler: Callable[[Any], timedelta],
) -> timedelta | None:
    if value == float('inf'):
        return timedelta.max

    # If the value is a string-encoded number, decode it
    if isinstance(value, str):
        with suppress(ValidationError):
            value = _number_parser.validate_python(value)

    if not isinstance(value, (int, float)):
        return handler(value)

    return timedelta(seconds=value)


# `timedelta` field types for Pydantic models: serialized with `_timedelta_to_ms` / `_timedelta_to_secs` and
# validated with `_timedelta_from_ms` / `_timedelta_from_secs` respectively.
timedelta_ms = Annotated[timedelta, PlainSerializer(_timedelta_to_ms), WrapValidator(_timedelta_from_ms)]
timedelta_secs = Annotated[timedelta, PlainSerializer(_timedelta_to_secs), WrapValidator(_timedelta_from_secs)]


================================================
FILE: src/crawlee/_utils/raise_if_too_many_kwargs.py
================================================
from typing import Any


def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
    """Raise ValueError if there are more non-None kwargs than max_kwargs.

    Args:
        max_kwargs: The highest number of keyword arguments that may carry a non-None value.
        **kwargs: The keyword arguments to check. Only their names and whether they are None matter.

    Raises:
        ValueError: If more than `max_kwargs` of the keyword arguments are not None.
    """
    specified_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
    if len(specified_names) <= max_kwargs:
        return

    all_names = ', '.join(f'"{kwarg_name}"' for kwarg_name in kwargs)
    # Word the limit according to the configured maximum instead of always claiming "one".
    limit = 'Only one' if max_kwargs == 1 else f'At most {max_kwargs}'
    raise ValueError(
        f'{limit} of {all_names} can be specified, but following arguments were '
        f'specified: {", ".join(specified_names)}.'
    )


================================================
FILE: src/crawlee/_utils/recoverable_state.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING, Generic, Literal, TypeVar

from pydantic import BaseModel

from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.events._types import Event, EventPersistStateData

if TYPE_CHECKING:
    import logging
    from collections.abc import Callable, Coroutine

    from crawlee.storages import KeyValueStore

TStateModel = TypeVar('TStateModel', bound=BaseModel)


class RecoverableState(Generic[TStateModel]):
    """A class for managing persistent recoverable state using a Pydantic model.

    This class facilitates state persistence to a `KeyValueStore`, allowing data to be saved and retrieved
    across migrations or restarts. It manages the loading, saving, and resetting of state data,
    with optional persistence capabilities.

    The state is represented by a Pydantic model that can be serialized to and deserialized from JSON.
    The class automatically hooks into the event system to persist state when needed.

    Type Parameters:
        TStateModel: A Pydantic BaseModel type that defines the structure of the state data.
                     Typically, it should be inferred from the `default_state` constructor parameter.
    """

    def __init__(
        self,
        *,
        default_state: TStateModel,
        persist_state_key: str,
        persistence_enabled: Literal[True, False, 'explicit_only'] = False,
        persist_state_kvs_name: str | None = None,
        persist_state_kvs_id: str | None = None,
        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
        logger: logging.Logger,
    ) -> None:
        """Initialize a new recoverable state object.

        Args:
            default_state: The default state model instance to use when no persisted state is found.
                A deep copy is made each time the state is used.
            persist_state_key: The key under which the state is stored in the KeyValueStore.
            persistence_enabled: Flag to enable or disable state persistence. Use 'explicit_only' if you want to be able
                to save the state manually, but without any automatic persistence.
            persist_state_kvs_name: The name of the KeyValueStore to use for persistence.
                If neither a name nor an id are supplied, the default store will be used.
            persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
                If neither a name nor an id are supplied, the default store will be used.
            persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
                not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
            logger: A logger instance for logging operations related to state persistence.

        Raises:
            ValueError: If more than one of `persist_state_kvs_name`, `persist_state_kvs_id` and
                `persist_state_kvs_factory` is specified.
        """
        raise_if_too_many_kwargs(
            persist_state_kvs_name=persist_state_kvs_name,
            persist_state_kvs_id=persist_state_kvs_id,
            persist_state_kvs_factory=persist_state_kvs_factory,
        )
        if not persist_state_kvs_factory:
            logger.debug(
                'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
                'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
                'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
                'global side effects.'
            )

        self._default_state = default_state
        self._state_type: type[TStateModel] = self._default_state.__class__
        self._state: TStateModel | None = None
        self._persistence_enabled = persistence_enabled
        self._persist_state_key = persist_state_key
        if persist_state_kvs_factory is None:
            # Without an explicit factory, the store is opened lazily by name or id on initialization.
            async def kvs_factory() -> KeyValueStore:
                from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import

                return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)

            self._persist_state_kvs_factory = kvs_factory
        else:
            self._persist_state_kvs_factory = persist_state_kvs_factory

        self._key_value_store: KeyValueStore | None = None
        self._log = logger

    async def initialize(self) -> TStateModel:
        """Initialize the recoverable state.

        This method must be called before using the recoverable state. It loads the saved state
        if persistence is enabled and registers the object to listen for PERSIST_STATE events.
        With 'explicit_only' persistence the saved state is loaded, but no event listener is registered.

        Returns:
            The loaded state model
        """
        if self._persistence_enabled is False:
            self._state = self._default_state.model_copy(deep=True)
            return self.current_value

        self._key_value_store = await self._persist_state_kvs_factory()

        await self._load_saved_state()

        if self._persistence_enabled is True:
            # Import here to avoid circular imports.
            from crawlee import service_locator  # noqa: PLC0415

            event_manager = service_locator.get_event_manager()
            event_manager.on(event=Event.PERSIST_STATE, listener=self.persist_state)

        return self.current_value

    async def teardown(self) -> None:
        """Clean up resources used by the recoverable state.

        If automatic persistence is enabled, this method deregisters the object from PERSIST_STATE events
        and persists the current state one last time. Otherwise it does nothing.
        """
        if not self._persistence_enabled:
            return

        if self._persistence_enabled is True:
            # Import here to avoid circular imports.
            from crawlee import service_locator  # noqa: PLC0415

            event_manager = service_locator.get_event_manager()
            event_manager.off(event=Event.PERSIST_STATE, listener=self.persist_state)
            await self.persist_state()

    @property
    def current_value(self) -> TStateModel:
        """Get the current state.

        Raises:
            RuntimeError: If the state has not been loaded yet.
        """
        if self._state is None:
            raise RuntimeError('Recoverable state has not yet been loaded')

        return self._state

    @property
    def is_initialized(self) -> bool:
        """Check if the state has already been initialized."""
        return self._state is not None

    async def has_persisted_state(self) -> bool:
        """Check if there is any persisted state in the key-value store.

        Returns:
            False when persistence is disabled, otherwise whether a record exists under the state key.

        Raises:
            RuntimeError: If persistence is enabled but the key-value store has not been opened yet.
        """
        if not self._persistence_enabled:
            return False

        if self._key_value_store is None:
            raise RuntimeError('Recoverable state has not yet been initialized')

        return await self._key_value_store.record_exists(self._persist_state_key)

    async def reset(self) -> None:
        """Reset the state to the default values and clear any persisted state.

        Resets the current state to the default state and, if persistence is enabled,
        clears the persisted state from the KeyValueStore.

        Raises:
            RuntimeError: If persistence is enabled but the key-value store has not been opened yet.
        """
        self._state = self._default_state.model_copy(deep=True)

        if self._persistence_enabled:
            if self._key_value_store is None:
                raise RuntimeError('Recoverable state has not yet been initialized')

            await self._key_value_store.set_value(self._persist_state_key, None)

    async def persist_state(self, event_data: EventPersistStateData | None = None) -> None:
        """Persist the current state to the KeyValueStore.

        This method is typically called in response to a PERSIST_STATE event, but can also be called
        directly when needed. When persistence is disabled, it only logs a debug message.

        Args:
            event_data: Optional data associated with a PERSIST_STATE event

        Raises:
            RuntimeError: If persistence is enabled but the state has not been initialized yet.
        """
        self._log.debug(
            f'Persisting RecoverableState (model={self._default_state.__class__.__name__}, event_data={event_data}).'
        )

        # Check the persistence flag first. With persistence disabled no key-value store is ever opened, so
        # checking the store first would report an initialized state as "not yet initialized".
        if not self._persistence_enabled:
            self._log.debug('Persistence is not enabled - not doing anything')
            return

        if self._key_value_store is None or self._state is None:
            raise RuntimeError('Recoverable state has not yet been initialized')

        await self._key_value_store.set_value(
            self._persist_state_key,
            self._state.model_dump(mode='json', by_alias=True),
            'application/json',
        )

    async def _load_saved_state(self) -> None:
        """Load the state from the key-value store, falling back to a copy of the default state."""
        if self._key_value_store is None:
            raise RuntimeError('Recoverable state has not yet been initialized')

        stored_state = await self._key_value_store.get_value(self._persist_state_key)
        if stored_state is None:
            self._state = self._default_state.model_copy(deep=True)
        else:
            self._state = self._state_type.model_validate(stored_state)


================================================
FILE: src/crawlee/_utils/recurring_task.py
================================================
from __future__ import annotations

import asyncio
import inspect
from logging import getLogger
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Callable
    from datetime import timedelta
    from types import TracebackType

    from typing_extensions import Self

logger = getLogger(__name__)


class RecurringTask:
    """Class for creating and managing recurring tasks.

    Can be used as an async context manager: the task is started on enter and stopped on exit.

    Attributes:
        func: The function to be executed repeatedly. It may be synchronous or asynchronous.
        delay: The time delay between function calls.
        task: The underlying task object, `None` until `start` is called.
    """

    def __init__(self, func: Callable, delay: timedelta) -> None:
        logger.debug(
            'Calling RecurringTask.__init__(func={%s}, delay={%s})...',
            func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
            delay,
        )
        self.func = func
        self.delay = delay
        self.task: asyncio.Task | None = None

    async def __aenter__(self) -> Self:
        self.start()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        await self.stop()

    async def _wrapper(self) -> None:
        """Continuously execute the provided function with the specified delay.

        Run the function in a loop, waiting for the configured delay between executions.
        Supports both synchronous and asynchronous functions.
        """
        sleep_time_secs = self.delay.total_seconds()
        while True:
            result = self.func()
            # Decide by what the call returned rather than by the kind of callable. This also covers callable
            # objects with an async `__call__` and plain functions that return a coroutine, whose work would
            # otherwise never be awaited.
            if inspect.isawaitable(result):
                await result
            await asyncio.sleep(sleep_time_secs)

    def start(self) -> None:
        """Start the recurring task execution."""
        name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
        self.task = asyncio.create_task(
            self._wrapper(),
            name=f'Task-recurring-{name}',
        )

    async def stop(self) -> None:
        """Stop the recurring task execution."""
        if self.task:
            self.task.cancel()
            # Ensure the task has a chance to properly handle the cancellation and any potential exceptions.
            await asyncio.gather(self.task, return_exceptions=True)


================================================
FILE: src/crawlee/_utils/requests.py
================================================
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING

from yarl import URL

from crawlee._utils.crypto import compute_short_hash

if TYPE_CHECKING:
    from crawlee._types import HttpHeaders, HttpMethod, HttpPayload

logger = getLogger(__name__)


def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
    """Bring a URL into a canonical form.

    The URL is stripped of leading and trailing whitespace, query parameters whose name starts
    with 'utm_' are dropped, the remaining query parameters are sorted, a trailing slash is removed
    from the path, and the fragment is kept only on request. Finally the whole resulting URL string
    is lower-cased. The goal is that URLs differing only in such trivial ways are treated as the same.

    Args:
        url: The URL to be normalized.
        keep_url_fragment: Flag to determine whether the fragment part of the URL should be retained.

    Returns:
        A string containing the normalized URL.
    """
    parsed = URL(url.strip())

    # Drop the 'utm_' parameters and order the rest.
    kept_params = sorted((key, value) for key, value in parsed.query.items() if not key.startswith('utm_'))
    rebuilt = parsed.with_query(kept_params)

    # Replacing the path is also where the fragment is kept or discarded.
    trimmed_path = rebuilt.path.removesuffix('/')
    rebuilt = rebuilt.with_path(trimmed_path, keep_query=True, keep_fragment=keep_url_fragment)

    return str(rebuilt).lower()


def compute_unique_key(
    url: str,
    method: HttpMethod = 'GET',
    headers: HttpHeaders | None = None,
    payload: HttpPayload | None = None,
    session_id: str | None = None,
    *,
    keep_url_fragment: bool = False,
    use_extended_unique_key: bool = False,
) -> str:
    """Build the key used for caching & deduplication of requests.

    By default the key is just the normalized URL. With `use_extended_unique_key` the key is made of
    the upper-cased method, a hash of the whitelisted headers, a hash of the payload, the lower-cased
    session ID (when given) and the normalized URL, joined by pipes.

    Args:
        url: The request URL.
        method: The HTTP method.
        headers: The HTTP headers.
        payload: The data to be sent as the request body.
        session_id: The ID of a specific `Session` to which the request will be strictly bound
        keep_url_fragment: A flag indicating whether to keep the URL fragment.
        use_extended_unique_key: A flag indicating whether to include a hashed payload in the key.

    Returns:
        A string representing the unique key for the request.
    """
    # Fall back to the raw URL when it cannot be normalized.
    try:
        url_key = normalize_url(url, keep_url_fragment=keep_url_fragment)
    except Exception as exc:
        logger.warning(f'Failed to normalize URL: {exc}')
        url_key = url

    http_method = method.upper()

    if not use_extended_unique_key:
        # Let the user know that such requests are indistinguishable by the plain key.
        if http_method != 'GET' and payload:
            logger.info(
                f'{http_method} request with a payload detected. By default, requests to the same URL with '
                'different methods or payloads will be deduplicated. Use "use_extended_unique_key" to include payload '
                'and headers in the unique key and avoid deduplication in these cases.'
            )

        return url_key

    payload_hash = _get_payload_hash(payload)
    headers_hash = _get_headers_hash(headers)

    # The parts of the extended key are separated by pipes; the session part is present only when set.
    key_parts = [http_method, headers_hash, payload_hash]
    session_part = '' if session_id is None else session_id.lower()
    if session_part:
        key_parts.append(session_part)
    key_parts.append(url_key)

    return '|'.join(key_parts)


def _get_payload_hash(payload: HttpPayload | None) -> str:
    payload_in_bytes = b'' if payload is None else payload
    return compute_short_hash(payload_in_bytes)


def _get_headers_hash(headers: HttpHeaders | None) -> str:
    # HTTP headers which will be included in the hash computation.
    whitelisted_headers = {'accept', 'accept-language', 'authorization', 'content-type'}

    if headers is None:
        normalized_headers = b''
    else:
        filtered_headers = {key: value for key, value in headers.items() if key in whitelisted_headers}
        normalized_headers = '|'.join(f'{k}:{v}' for k, v in filtered_headers.items()).encode('utf-8')

    return compute_short_hash(normalized_headers)


================================================
FILE: src/crawlee/_utils/robots.py
================================================
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING

from protego import Protego
from yarl import URL

from crawlee._utils.sitemap import Sitemap
from crawlee._utils.web import is_status_code_client_error

if TYPE_CHECKING:
    from typing_extensions import Self

    from crawlee.http_clients import HttpClient
    from crawlee.proxy_configuration import ProxyInfo


logger = getLogger(__name__)


class RobotsTxtFile:
    """Parsed robots.txt rules bound to the origin of the URL they were created for."""

    def __init__(
        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
    ) -> None:
        """Initialize a new instance.

        Args:
            url: The URL associated with the robots.txt file. Only its origin is stored.
            robots: The parsed robots.txt rules.
            http_client: Optional `HttpClient`, required only for parsing the sitemaps.
            proxy_info: Optional `ProxyInfo` passed along when the sitemaps are loaded.
        """
        self._robots = robots
        self._original_url = URL(url).origin()
        self._http_client = http_client
        self._proxy_info = proxy_info

    @classmethod
    async def from_content(cls, url: str, content: str) -> Self:
        """Create a `RobotsTxtFile` instance from the given content.

        The instance is created without an HTTP client, so it cannot parse sitemaps.

        Args:
            url: The URL associated with the robots.txt file.
            content: The raw string content of the robots.txt file to be parsed.
        """
        robots = Protego.parse(content)
        return cls(url, robots)

    @classmethod
    async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Determine the location of a robots.txt file for a URL and fetch it.

        Args:
            url: The URL whose domain will be used to find the corresponding robots.txt file.
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        robots_url = URL(url).with_path('/robots.txt')
        return await cls.load(str(robots_url), http_client, proxy_info)

    @classmethod
    async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Load the robots.txt file for a given URL.

        If the server answers with a client error status code, or if fetching or parsing fails for any reason,
        rules that allow everything are used instead.

        Args:
            url: The direct URL of the robots.txt file to be loaded.
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        try:
            response = await http_client.send_request(url, proxy_info=proxy_info)

            # A client error response is treated like a robots.txt that allows everything.
            body = (
                b'User-agent: *\nAllow: /'
                if is_status_code_client_error(response.status_code)
                else await response.read()
            )
            robots = Protego.parse(body.decode('utf-8'))

        except Exception as e:
            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')

            # Best effort: a failure to load the file must not block crawling.
            robots = Protego.parse('User-agent: *\nAllow: /')

        return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

    def is_allowed(self, url: str, user_agent: str = '*') -> bool:
        """Check if the given URL is allowed for the given user agent.

        URLs whose origin differs from the origin of this robots.txt file are always reported as allowed.

        Args:
            url: The URL to check against the robots.txt rules.
            user_agent: The user-agent string to check permissions for. Defaults to '*' which matches any user-agent.
        """
        check_url = URL(url)
        if check_url.origin() != self._original_url:
            return True
        return bool(self._robots.can_fetch(str(check_url), user_agent))

    def get_sitemaps(self) -> list[str]:
        """Get the list of sitemaps urls from the robots.txt file."""
        return list(self._robots.sitemaps)

    def get_crawl_delay(self, user_agent: str = '*') -> int | None:
        """Get the crawl delay for the given user agent.

        Args:
            user_agent: The user-agent string to check the crawl delay for. Defaults to '*' which matches any
                user-agent.

        Returns:
            The crawl delay converted with `int()`, or None if no crawl delay is set.
        """
        crawl_delay = self._robots.crawl_delay(user_agent)
        return int(crawl_delay) if crawl_delay is not None else None

    async def parse_sitemaps(self) -> Sitemap:
        """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.

        Raises:
            ValueError: If the instance was created without an HTTP client.
        """
        sitemaps = self.get_sitemaps()
        if not self._http_client:
            raise ValueError('HTTP client is required to parse sitemaps.')

        return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)

    async def parse_urls_from_sitemaps(self) -> list[str]:
        """Parse the sitemaps in the robots.txt file and return a list of URLs."""
        sitemap = await self.parse_sitemaps()
        return sitemap.urls


================================================
FILE: src/crawlee/_utils/sitemap.py
================================================
from __future__ import annotations

import asyncio
import re
import zlib
from codecs import getincrementaldecoder
from collections import defaultdict
from contextlib import suppress
from dataclasses import dataclass
from datetime import datetime, timedelta
from hashlib import sha256
from logging import getLogger
from typing import TYPE_CHECKING, Literal, TypedDict
from xml.sax import SAXParseException
from xml.sax.expatreader import ExpatParser
from xml.sax.handler import ContentHandler

from typing_extensions import NotRequired, override
from yarl import URL

from crawlee._utils.web import is_status_code_successful
from crawlee.errors import ProxyError

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from xml.sax.xmlreader import AttributesImpl

    from crawlee.http_clients import HttpClient
    from crawlee.proxy_configuration import ProxyInfo

# Logger named after this module.
logger = getLogger(__name__)

# The only `changefreq` values the XML handler accepts; any other value found in a sitemap entry is dropped.
VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'}
# Request headers sent when a sitemap is downloaded.
SITEMAP_HEADERS = {'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'}
# Matches text ending in `/sitemap.xml` or `/sitemap.txt`, optionally followed by `.gz`; case-insensitive.
SITEMAP_URL_PATTERN = re.compile(r'\/sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)
# Well-known sitemap paths that are tried on a host when looking for its sitemaps.
COMMON_SITEMAP_PATHS = ['/sitemap.xml', '/sitemap.txt', '/sitemap_index.xml']


@dataclass
class SitemapUrl:
    """A single page entry read from a sitemap."""

    # Location of the page (the `loc` value of the entry).
    loc: str
    # Parsed `lastmod` value of the entry, when one was present and valid.
    lastmod: datetime | None = None
    # `changefreq` value of the entry, when one was present and valid.
    changefreq: str | None = None
    # `priority` value of the entry, when one was present and valid.
    priority: float | None = None
    # Identifier of the sitemap source the entry was read from.
    origin_sitemap_url: str | None = None


@dataclass
class NestedSitemap:
    """A reference to another sitemap that was found inside a sitemap."""

    # Location of the referenced sitemap.
    loc: str
    # Identifier of the sitemap in which the reference was found, if recorded.
    origin_sitemap_url: str | None = None


class ParseSitemapOptions(TypedDict, total=False):
    """Settings that tune `parse_sitemap`; all keys are declared optional (`total=False`)."""

    # Whether references to nested sitemaps are yielded as `NestedSitemap` objects.
    emit_nested_sitemaps: bool
    # Deepest nesting level that is still processed; sources with a greater depth are skipped.
    max_depth: int
    # Passed to `_fetch_and_process_sitemap` as the number of fetch attempts per sitemap URL.
    sitemap_retries: int
    # Timeout handed to the HTTP client when a sitemap is streamed.
    timeout: timedelta | None


class SitemapSource(TypedDict):
    """Description of one sitemap to parse: either a URL to download or the sitemap content itself."""

    # 'url' for a sitemap that has to be fetched, 'raw' for content supplied directly.
    type: Literal['url', 'raw']
    # Location of the sitemap; read for sources of type 'url'.
    url: NotRequired[str]
    # Sitemap content; read for sources of type 'raw'.
    content: NotRequired[str]
    # Nesting depth of the source; `parse_sitemap` treats a missing value as 0.
    depth: NotRequired[int]


class _SitemapItem(TypedDict, total=False):
    """Intermediate item produced by the sitemap parsers before it is converted into a result object."""

    # 'url' for a page entry, 'sitemap_url' for a reference to a nested sitemap.
    type: Literal['url', 'sitemap_url']
    # Page location; set on 'url' items.
    loc: str
    # Nested sitemap location; set on 'sitemap_url' items.
    url: str
    # Optional metadata of a page entry, filled in by the XML handler when present and valid.
    lastmod: datetime | None
    changefreq: str | None
    priority: float | None


class _XMLSaxSitemapHandler(ContentHandler):
    """SAX content handler that turns sitemap XML events into `_SitemapItem` dictionaries.

    The first `urlset` or `sitemapindex` element seen decides how `loc` values are interpreted: inside a
    `sitemapindex` every `loc` immediately becomes a `sitemap_url` item, otherwise `loc`, `lastmod`,
    `changefreq` and `priority` are collected and emitted as one `url` item when the enclosing `url` element
    closes. Entries without a `loc` are dropped.
    """

    def __init__(self) -> None:
        super().__init__()
        self._root_tag_name: str | None = None
        # Name of the tracked element whose text is being buffered; `None` outside of such elements.
        self._current_tag: str | None = None
        # Fields gathered so far for the `url` entry that is being parsed.
        self._current_url: _SitemapItem = {}
        self._buffer: str = ''
        # Finished items waiting to be consumed (and cleared) by the owner of the handler.
        self._items: list[_SitemapItem] = []

    @property
    def items(self) -> list[_SitemapItem]:
        """Items completed so far; the same list object is returned on every access."""
        return self._items

    @override
    def startElement(self, name: str, attrs: AttributesImpl) -> None:
        if self._root_tag_name is None and name in ('urlset', 'sitemapindex'):
            self._root_tag_name = name

        if name in ('loc', 'lastmod', 'changefreq', 'priority'):
            self._current_tag = name
            self._buffer = ''

    @override
    def characters(self, content: str) -> None:
        # The text of one element may arrive in several pieces, hence the accumulation.
        if self._current_tag:
            self._buffer += content

    @override
    def endElement(self, name: str) -> None:
        if name == self._current_tag:
            text = self._buffer.strip()

            if name == 'loc':
                if self._root_tag_name == 'sitemapindex':
                    self._items.append({'type': 'sitemap_url', 'url': text})
                else:
                    self._current_url['loc'] = text

            elif name == 'lastmod' and text:
                # Unparsable dates are ignored rather than failing the whole document.
                with suppress(ValueError):
                    self._current_url['lastmod'] = datetime.fromisoformat(text.replace('Z', '+00:00'))

            elif name == 'priority' and text:
                with suppress(ValueError):
                    self._current_url['priority'] = float(text)

            elif name == 'changefreq' and text in VALID_CHANGE_FREQS:
                self._current_url['changefreq'] = text

            # Leave the tracked element: stop buffering so that text between elements is not accumulated.
            self._current_tag = None
            self._buffer = ''

        if name == 'url':
            if 'loc' in self._current_url:
                self._items.append({'type': 'url', **self._current_url})
            # Always start the next entry from scratch, so that metadata of an entry without `loc` cannot
            # leak into the entry that follows it.
            self._current_url = {}


class _TxtSitemapParser:
    """Parser for plaintext sitemaps that processes data as a stream."""

    def __init__(self) -> None:
        self._buffer = ''

    async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]:
        """Process a chunk of text data and yield items one by one."""
        self._buffer += chunk

        # Process complete lines
        if '\n' in self._buffer:
            lines = self._buffer.split('\n')
            # Last element might be incomplete, save for next chunk
            self._buffer = lines.pop()

            for line in lines:
                url = line.strip()
                if url:
                    yield {'type': 'url', 'loc': url}

    async def flush(self) -> AsyncGenerator[_SitemapItem, None]:
        """Process any remaining data in the buffer, yielding items one by one."""
        if self._buffer:
            url = self._buffer.strip()
            if url:
                yield {'type': 'url', 'loc': url}
            self.buffer = ''

    def close(self) -> None:
        """Clean up resources."""
        self._buffer = ''


class _XmlSitemapParser:
    """Streaming parser for XML sitemaps built on an expat SAX parser and `_XMLSaxSitemapHandler`."""

    def __init__(self) -> None:
        parser = ExpatParser()
        handler = _XMLSaxSitemapHandler()
        parser.setContentHandler(handler)
        self._parser = parser
        self._handler = handler

    async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]:
        """Feed a chunk of XML to the parser and yield the items the handler completed.

        Any exception is logged as a warning instead of being raised.
        """
        try:
            self._parser.feed(chunk)

            # The chunk was accepted; hand over what the handler has collected and empty its list.
            collected = self._handler.items
            for entry in collected:
                yield entry

            collected.clear()

        except Exception as e:
            logger.warning(f'Failed to parse XML data chunk: {e}', exc_info=True)

    async def flush(self) -> AsyncGenerator[_SitemapItem, None]:
        """Flush the underlying parser and yield the items that became available.

        Any exception is logged as a warning instead of being raised.
        """
        try:
            self._parser.flush()

            collected = self._handler.items
            for entry in collected:
                yield entry

            collected.clear()

        except Exception as e:
            logger.warning(f'Failed to parse remaining XML data: {e}')

    def close(self) -> None:
        """Close the underlying parser, ignoring a parse error reported while closing."""
        try:
            self._parser.close()
        except SAXParseException:
            pass


def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser:
    """Create the parser that fits the content type and URL of a sitemap.

    Args:
        content_type: Value of the `content-type` response header, if any.
        url: URL the sitemap was loaded from, if any.

    Returns:
        A plaintext parser when the content type contains `text/plain` or the URL path ends with `.txt` or
        `.txt.gz`; an XML parser in every other case.
    """
    if 'text/plain' in content_type.lower():
        return _TxtSitemapParser()

    # A `.txt.gz` path denotes a gzip-compressed plaintext sitemap. Such a response usually does not carry a
    # `text/plain` content type, and gzip content is decompressed before it reaches the parser.
    if url and URL(url).path.endswith(('.txt', '.txt.gz')):
        return _TxtSitemapParser()

    # Default to XML parser for most cases
    return _XmlSitemapParser()


def _get_origin_url(source: SitemapSource) -> str:
    """Determine the origin URL for a sitemap source."""
    if source['type'] == 'url' and 'url' in source:
        return source['url']
    if source['type'] == 'raw' and 'content' in source:
        # For raw content sources, create a consistent identifier
        return f'raw://{sha256(source["content"].encode()).hexdigest()}'
    return ''


async def _process_sitemap_item(
    item: _SitemapItem,
    source: SitemapSource,
    depth: int,
    visited_sitemap_urls: set[str],
    sources: list[SitemapSource],
    *,
    emit_nested_sitemaps: bool,
) -> AsyncGenerator[SitemapUrl | NestedSitemap | None, None]:
    """Process a sitemap item and yield appropriate results."""
    item_copy = item.copy()  # Work with a copy to avoid modifying the original

    if 'type' not in item_copy:
        return

    item_type = item_copy.pop('type')

    # Handle sitemap URL references (nested sitemaps)
    if item_type == 'sitemap_url' and 'url' in item_copy:
        sitemap_url = item_copy['url']
        if sitemap_url and sitemap_url not in visited_sitemap_urls:
            # Add to processing queue
            sources.append(SitemapSource(type='url', url=sitemap_url, depth=depth + 1))

            # Output the nested sitemap reference if requested
            if emit_nested_sitemaps:
                yield NestedSitemap(loc=sitemap_url, origin_sitemap_url=None)

    # Handle individual URL entries
    elif item_type == 'url' and 'loc' in item_copy:
        # Determine the origin sitemap URL for tracking purposes
        origin_url = _get_origin_url(source)

        # Create and yield the sitemap URL object
        yield SitemapUrl(
            loc=item_copy['loc'],
            lastmod=item_copy.get('lastmod'),
            changefreq=item_copy.get('changefreq'),
            priority=item_copy.get('priority'),
            origin_sitemap_url=origin_url,
        )


async def _process_raw_source(
    source: SitemapSource,
    depth: int,
    visited_sitemap_urls: set[str],
    sources: list[SitemapSource],
    *,
    emit_nested_sitemaps: bool,
) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
    """Parse a sitemap whose content is supplied directly in the source and yield what it contains.

    The content is parsed with the parser selected for the `text/xml` content type. A source without
    `content` is reported with a warning and skipped; parsing errors are logged, not raised.
    """
    if 'content' not in source:
        logger.warning(f'Raw source missing content: {source}')
        return

    raw_content = source['content']
    parser = _get_parser('text/xml')

    try:
        # First parse the content itself, then drain whatever the parser still holds. The second generator is
        # only created once the first one has been consumed completely.
        for produce_items in (lambda: parser.process_chunk(raw_content), parser.flush):
            async for item in produce_items():
                async for result in _process_sitemap_item(
                    item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
                ):
                    if result:
                        yield result
    except Exception as e:
        logger.warning(f'Failed to parse raw sitemap content: {e}')
    finally:
        parser.close()


async def _fetch_and_process_sitemap(
    http_client: HttpClient,
    source: SitemapSource,
    depth: int,
    visited_sitemap_urls: set[str],
    sources: list[SitemapSource],
    retries_left: int,
    *,
    proxy_info: ProxyInfo | None = None,
    timeout: timedelta | None = None,
    emit_nested_sitemaps: bool,
) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
    """Fetch a sitemap from a URL and process its content while it is being streamed.

    The body is decompressed on the fly when it starts with the gzip magic bytes, decoded as UTF-8 (invalid
    sequences are replaced) and fed to the parser chosen by `_get_parser`.

    Any exception raised while fetching or processing triggers another attempt after a one second pause, for
    as long as attempts remain. A new attempt downloads and parses the sitemap from the beginning, so items
    already yielded by a failed attempt may be yielded again. Failures are logged, never raised.

    Args:
        http_client: Client used to stream the sitemap.
        source: The sitemap source; nothing happens when it has no `url` key.
        depth: Nesting depth of `source`.
        visited_sitemap_urls: URLs of the sitemaps that were already visited.
        sources: Work queue to which newly discovered nested sitemaps are appended.
        retries_left: Maximum number of fetch attempts.
        proxy_info: Proxy to use for the request.
        timeout: Timeout passed to the HTTP client.
        emit_nested_sitemaps: Whether references to nested sitemaps are yielded as well.

    Yields:
        The `SitemapUrl` (and, if requested, `NestedSitemap`) objects found in the sitemap.
    """
    if 'url' not in source:
        return

    sitemap_url = source['url']

    while retries_left > 0:
        retries_left -= 1
        # The error handling lives inside the loop; otherwise the first failure would leave the loop and the
        # remaining attempts would never be used.
        try:
            async with http_client.stream(
                sitemap_url, method='GET', headers=SITEMAP_HEADERS, proxy_info=proxy_info, timeout=timeout
            ) as response:
                # Determine content type and compression
                content_type = response.headers.get('content-type', '')

                # Incremental decoding keeps multi-byte characters split across chunks intact
                decoder = getincrementaldecoder('utf-8')(errors='replace')

                # Create appropriate parser
                parser = _get_parser(content_type, sitemap_url)
                decompressor = None
                try:
                    # Process chunks as they arrive
                    first_chunk = True
                    async for raw_chunk in response.read_stream():
                        if not raw_chunk:
                            # An empty chunk carries no data and must not count as the first chunk
                            continue

                        # Check if the first chunk is a valid gzip header
                        if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
                            decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
                        first_chunk = False

                        chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
                        text_chunk = decoder.decode(chunk)
                        async for item in parser.process_chunk(text_chunk):
                            async for result in _process_sitemap_item(
                                item,
                                source,
                                depth,
                                visited_sitemap_urls,
                                sources,
                                emit_nested_sitemaps=emit_nested_sitemaps,
                            ):
                                if result:
                                    yield result

                    # Process any remaining content
                    async for item in parser.flush():
                        async for result in _process_sitemap_item(
                            item,
                            source,
                            depth,
                            visited_sitemap_urls,
                            sources,
                            emit_nested_sitemaps=emit_nested_sitemaps,
                        ):
                            if result:
                                yield result
                finally:
                    parser.close()

            # The sitemap was processed completely, no further attempt is needed
            break

        except Exception as e:
            if retries_left > 0:
                logger.warning(f'Error fetching sitemap {sitemap_url}: {e}. Retries left: {retries_left}')
                await asyncio.sleep(1)  # Brief pause before retry
            else:
                logger.warning(f'Error fetching sitemap {sitemap_url}: {e}. No retries left, giving up.')


class Sitemap:
    """Holds the list of page URLs collected from one or more sitemaps."""

    def __init__(self, urls: list[str]) -> None:
        self._urls = urls

    @property
    def urls(self) -> list[str]:
        """The collected URLs (the list given to the constructor)."""
        return self._urls

    @classmethod
    async def try_common_names(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Sitemap:
        """Load the sitemaps found at the paths of `COMMON_SITEMAP_PATHS` on the host of `url`."""
        origin = URL(url)
        candidates: list[str] = []
        for path in COMMON_SITEMAP_PATHS:
            candidates.append(str(origin.with_path(path)))
        return await cls.load(candidates, http_client, proxy_info)

    @classmethod
    async def load(
        cls,
        urls: str | list[str],
        http_client: HttpClient,
        proxy_info: ProxyInfo | None = None,
        parse_sitemap_options: ParseSitemapOptions | None = None,
    ) -> Sitemap:
        """Download and parse the sitemap(s) at the given URL or URLs."""
        url_list = [urls] if isinstance(urls, str) else urls
        url_sources = [SitemapSource(type='url', url=sitemap_url) for sitemap_url in url_list]
        return await cls.parse(url_sources, http_client, proxy_info, parse_sitemap_options)

    @classmethod
    async def from_xml_string(cls, content: str) -> Sitemap:
        """Parse a sitemap given directly as a string."""
        raw_source = SitemapSource(type='raw', content=content)
        return await cls.parse([raw_source])

    @classmethod
    async def parse(
        cls,
        sources: list[SitemapSource],
        http_client: HttpClient | None = None,
        proxy_info: ProxyInfo | None = None,
        parse_sitemap_options: ParseSitemapOptions | None = None,
    ) -> Sitemap:
        """Run `parse_sitemap` over the sources and collect the `loc` of every yielded item."""
        collected: list[str] = []
        async for item in parse_sitemap(sources, http_client, proxy_info, parse_sitemap_options):
            collected.append(item.loc)
        return cls(collected)


async def parse_sitemap(
    initial_sources: list[SitemapSource],
    http_client: HttpClient | None = None,
    proxy_info: ProxyInfo | None = None,
    options: ParseSitemapOptions | None = None,
) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
    """Parse sitemap(s) and yield URLs found in them.

    This function coordinates the process of fetching and parsing sitemaps,
    handling both URL-based and raw content sources. It follows nested sitemaps
    up to the specified maximum depth.

    Args:
        initial_sources: The sitemaps to start with; the list itself is not modified.
        http_client: Client used to download URL-based sources.
        proxy_info: Proxy to use for the requests.
        options: Optional settings. Every key may be left out; a missing key falls back to its default:
            nested sitemaps are not emitted, depth is unlimited, 3 fetch attempts, 30 second timeout.

    Raises:
        RuntimeError: If a URL-based source is encountered and no `http_client` was provided.
    """
    # `ParseSitemapOptions` declares all keys optional, so each one is read with a fallback instead of being
    # indexed directly.
    opts: ParseSitemapOptions = options or {}
    emit_nested_sitemaps = opts.get('emit_nested_sitemaps', False)
    max_depth = opts.get('max_depth', float('inf'))
    sitemap_retries = opts.get('sitemap_retries', 3)
    # An explicit `None` timeout is passed through unchanged
    timeout = opts.get('timeout', timedelta(seconds=30))

    # Setup working state
    sources = list(initial_sources)
    visited_sitemap_urls: set[str] = set()

    # Process sources until the queue is empty
    while sources:
        source = sources.pop(0)
        depth = source.get('depth', 0)

        # Skip if we've reached max depth
        if depth > max_depth:
            logger.debug(f'Skipping sitemap {source.get("url", "")} - exceeded max depth {max_depth}')
            continue

        # Process based on source type
        if source['type'] == 'raw':
            async for result in _process_raw_source(
                source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
            ):
                yield result

        elif source['type'] == 'url' and 'url' in source:
            if http_client is None:
                raise RuntimeError('HttpClient must be provided for URL-based sitemap sources.')

            # Add to visited set before processing to avoid duplicates
            visited_sitemap_urls.add(source['url'])

            async for result in _fetch_and_process_sitemap(
                http_client,
                source,
                depth,
                visited_sitemap_urls,
                sources,
                sitemap_retries,
                emit_nested_sitemaps=emit_nested_sitemaps,
                proxy_info=proxy_info,
                timeout=timeout,
            ):
                yield result
        else:
            logger.warning(f'Invalid source configuration: {source}')


async def _merge_async_generators(*generators: AsyncGenerator) -> AsyncGenerator:
    queue: asyncio.Queue = asyncio.Queue()

    end_feed = object()

    async def feed(gen: AsyncGenerator) -> None:
        try:
            async for item in gen:
                await queue.put(item)
        except Exception:
            logger.warning(f'Error in generator: {gen}', exc_info=True)
        finally:
            await queue.put(end_feed)

    tasks = [asyncio.create_task(feed(gen)) for gen in generators]
    remaining_tasks = len(tasks)

    try:
        while remaining_tasks > 0:
            item = await queue.get()
            if item is end_feed:
                remaining_tasks -= 1
            else:
                yield item
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)


async def _discover_for_hostname(
    hostname: str,
    hostname_urls: list[str],
    *,
    http_client: HttpClient,
    proxy_info: ProxyInfo | None = None,
    request_timeout: timedelta,
    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',
) -> AsyncGenerator[str, None]:
    # Import here to avoid circular imports.
    from crawlee._utils.robots import RobotsTxtFile  # noqa: PLC0415

    domain_seen: set[str] = set()
    hostname_urls = list(set(hostname_urls))  # Remove duplicates

    def _check_and_add(url: str) -> bool:
        if url in domain_seen:
            return False
        domain_seen.add(url)
        return True

    # Try getting sitemaps from robots.txt first
    robots = await RobotsTxtFile.find(url=hostname_urls[0], http_client=http_client, proxy_info=proxy_info)
    for sitemap_url in robots.get_sitemaps():
        if _check_and_add(sitemap_url):
            yield sitemap_url

    # Check maybe provided URLs have sitemap url
    matching_sitemap_urls = [url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)]

    if matching_sitemap_urls:
        for sitemap_url in matching_sitemap_urls:
            if _check_and_add(sitemap_url):
                yield sitemap_url
    else:
        # Check common sitemap locations
        base_url = URL(hostname_urls[0])
        for path in COMMON_SITEMAP_PATHS:
            candidate = str(base_url.with_path(path))
            if candidate in domain_seen:
                continue
            try:
                response = await http_client.send_request(
                    candidate, method=method_for_checking, proxy_info=proxy_info, timeout=request_timeout
                )
                if is_status_code_successful(response.status_code) and _check_and_add(candidate):
                    yield candidate
            except ProxyError:
                logger.warning(f'Proxy error when checking {candidate} with sitemap discovery for {hostname}')
            except asyncio.TimeoutError:
                logger.warning(f'Timeout when checking {candidate} with sitemap discovery for {hostname}')
            except Exception:
                logger.warning(f'Error when checking {candidate} with sitemap discovery for {hostname}', exc_info=True)


async def discover_valid_sitemaps(
    urls: list[str],
    *,
    http_client: HttpClient,
    proxy_info: ProxyInfo | None = None,
    request_timeout: timedelta = timedelta(seconds=20),
    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',
) -> AsyncGenerator[str, None]:
    """Discover related sitemaps for the given URLs.

    The URLs are grouped by host, the hosts are examined concurrently, and every discovered sitemap URL is
    yielded once. URLs that cannot be parsed or have no host are skipped with a warning.

    Args:
        urls: List of URLs to discover sitemaps for.
        http_client: `HttpClient` to use for making requests.
        proxy_info: Proxy configuration to use for requests.
        request_timeout: Timeout for each request when checking for sitemaps.
        method_for_checking: HTTP method to use when checking for sitemap existence (HEAD or GET).
    """
    urls_by_host = defaultdict(list)
    for url in urls:
        try:
            host = URL(url).host
        except ValueError:
            logger.warning(f'Invalid URL {url} skipped')
            continue

        if host:
            urls_by_host[host].append(url)
        else:
            logger.warning(f'URL {url} without host skipped')

    # One discovery generator per host
    per_host_discovery = []
    for host, host_urls in urls_by_host.items():
        per_host_discovery.append(
            _discover_for_hostname(
                host,
                host_urls,
                http_client=http_client,
                proxy_info=proxy_info,
                request_timeout=request_timeout,
                method_for_checking=method_for_checking,
            )
        )

    # Sitemap URLs that were already yielded; filters duplicates reported for different hosts
    already_yielded = set()

    async for sitemap_url in _merge_async_generators(*per_host_discovery):
        if sitemap_url in already_yielded:
            continue
        already_yielded.add(sitemap_url)
        yield sitemap_url


================================================
FILE: src/crawlee/_utils/system.py
================================================
from __future__ import annotations

import os
import sys
from contextlib import suppress
from datetime import datetime, timezone
from logging import getLogger
from typing import TYPE_CHECKING, Annotated

import psutil
from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator

from crawlee._utils.byte_size import ByteSize

logger = getLogger(__name__)

if sys.platform == 'linux':
    """Get the most suitable available used memory metric.

    `Proportional Set Size (PSS)`, is the amount of own memory and memory shared with other processes, accounted in a
    way that the shared amount is divided evenly between the processes that share it. Available on Linux. Suitable for
    avoiding overestimation by counting the same shared memory used by children processes multiple times.

    `Resident Set Size (RSS)` is the non-swapped physical memory a process has used; it includes shared memory. It
    should be available everywhere.
    """

    def _get_used_memory(process: psutil.Process) -> int:
        return int(process.memory_full_info().pss)
else:

    def _get_used_memory(process: psutil.Process) -> int:
        return int(process.memory_info().rss)


class CpuInfo(BaseModel):
    """Information about the CPU usage.

    A single measurement consisting of the usage ratio and the time at which it was taken.
    """

    # Validation is enabled both by field name and by alias (the aliases declared below are camelCase).
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    used_ratio: Annotated[float, Field(alias='usedRatio')]
    """The ratio of CPU currently in use, represented as a float between 0 and 1."""

    # Workaround for Pydantic and type checkers when using Annotated with default_factory
    # Type checkers only see a plain `datetime` attribute that has a default. At runtime the `else` branch
    # declares the real field: alias `createdAt`, defaulting to the current UTC time of each new instance.
    if TYPE_CHECKING:
        created_at: datetime = datetime.now(timezone.utc)
        """The time at which the measurement was taken."""
    else:
        created_at: Annotated[
            datetime,
            Field(
                alias='createdAt',
                default_factory=lambda: datetime.now(timezone.utc),
            ),
        ]
        """The time at which the measurement was taken."""


class MemoryUsageInfo(BaseModel):
    """Information about the memory usage.

    A single measurement consisting of the used memory size and the time at which it was taken.
    """

    # Validation is enabled both by field name and by alias (the aliases declared below are camelCase).
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # Validated with `ByteSize.validate` and serialized as the `bytes` attribute of the size.
    current_size: Annotated[
        ByteSize,
        PlainValidator(ByteSize.validate),
        PlainSerializer(lambda size: size.bytes),
        Field(alias='currentSize'),
    ]
    """Memory usage of the current Python process and its children."""

    # Workaround for Pydantic and type checkers when using Annotated with default_factory
    # Type checkers only see a plain `datetime` attribute that has a default. At runtime the `else` branch
    # declares the real field: alias `createdAt`, defaulting to the current UTC time of each new instance.
    if TYPE_CHECKING:
        created_at: datetime = datetime.now(timezone.utc)
        """The time at which the measurement was taken."""
    else:
        created_at: Annotated[
            datetime,
            Field(
                alias='createdAt',
                default_factory=lambda: datetime.now(timezone.utc),
            ),
        ]
        """The time at which the measurement was taken."""


class MemoryInfo(MemoryUsageInfo):
    """Information about system memory.

    Extends `MemoryUsageInfo` with the total memory size and the system-wide used memory size.
    """

    # Same settings as the ones declared on `MemoryUsageInfo`.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # Validated with `ByteSize.validate` and serialized as the `bytes` attribute of the size.
    total_size: Annotated[
        ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize')
    ]
    """Total memory available in the system."""

    # Validated with `ByteSize.validate` and serialized as the `bytes` attribute of the size.
    system_wide_used_size: Annotated[
        ByteSize,
        PlainValidator(ByteSize.validate),
        PlainSerializer(lambda size: size.bytes),
        Field(alias='systemWideUsedSize'),
    ]
    """Total memory used by all processes system-wide (including non-crawlee processes)."""


def get_cpu_info() -> CpuInfo:
    """Measure the current CPU usage and return it as a `CpuInfo`.

    The measurement comes from `psutil.cpu_percent()`, called with a sampling interval of 0.1 seconds. The
    percentage it reports is divided by 100 to obtain the stored ratio.
    """
    logger.debug('Calling get_cpu_info()...')
    used_ratio = psutil.cpu_percent(interval=0.1) / 100
    return CpuInfo(used_ratio=used_ratio)


def get_memory_info() -> MemoryInfo:
    """Measure the memory used by the current process and all of its descendants.

    The per-process figure comes from `_get_used_memory`; system totals come from `psutil.virtual_memory()`.
    The system-wide used size is computed as total memory minus available memory.
    """
    logger.debug('Calling get_memory_info()...')
    own_process = psutil.Process(os.getpid())

    # Start with the estimated usage of the current process itself.
    used_bytes = _get_used_memory(own_process)

    # Add the usage of every descendant process.
    for child in own_process.children(recursive=True):
        try:
            used_bytes += _get_used_memory(child)
        except psutil.NoSuchProcess:
            # The child ended before its memory usage could be read; it simply does not count.
            continue

    vm = psutil.virtual_memory()

    return MemoryInfo(
        total_size=ByteSize(vm.total),
        current_size=ByteSize(used_bytes),
        system_wide_used_size=ByteSize(vm.total - vm.available),
    )


================================================
FILE: src/crawlee/_utils/time.py
================================================
from __future__ import annotations

import time
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import timedelta
from typing import TYPE_CHECKING

from async_timeout import Timeout, timeout

if TYPE_CHECKING:
    from collections.abc import Iterator
    from types import TracebackType

# Unit thresholds and conversion factors used by `format_duration`.
_SECONDS_PER_MINUTE = 60
_SECONDS_PER_HOUR = 3600


@dataclass
class TimerResult:
    """Timings filled in by `measure_time`; both fields are `None` until the measured block has ended."""

    # Elapsed wall-clock time: the difference of two `time.monotonic()` readings.
    wall: float | None = None
    # Elapsed CPU time of the current thread: the difference of two `time.thread_time()` readings.
    cpu: float | None = None


@contextmanager
def measure_time() -> Iterator[TimerResult]:
    """Measure how long the body of the with-block takes, in wall-clock time and in thread CPU time.

    The yielded `TimerResult` is filled in when the block ends, also when it ends with an exception.
    """
    timings = TimerResult()
    wall_start, cpu_start = time.monotonic(), time.thread_time()

    try:
        yield timings
    finally:
        wall_end, cpu_end = time.monotonic(), time.thread_time()
        timings.wall = wall_end - wall_start
        timings.cpu = cpu_end - cpu_start


class SharedTimeout:
    """Keeps track of a time budget shared by multiple independent async operations.

    Provides a reusable, non-reentrant context manager interface. Each use of the context manager runs under
    a timeout equal to the budget that is left, and the time spent inside is subtracted from the budget when
    the context is left, no matter how it is left.
    """

    def __init__(self, timeout: timedelta) -> None:
        self._remaining_timeout = timeout
        self._active_timeout: Timeout | None = None
        self._activation_timestamp: float | None = None

    async def __aenter__(self) -> timedelta:
        """Start a timeout for the remaining budget and return that budget.

        Raises:
            RuntimeError: If the context is already active.
        """
        if self._active_timeout is not None or self._activation_timestamp is not None:
            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')

        self._activation_timestamp = time.monotonic()
        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
        await new_timeout.__aenter__()
        return self._remaining_timeout

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Leave the active timeout and charge the elapsed time to the shared budget.

        Raises:
            RuntimeError: If the context is not active.
        """
        active_timeout = self._active_timeout
        activated_at = self._activation_timestamp
        if active_timeout is None or activated_at is None:
            raise RuntimeError('Logic error')

        try:
            await active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
        finally:
            # Leaving the inner timeout raises when the timeout has expired. The bookkeeping has to happen
            # in that case too, otherwise the object would stay marked as active and could never be entered
            # again, and the elapsed time would not be subtracted from the budget.
            elapsed = time.monotonic() - activated_at
            self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)

            self._active_timeout = None
            self._activation_timestamp = None


def format_duration(duration: timedelta | None) -> str:
    """Format a timedelta into a human-readable string with appropriate units.

    The unit is chosen by magnitude: microseconds, milliseconds, seconds, minutes with seconds, or hours with
    minutes and seconds. A negative duration is formatted like its absolute value, prefixed with a minus sign.

    Args:
        duration: The duration to format.

    Returns:
        The formatted duration, or the string 'None' when `duration` is `None`.
    """
    if duration is None:
        return 'None'

    total_seconds = duration.total_seconds()

    if total_seconds == 0:
        return '0s'

    # Without this, every negative duration would satisfy the "less than one second" checks below and be
    # reported as a (huge) negative number of microseconds.
    if total_seconds < 0:
        return f'-{format_duration(-duration)}'

    # For very small durations, show in milliseconds
    if total_seconds < 1:
        milliseconds = total_seconds * 1000
        if milliseconds < 1:
            microseconds = total_seconds * 1_000_000
            return f'{microseconds:.1f}μs'
        return f'{milliseconds:.1f}ms'

    # For durations less than 60 seconds, show in seconds
    if total_seconds < _SECONDS_PER_MINUTE:
        return f'{total_seconds:.2f}s'

    # For durations less than 1 hour, show in minutes and seconds
    if total_seconds < _SECONDS_PER_HOUR:
        minutes = int(total_seconds // _SECONDS_PER_MINUTE)
        seconds = total_seconds % _SECONDS_PER_MINUTE
        if seconds == 0:
            return f'{minutes}min'
        return f'{minutes}min {seconds:.1f}s'

    # For longer durations, show in hours, minutes, and seconds
    hours = int(total_seconds // _SECONDS_PER_HOUR)
    remaining_seconds = total_seconds % _SECONDS_PER_HOUR
    minutes = int(remaining_seconds // _SECONDS_PER_MINUTE)
    seconds = remaining_seconds % _SECONDS_PER_MINUTE

    result = f'{hours}h'
    if minutes > 0:
        result += f' {minutes}min'
    if seconds > 0:
        result += f' {seconds:.1f}s'

    return result


================================================
FILE: src/crawlee/_utils/try_import.py
================================================
import sys
from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass
from types import ModuleType
from typing import Any


@contextmanager
def try_import(module_name: str, *symbol_names: str) -> Iterator[None]:
    """Context manager to attempt importing symbols into a module.

    If an `ImportError` is raised during the import, the symbol will be replaced with a `FailedImport` object.
    Exceptions of any other type are propagated unchanged.

    Args:
        module_name: Name under which the target module is registered in `sys.modules`.
        symbol_names: Names of the module attributes to replace when the import fails.
    """
    try:
        yield
    except ImportError as e:
        # An `ImportError` can be raised without any argument; fall back to its string form instead of
        # failing with an `IndexError` that would hide the original problem.
        message = e.args[0] if e.args else str(e)
        for symbol_name in symbol_names:
            setattr(sys.modules[module_name], symbol_name, FailedImport(message))


def install_import_hook(module_name: str) -> None:
    """Switch the class of the module registered under `module_name` in `sys.modules` to `ImportWrapper`."""
    target_module = sys.modules[module_name]
    target_module.__class__ = ImportWrapper


@dataclass
class FailedImport:
    """Represent a placeholder for a failed import.

    `try_import` stores an instance in place of every symbol that could not be imported, and `ImportWrapper`
    raises an `ImportError` carrying `message` when such an attribute is accessed.
    """

    message: str
    """The error message associated with the failed import."""


class ImportWrapper(ModuleType):
    """Module type that turns access to a `FailedImport` placeholder into an `ImportError`."""

    def __getattribute__(self, name: str) -> Any:
        attribute = super().__getattribute__(name)

        # Anything that is not a placeholder is handed back untouched.
        if not isinstance(attribute, FailedImport):
            return attribute

        raise ImportError(attribute.message)  # noqa: TRY004


================================================
FILE: src/crawlee/_utils/urls.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from pydantic import AnyHttpUrl, TypeAdapter
from yarl import URL

if TYPE_CHECKING:
    from collections.abc import Iterator
    from logging import Logger


def is_url_absolute(url: str) -> bool:
    """Tell whether the URL contains both a scheme and an authority part."""
    parsed = URL(url)

    # `yarl.URL.absolute` is deliberately avoided, because it is always True for links that start with '//'.
    if not parsed.scheme:
        return False

    return bool(parsed.raw_authority)


def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
    """Join a relative URL onto a base URL and return the result as a string."""
    base = URL(base_url)
    joined = base.join(URL(relative_url))
    return str(joined)


def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
    """Yield absolute versions of the given URLs, resolving relative ones against a base URL.

    URLs that are already absolute are yielded unchanged. URLs that are still not absolute after being joined
    with the base URL are skipped, and the skip is reported on the debug level when a logger is provided.
    """
    for candidate in urls:
        if is_url_absolute(candidate):
            yield candidate
            continue

        absolute = convert_to_absolute_url(base_url, candidate)

        if is_url_absolute(absolute):
            yield absolute
        elif logger:
            # The conversion did not produce an absolute URL, probably due to an incorrect format,
            # such as 'mailto:'.
            logger.debug(f'Could not convert URL "{candidate}" to absolute using base URL "{base_url}". Skipping it.')


_http_url_adapter = TypeAdapter(AnyHttpUrl)


def validate_http_url(value: str | None) -> str | None:
    """Check that the value, when given, is a valid HTTP URL and return it unchanged.

    `None` is passed through without any validation.

    Raises:
        pydantic.ValidationError: If the URL is not valid.
    """
    if value is None:
        return None

    _http_url_adapter.validate_python(value)
    return value


================================================
FILE: src/crawlee/_utils/wait.py
================================================
from __future__ import annotations

import asyncio
from contextlib import suppress
from typing import TYPE_CHECKING, TypeVar

if TYPE_CHECKING:
    from collections.abc import Awaitable, Callable, Sequence
    from datetime import timedelta
    from logging import Logger

T = TypeVar('T')


async def wait_for(
    operation: Callable[[], Awaitable[T]],
    *,
    timeout: timedelta,
    timeout_message: str | None = None,
    max_retries: int = 1,
    logger: Logger,
) -> T:
    """Wait for an async operation to complete.

    If the wait times out, `TimeoutError` is raised and the future is cancelled. A timeout is never retried.
    Any other error is retried until `max_retries` attempts have been made, then it is re-raised.

    Args:
        operation: A function that returns the future to wait for. It is called once per attempt.
        timeout: How long should we wait before cancelling the future.
        timeout_message: Message to be included in the `TimeoutError` in case of timeout.
        max_retries: How many times should the operation be attempted. Must be at least 1.
        logger: Used to report information about retries as they happen.

    Returns:
        The result of the first attempt that succeeds.

    Raises:
        ValueError: If `max_retries` is lower than 1.
        asyncio.TimeoutError: If an attempt does not finish within `timeout`.
    """
    if max_retries < 1:
        # Without this guard the loop below would not run at all and the caller would get
        # the misleading 'Unreachable code' error.
        raise ValueError(f'max_retries must be at least 1, got {max_retries}.')

    timeout_secs = timeout.total_seconds()

    for iteration in range(1, max_retries + 1):
        try:
            return await asyncio.wait_for(operation(), timeout_secs)
        except asyncio.TimeoutError as ex:  # noqa: PERF203
            if timeout_message is None:
                # Do not pass `None` as the message, otherwise `str(error)` would be the literal 'None'.
                raise asyncio.TimeoutError from ex
            raise asyncio.TimeoutError(timeout_message) from ex
        except Exception as e:
            if iteration == max_retries:
                raise

            logger.warning(f'{e!s}: retrying ({iteration}/{max_retries})')

    raise RuntimeError('Unreachable code')


async def wait_for_all_tasks_for_finish(
    tasks: Sequence[asyncio.Task],
    *,
    logger: Logger,
    timeout: timedelta | None = None,
) -> None:
    """Wait for all tasks to finish or until the timeout is reached.

    Tasks that are still running when the wait ends (because of the timeout or because the wait itself was
    cancelled) are cancelled and awaited. Exceptions raised by the tasks are not propagated, they are only
    reported through the logger.

    Args:
        tasks: A sequence of asyncio tasks to wait for.
        logger: Logger to use for reporting.
        timeout: How long should we wait before cancelling the tasks. `None` means wait without a limit,
            a zero duration means do not wait at all.
    """
    if not tasks:
        return

    # Compare with `None` explicitly. A zero `timedelta` is falsy, but it is still a valid timeout and must
    # not be turned into an unlimited wait.
    timeout_secs = timeout.total_seconds() if timeout is not None else None
    try:
        _, pending = await asyncio.wait(tasks, timeout=timeout_secs)
        if pending:
            logger.warning('Waiting timeout reached; canceling unfinished tasks.')
    except asyncio.CancelledError:
        logger.warning('Asyncio wait was cancelled; canceling unfinished tasks.')
        raise
    finally:
        # Request the cancellation of every unfinished task first, and only then wait for them. This way
        # a task that misbehaves during its cancellation cannot leave the other tasks running.
        unfinished = [task for task in tasks if not task.done()]
        for task in unfinished:
            task.cancel()

        if unfinished:
            # `return_exceptions=True` keeps errors raised by the tasks from escaping this cleanup.
            with suppress(asyncio.CancelledError):
                await asyncio.gather(*unfinished, return_exceptions=True)

        for task in tasks:
            # Can only happen when the wait above was interrupted before the task finished.
            if not task.done():
                continue

            # Access the result to clear any exceptions
            try:
                task.result()
            except asyncio.CancelledError:
                pass
            except Exception as e:
                logger.warning(f'Task raised an exception: {e}')


================================================
FILE: src/crawlee/_utils/web.py
================================================
from __future__ import annotations

from http import HTTPStatus


def is_status_code_client_error(value: int) -> bool:
    """Tell whether the status code is a client error, i.e. from 400 (inclusive) up to 500 (exclusive)."""
    if value < HTTPStatus.BAD_REQUEST:
        return False

    return value < HTTPStatus.INTERNAL_SERVER_ERROR


def is_status_code_server_error(value: int) -> bool:
    """Tell whether the status code is a server error.

    Every code of 500 or above is reported as a server error, there is no upper bound.
    """
    below_server_errors = value < HTTPStatus.INTERNAL_SERVER_ERROR
    return not below_server_errors


def is_status_code_successful(value: int) -> bool:
    """Tell whether the status code is a 2xx or 3xx one, i.e. from 200 (inclusive) up to 400 (exclusive)."""
    if value < HTTPStatus.OK:
        return False

    return value < HTTPStatus.BAD_REQUEST


================================================
FILE: src/crawlee/browsers/__init__.py
================================================
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

from ._types import BrowserType, CrawleePage

# Swap this module's class for `ImportWrapper`, so that accessing a symbol whose import failed below
# raises an `ImportError` instead of handing out the `FailedImport` placeholder.
_install_import_hook(__name__)


# The following imports are wrapped in try_import to handle optional dependencies,
# ensuring the module can still function even if these dependencies are missing.
# Each symbol is guarded separately: when an `ImportError` is raised inside a block, only the symbol
# named in that block is replaced with a placeholder.
with _try_import(__name__, 'BrowserPool'):
    from ._browser_pool import BrowserPool
with _try_import(__name__, 'PlaywrightBrowserController'):
    from ._playwright_browser_controller import PlaywrightBrowserController
with _try_import(__name__, 'PlaywrightBrowserPlugin'):
    from ._playwright_browser_plugin import PlaywrightBrowserPlugin
with _try_import(__name__, 'PlaywrightPersistentBrowser'):
    from ._playwright_browser import PlaywrightPersistentBrowser


# Public API of the package. The guarded symbols are listed even when their import failed.
__all__ = [
    'BrowserPool',
    'BrowserType',
    'CrawleePage',
    'PlaywrightBrowserController',
    'PlaywrightBrowserPlugin',
    'PlaywrightPersistentBrowser',
]


================================================
FILE: src/crawlee/browsers/_browser_controller.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.10.1/packages/browser-pool/src/abstract-classes/browser-controller.ts

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from collections.abc import Mapping
    from datetime import datetime, timedelta

    from playwright.async_api import Page

    from crawlee.browsers._types import BrowserType
    from crawlee.proxy_configuration import ProxyInfo


@docs_group('Browser management')
class BrowserController(ABC):
    """An abstract base class for managing a browser instance and its pages.

    Concrete controllers implement every abstract property and method declared below.
    """

    AUTOMATION_LIBRARY: str | None = None
    """The name of the automation library that the controller is using."""

    @property
    @abstractmethod
    def pages(self) -> list[Page]:
        """Return the list of opened pages."""

    @property
    @abstractmethod
    def total_opened_pages(self) -> int:
        """Return the total number of pages opened since the browser was launched."""

    @property
    @abstractmethod
    def pages_count(self) -> int:
        """Return the number of currently open pages."""

    @property
    @abstractmethod
    def last_page_opened_at(self) -> datetime:
        """Return the time when the last page was opened."""

    @property
    @abstractmethod
    def idle_time(self) -> timedelta:
        """Return the idle time of the browser controller."""

    @property
    @abstractmethod
    def has_free_capacity(self) -> bool:
        """Return whether the browser has free capacity to open a new page."""

    @property
    @abstractmethod
    def is_browser_connected(self) -> bool:
        """Return whether the browser is connected."""

    @property
    @abstractmethod
    def browser_type(self) -> BrowserType:
        """Return the type of the browser."""

    @abstractmethod
    async def new_page(
        self,
        browser_new_context_options: Mapping[str, Any] | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> Page:
        """Create a new page with the given context options.

        Args:
            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
            proxy_info: The proxy configuration to use for the new page.

        Returns:
            Page: The newly created page.

        Raises:
            ValueError: If the browser has reached the maximum number of open pages.
        """

    @abstractmethod
    async def close(self, *, force: bool = False) -> None:
        """Close the browser.

        Args:
            force: Whether to force close all open pages before closing the browser.

        Raises:
            ValueError: If there are still open pages when trying to close the browser.
        """


================================================
FILE: src/crawlee/browsers/_browser_plugin.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.10.1/packages/browser-pool/src/abstract-classes/browser-plugin.ts

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from collections.abc import Mapping
    from types import TracebackType

    from crawlee.browsers._browser_controller import BrowserController
    from crawlee.browsers._types import BrowserType


@docs_group('Browser management')
class BrowserPlugin(ABC):
    """An abstract base class for browser plugins.

    Browser plugins act as wrappers around browser automation tools like Playwright,
    providing a unified interface for interacting with browsers. A plugin is an asynchronous context manager
    and creates browsers through `new_browser`.
    """

    AUTOMATION_LIBRARY: str | None = None
    """The name of the automation library that the plugin is managing."""

    @property
    @abstractmethod
    def active(self) -> bool:
        """Indicate whether the context is active."""

    @property
    @abstractmethod
    def browser_type(self) -> BrowserType:
        """Return the browser type name."""

    @property
    @abstractmethod
    def browser_launch_options(self) -> Mapping[str, Any]:
        """Return the options for the `browser.launch` method.

        Keyword arguments to pass to the browser launch method. These options are provided directly to Playwright's
        `browser_type.launch` method. For more details, refer to the Playwright documentation:
        https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
        """

    @property
    @abstractmethod
    def browser_new_context_options(self) -> Mapping[str, Any]:
        """Return the options for the `browser.new_context` method.

        Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's
        `browser.new_context` method. For more details, refer to the Playwright documentation:
        https://playwright.dev/python/docs/api/class-browser#browser-new-context.
        """

    @property
    @abstractmethod
    def max_open_pages_per_browser(self) -> int:
        """Return the maximum number of pages that can be opened in a single browser."""

    @abstractmethod
    async def __aenter__(self) -> BrowserPlugin:
        """Enter the context manager and initialize the browser plugin.

        Raises:
            RuntimeError: If the context manager is already active.
        """

    @abstractmethod
    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Exit the context manager and close the browser plugin.

        Raises:
            RuntimeError: If the context manager is not active.
        """

    @abstractmethod
    async def new_browser(self) -> BrowserController:
        """Create a new browser instance.

        Returns:
            A new browser instance wrapped in a controller.
        """


================================================
FILE: src/crawlee/browsers/_browser_pool.py
================================================
# Inspiration: https://github.com/apify/crawlee/tree/v3.10.1/packages/browser-pool/

from __future__ import annotations

import asyncio
import itertools
from collections import defaultdict
from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING, Any
from weakref import WeakValueDictionary

from crawlee._utils.context import ensure_context
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.docs import docs_group
from crawlee._utils.recurring_task import RecurringTask
from crawlee.browsers._browser_controller import BrowserController
from crawlee.browsers._playwright_browser_plugin import PlaywrightBrowserPlugin
from crawlee.browsers._types import BrowserType, CrawleePage

if TYPE_CHECKING:
    from collections.abc import Awaitable, Callable, Mapping, Sequence
    from pathlib import Path
    from types import TracebackType

    from crawlee.browsers._browser_plugin import BrowserPlugin
    from crawlee.fingerprint_suite import FingerprintGenerator
    from crawlee.proxy_configuration import ProxyInfo

logger = getLogger(__name__)


@docs_group('Browser management')
class BrowserPool:
    """Manage a pool of browsers and pages, handling their lifecycle and resource allocation.

    The `BrowserPool` is responsible for opening and closing browsers, managing pages within those browsers,
    and handling the overall lifecycle of these resources. It provides flexible configuration via
    constructor options, which include various hooks that allow for the insertion of custom behavior
    at different stages of the browser and page lifecycles.

    The browsers in the pool can be in one of three states: active, inactive, or closed.
    """

    _GENERATED_PAGE_ID_LENGTH = 8
    """The length of the newly generated page ID."""

    def __init__(
        self,
        plugins: Sequence[BrowserPlugin] | None = None,
        *,
        operation_timeout: timedelta = timedelta(seconds=15),
        browser_inactive_threshold: timedelta = timedelta(seconds=10),
        identify_inactive_browsers_interval: timedelta = timedelta(seconds=20),
        close_inactive_browsers_interval: timedelta = timedelta(seconds=30),
        retire_browser_after_page_count: int = 100,
    ) -> None:
        """Initialize a new instance.

        Args:
            plugins: Browser plugins serve as wrappers around various browser automation libraries,
                providing a consistent interface across different libraries. If no plugins are provided,
                a single `PlaywrightBrowserPlugin` with default settings is used.
            operation_timeout: Operations of the underlying automation libraries, such as launching a browser
                or opening a new page, can sometimes get stuck. To prevent `BrowserPool` from becoming unresponsive,
                we add a timeout to these operations.
            browser_inactive_threshold: The period of inactivity after which a browser is considered as inactive.
            identify_inactive_browsers_interval: The interval at which the pool checks the active browsers
                and moves those that have been idle for at least `browser_inactive_threshold` to the inactive
                browsers.
            close_inactive_browsers_interval: The interval at which the pool checks for inactive browsers
                and closes them. The browser is considered as inactive if it has no active pages and has been idle
                for the specified period. The browser is considered as retired if it has no active pages and has total
                pages count greater than or equal to `retire_browser_after_page_count`.
            retire_browser_after_page_count: The maximum number of processed pages after which the browser is considered
                as retired.
        """
        # An empty sequence of plugins is treated the same way as `None`.
        self._plugins = plugins or [PlaywrightBrowserPlugin()]
        self._operation_timeout = operation_timeout
        self._browser_inactive_threshold = browser_inactive_threshold

        self._active_browsers = list[BrowserController]()
        """A list of browsers currently active and being used to open pages."""

        self._inactive_browsers = list[BrowserController]()
        """A list of browsers currently inactive and not being used to open new pages,
        but may still contain open pages."""

        # The recurring tasks are only created here, they are started in `__aenter__`.
        self._identify_inactive_browsers_task = RecurringTask(
            self._identify_inactive_browsers,
            identify_inactive_browsers_interval,
        )

        self._close_inactive_browsers_task = RecurringTask(
            self._close_inactive_browsers,
            close_inactive_browsers_interval,
        )

        self._total_pages_count = 0
        self._retire_browser_after_page_count = retire_browser_after_page_count
        self._pages = WeakValueDictionary[str, CrawleePage]()  # Track the pages in the pool
        self._plugins_cycle = itertools.cycle(self._plugins)  # Cycle through the plugins

        # Hooks registered through the `*_hook` methods, executed in the order of their registration.
        self._pre_page_create_hooks: list[
            Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
        ] = []
        self._post_page_create_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
        self._pre_page_close_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
        self._post_page_close_hooks: list[Callable[[str, BrowserController], Awaitable[None]]] = []

        # Flag to indicate the context state.
        self._active = False

    @classmethod
    def with_default_plugin(
        cls,
        *,
        browser_type: BrowserType | None = None,
        user_data_dir: str | Path | None = None,
        browser_launch_options: Mapping[str, Any] | None = None,
        browser_new_context_options: Mapping[str, Any] | None = None,
        headless: bool | None = None,
        fingerprint_generator: FingerprintGenerator | None = None,
        use_incognito_pages: bool | None = False,
        **kwargs: Any,
    ) -> BrowserPool:
        """Create a pool that uses one `PlaywrightBrowserPlugin` built from the provided options.

        Args:
            browser_type: The type of browser to launch:
                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
                    the system.
            user_data_dir: Path to a user data directory, which stores browser session data like cookies
                and local storage.
            browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
                documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
            headless: Whether to run the browser in headless mode. When provided, it overrides the `headless` key
                of `browser_launch_options`.
            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
                to generate browser fingerprints together with consistent headers.
            use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                own context that is destroyed once the page is closed or crashes.
            kwargs: Additional arguments for default constructor.
        """
        # Work on a copy of the launch options, so that the mapping provided by the caller is never modified.
        launch_options: dict[str, Any] = dict(browser_launch_options) if browser_launch_options else {}
        if headless is not None:
            launch_options['headless'] = headless

        plugin_options: dict[str, Any] = {
            'browser_launch_options': launch_options,
            'browser_new_context_options': browser_new_context_options or {},
        }

        # The remaining options are forwarded only when set, otherwise the plugin's own defaults apply.
        if use_incognito_pages is not None:
            plugin_options['use_incognito_pages'] = use_incognito_pages
        if browser_type:
            plugin_options['browser_type'] = browser_type
        if user_data_dir:
            plugin_options['user_data_dir'] = user_data_dir

        default_plugin = PlaywrightBrowserPlugin(
            **plugin_options,
            fingerprint_generator=fingerprint_generator,
        )
        return cls(plugins=[default_plugin], **kwargs)

    @property
    def plugins(self) -> Sequence[BrowserPlugin]:
        """Return the browser plugins used by the pool."""
        return self._plugins

    @property
    def active_browsers(self) -> Sequence[BrowserController]:
        """Return the active browsers in the pool.

        The internal list itself is returned, not a copy of it.
        """
        return self._active_browsers

    @property
    def inactive_browsers(self) -> Sequence[BrowserController]:
        """Return the inactive browsers in the pool.

        The internal list itself is returned, not a copy of it.
        """
        return self._inactive_browsers

    @property
    def pages(self) -> Mapping[str, CrawleePage]:
        """Return the pages in the pool, keyed by their page ID.

        The pages are tracked in a `WeakValueDictionary`.
        """
        return self._pages

    @property
    def total_pages_count(self) -> int:
        """Return the total number of pages opened since the browser pool was launched."""
        return self._total_pages_count

    @property
    def active(self) -> bool:
        """Indicate whether the context is active."""
        return self._active

    async def __aenter__(self) -> BrowserPool:
        """Enter the context manager and initialize all browser plugins.

        Raises:
            RuntimeError: If the context manager is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        self._active = True
        # Start the recurring tasks for identifying and closing inactive browsers
        self._identify_inactive_browsers_task.start()
        self._close_inactive_browsers_task.start()

        timeout = self._operation_timeout.total_seconds()

        try:
            for plugin in self._plugins:
                await asyncio.wait_for(plugin.__aenter__(), timeout)
        except asyncio.TimeoutError:
            logger.warning(f'Initializing of the browser plugin {plugin} timed out, will be skipped.')

        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Exit the context manager and close all browser plugins.

        Raises:
            RuntimeError: If the context manager is not active.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        await self._identify_inactive_browsers_task.stop()
        await self._close_inactive_browsers_task.stop()

        for browser in self._active_browsers + self._inactive_browsers:
            await browser.close(force=True)
        self._active_browsers.clear()
        self._inactive_browsers.clear()

        for plugin in self._plugins:
            await plugin.__aexit__(exc_type, exc_value, exc_traceback)

        self._active = False

    @ensure_context
    async def new_page(
        self,
        *,
        page_id: str | None = None,
        browser_plugin: BrowserPlugin | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> CrawleePage:
        """Open a new page in a browser managed by the pool.

        Args:
            page_id: The ID to assign to the new page. If not provided, a random ID is generated.
            browser_plugin: The browser plugin to use for creating the new page. If not provided, the next plugin
                in the rotation is used.
            proxy_info: The proxy configuration to use for the new page.

        Returns:
            The newly created browser page.

        Raises:
            ValueError: If a page with the given ID already exists, or if the given plugin is not one
                of the plugins of the pool.
        """
        if page_id in self.pages:
            raise ValueError(f'Page with ID: {page_id} already exists.')

        if browser_plugin and browser_plugin not in self.plugins:
            raise ValueError('Provided browser_plugin is not one of the plugins used by BrowserPool.')

        # Fall back to a generated ID and to the plugin rotation for the values that were not provided.
        resolved_page_id = page_id if page_id else crypto_random_object_id(self._GENERATED_PAGE_ID_LENGTH)
        selected_plugin = browser_plugin if browser_plugin else next(self._plugins_cycle)

        return await self._get_new_page(resolved_page_id, selected_plugin, proxy_info)

    @ensure_context
    async def new_page_with_each_plugin(self) -> Sequence[CrawleePage]:
        """Open one new page per browser plugin of the pool.

        The pages are requested for all the plugins concurrently. It is handy for running the same script
        in several environments at once, typically for testing or website analysis.

        Returns:
            A list of newly created pages, one for each plugin in the pool, in the order of the plugins.
        """
        pending_pages = []
        for plugin in self._plugins:
            pending_pages.append(self.new_page(browser_plugin=plugin))

        return await asyncio.gather(*pending_pages)

    async def _get_new_page(
        self,
        page_id: str,
        plugin: BrowserPlugin,
        proxy_info: ProxyInfo | None,
    ) -> CrawleePage:
        """Initialize a new browser page using the specified plugin.

        Select a browser with available capacity or launch a new one if needed. Create a new page in the selected
        browser with the provided proxy settings.

        Args:
            page_id: The ID under which the new page is tracked in the pool.
            plugin: The plugin used to pick or launch the browser.
            proxy_info: The proxy configuration passed to the pre-create hooks and to the browser controller.

        Returns:
            The newly created page wrapped in `CrawleePage`.

        Raises:
            TimeoutError: If launching the browser or creating the page exceeds the operation timeout.
            RuntimeError: If a `RuntimeError` is raised while launching the browser, running the pre-create hooks
                or creating the page.
        """
        timeout = self._operation_timeout.total_seconds()
        browser_controller = self._pick_browser_with_free_capacity(plugin)

        try:
            if not browser_controller:
                browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
            # Hand the hooks a mutable copy of the plugin's options.
            browser_new_context_options = dict(plugin.browser_new_context_options)

            # The hooks are not covered by the operation timeout.
            await self._execute_hooks(
                self._pre_page_create_hooks, page_id, browser_controller, browser_new_context_options, proxy_info
            )

            page = await asyncio.wait_for(
                browser_controller.new_page(
                    browser_new_context_options=browser_new_context_options,
                    proxy_info=proxy_info,
                ),
                timeout,
            )
        except asyncio.TimeoutError as exc:
            raise TimeoutError(f'Creating a new page with plugin {plugin} timed out.') from exc
        except RuntimeError as exc:
            # Every `RuntimeError` raised in the block above is reported this way, including those coming
            # from the pre-create hooks.
            raise RuntimeError('Browser pool is not initialized.') from exc

        # Stop handing out new pages from a browser that has reached its page limit.
        if browser_controller.total_opened_pages >= self._retire_browser_after_page_count:
            self._retire_browser(browser_controller)

        crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type)
        self._pages[page_id] = crawlee_page
        self._total_pages_count += 1

        await self._execute_hooks(self._post_page_create_hooks, crawlee_page, browser_controller)

        # The close method is wrapped only after the post-create hooks have finished.
        self._override_page_close(crawlee_page, browser_controller)

        return crawlee_page

    def _pick_browser_with_free_capacity(
        self,
        browser_plugin: BrowserPlugin,
    ) -> BrowserController | None:
        """Pick a browser with free capacity that matches the specified plugin."""
        for browser in self._active_browsers:
            if browser.has_free_capacity and browser.AUTOMATION_LIBRARY == browser_plugin.AUTOMATION_LIBRARY:
                return browser

        return None

    def _retire_browser(self, browser: BrowserController) -> None:
        """Retire a browser by moving it to the inactive list."""
        if browser in self._active_browsers:
            self._active_browsers.remove(browser)
            self._inactive_browsers.append(browser)

    async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserController:
        """Launch a new browser instance using the specified plugin."""
        browser = await plugin.new_browser()
        self._active_browsers.append(browser)
        return browser

    def _identify_inactive_browsers(self) -> None:
        """Identify inactive browsers and move them to the inactive list if their idle time exceeds the threshold."""
        for browser in list(self._active_browsers):
            if browser.idle_time >= self._browser_inactive_threshold:
                self._active_browsers.remove(browser)
                self._inactive_browsers.append(browser)

    async def _close_inactive_browsers(self) -> None:
        """Close the browsers that have no active pages and have been idle for a certain period."""
        for browser in list(self._inactive_browsers):
            if not browser.pages:
                await browser.close()
                self._inactive_browsers.remove(browser)

    async def _execute_hooks(self, hooks: list[Callable[..., Awaitable[None]]], *args: Any) -> None:
        """Execute the provided hooks with the given arguments."""
        for hook in hooks:
            await hook(*args)

    def _override_page_close(self, crawlee_page: CrawleePage, browser_controller: BrowserController) -> None:
        """Override the page's close method to execute pre and post close hooks."""
        if self._pre_page_close_hooks or self._post_page_close_hooks:
            original_close = crawlee_page.page.close

            async def close_with_hooks(*args: Any, **kwargs: Any) -> None:
                try:
                    await self._execute_hooks(self._pre_page_close_hooks, crawlee_page, browser_controller)
                finally:
                    await original_close(*args, **kwargs)
                await self._execute_hooks(self._post_page_close_hooks, crawlee_page.id, browser_controller)

            crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks

    def pre_page_create_hook(
        self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
    ) -> Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]:
        """Register a hook to be called just before a new page is created.

        The hook receives the page ID, `BrowserController`, `browser_new_context_options`, and `ProxyInfo`.
        Note that depending on the `BrowserController` implementation, `browser_new_context_options` may not
        apply to every page individually. For example, `PlaywrightBrowserController` with
        ``use_incognito_pages=False`` shares a single context across all pages, so the options are applied
        only when the context is first created.

        The hook is returned unchanged, so the method can be used as a decorator.
        """
        self._pre_page_create_hooks.append(hook)

        return hook

    def post_page_create_hook(
        self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]
    ) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:
        """Register a hook to be called right after a new page is created.

        The hook receives the newly created `CrawleePage` and the `BrowserController`. Use it to apply
        changes to all pages, such as injecting scripts or configuring request interception.

        The hook is returned unchanged, so the method can be used as a decorator.
        """
        self._post_page_create_hooks.append(hook)

        return hook

    def pre_page_close_hook(
        self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]
    ) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:
        """Register a hook to be called just before a page is closed.

        The hook receives the `CrawleePage` and the `BrowserController`. Use it to collect last-second data,
        such as taking a screenshot or saving page state before the page is destroyed.

        The close method of a page is wrapped at the time the page is created, and only if at least one
        close hook is registered by then. Register the close hooks before opening the pages they should apply to.

        The hook is returned unchanged, so the method can be used as a decorator.
        """
        self._pre_page_close_hooks.append(hook)

        return hook

    def post_page_close_hook(
        self, hook: Callable[[str, BrowserController], Awaitable[None]]
    ) -> Callable[[str, BrowserController], Awaitable[None]]:
        """Register a hook to be called right after a page is closed.

        The hook receives the page ID and the `BrowserController`. Use it for cleanup or logging
        after a page's lifecycle ends.

        The close method of a page is wrapped at the time the page is created, and only if at least one
        close hook is registered by then. Register the close hooks before opening the pages they should apply to.

        The hook is returned unchanged, so the method can be used as a decorator.
        """
        self._post_page_close_hooks.append(hook)

        return hook


================================================
FILE: src/crawlee/browsers/_playwright_browser.py
================================================
from __future__ import annotations

import asyncio
import shutil
import tempfile
from logging import getLogger
from pathlib import Path
from typing import TYPE_CHECKING, Any

from playwright.async_api import Browser
from typing_extensions import override

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from playwright.async_api import BrowserContext, BrowserType, CDPSession, Page

logger = getLogger(__name__)


@docs_group('Browser management')
class PlaywrightPersistentBrowser(Browser):
    """A wrapper for Playwright's `Browser` that operates with a persistent context.

    It utilizes Playwright's persistent browser context feature, maintaining user data across sessions.
    While it follows the same interface as Playwright's `Browser` class, there is no abstract base class
    enforcing this. There is a limitation that only a single persistent context is allowed.

    Nothing is launched on construction; the browser process is started lazily by `new_context`.
    """

    _TMP_DIR_PREFIX = 'apify-playwright-firefox-taac-'

    def __init__(
        self,
        browser_type: BrowserType,
        user_data_dir: str | Path | None,
        browser_launch_options: dict[str, Any],
    ) -> None:
        """Store the launch configuration.

        Args:
            browser_type: Playwright browser type used to launch the persistent context.
            user_data_dir: Directory holding the browser profile. When falsy, a temporary directory is
                created in `new_context` and removed again when the context is closed.
            browser_launch_options: Keyword arguments merged with the context options and passed to
                `launch_persistent_context`.
        """
        self._browser_type = browser_type
        self._browser_launch_options = browser_launch_options
        self._user_data_dir = user_data_dir
        # Set only when this object created the profile directory itself and therefore owns its removal.
        self._temp_dir: Path | None = None

        self._context: BrowserContext | None = None
        self._is_connected = True

    @property
    def browser_type(self) -> BrowserType:
        """Return the Playwright browser type this wrapper launches."""
        return self._browser_type

    @property
    def contexts(self) -> list[BrowserContext]:
        """Return a list with the single persistent context, or an empty list if none exists."""
        return [self._context] if self._context else []

    def is_connected(self) -> bool:
        """Return `True` until `close` has been called."""
        return self._is_connected

    async def new_context(self, **context_options: Any) -> BrowserContext:
        """Create persistent context instead of regular one. Merge launch options with context options.

        Args:
            context_options: Context options; on key collisions they take precedence over the launch options.

        Returns:
            The launched persistent browser context.

        Raises:
            RuntimeError: If a persistent context already exists.
        """
        if self._context:
            raise RuntimeError('Persistent browser can have only one context')

        launch_options = self._browser_launch_options | context_options

        if self._user_data_dir:
            user_data_dir = self._user_data_dir
        else:
            user_data_dir = tempfile.mkdtemp(prefix=self._TMP_DIR_PREFIX)
            self._temp_dir = Path(user_data_dir)

        try:
            self._context = await self._browser_type.launch_persistent_context(
                user_data_dir=user_data_dir, **launch_options
            )
        except BaseException:
            # The launch failed, so no context will ever emit `close` for this directory, and a retry
            # would overwrite `_temp_dir` with a fresh one. Remove the directory now to avoid leaking it.
            # This is a no-op when the caller supplied `user_data_dir`, because `_temp_dir` is then `None`.
            await self._delete_temp_dir(None)
            self._temp_dir = None
            raise

        if self._temp_dir:
            # Clean up the temporary profile even when the context is closed directly, not via `close`.
            self._context.on('close', self._delete_temp_dir)

        return self._context

    async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
        """Remove the temporary profile directory, if this object created one and it still exists.

        The unused positional argument lets the method double as a handler for the context `close` event.
        """
        if self._temp_dir and self._temp_dir.exists():
            temp_dir = self._temp_dir
            # `rmtree` is blocking, so run it in a worker thread; errors are ignored (best-effort cleanup).
            await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)

    @override
    async def close(self, **kwargs: Any) -> None:
        """Close browser by closing its context."""
        if self._context:
            await self._context.close()
            self._context = None
        self._is_connected = False
        # NOTE(review): the short pause presumably gives the browser process time to release files in the
        # profile directory before it is removed - confirm before changing.
        await asyncio.sleep(0.1)
        await self._delete_temp_dir(self._context)

    @property
    @override
    def version(self) -> str:
        """Not supported by the persistent browser wrapper."""
        raise NotImplementedError('Persistent browser does not support version.')

    async def new_page(self, **kwargs: Any) -> Page:
        """Not supported by the persistent browser wrapper."""
        raise NotImplementedError('Persistent browser does not support new page.')

    @override
    async def new_browser_cdp_session(self) -> CDPSession:
        """Not supported by the persistent browser wrapper."""
        raise NotImplementedError('Persistent browser does not support new browser CDP session.')

    async def start_tracing(self, **kwargs: Any) -> None:
        """Not supported by the persistent browser wrapper."""
        raise NotImplementedError('Persistent browser does not support tracing.')

    async def stop_tracing(self, **kwargs: Any) -> bytes:
        """Not supported by the persistent browser wrapper."""
        raise NotImplementedError('Persistent browser does not support tracing.')


================================================
FILE: src/crawlee/browsers/_playwright_browser_controller.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.10.1/packages/browser-pool/src/playwright/playwright-controller.ts

from __future__ import annotations

from asyncio import Lock
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any, cast

from browserforge.injectors.playwright import AsyncNewContext
from playwright.async_api import Browser, BrowserContext, Page, ProxySettings
from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee.browsers._browser_controller import BrowserController
from crawlee.fingerprint_suite import HeaderGenerator
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type

if TYPE_CHECKING:
    from collections.abc import Mapping

    from crawlee.browsers._playwright_browser import PlaywrightPersistentBrowser
    from crawlee.browsers._types import BrowserType
    from crawlee.fingerprint_suite import FingerprintGenerator
    from crawlee.proxy_configuration import ProxyInfo

from logging import getLogger

logger = getLogger(__name__)


@docs_group('Browser management')
class PlaywrightBrowserController(BrowserController):
    """Controller for managing Playwright browser instances and their pages.

    It provides methods to control browser instances, manage their pages, and handle context-specific
    configurations. It enforces limits on the number of open pages and tracks their state.
    """

    AUTOMATION_LIBRARY = 'playwright'
    _DEFAULT_HEADER_GENERATOR = HeaderGenerator()

    def __init__(
        self,
        browser: Browser | PlaywrightPersistentBrowser,
        *,
        max_open_pages_per_browser: int = 20,
        use_incognito_pages: bool = False,
        header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
        fingerprint_generator: FingerprintGenerator | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            browser: The browser instance to control.
            max_open_pages_per_browser: The maximum number of pages that can be open at the same time.
            use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                own context that is destroyed once the page is closed or crashes.
            header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
                requests made by the browser. By default, a predefined header generator is used. Set to `None` to
                disable automatic header modifications.
            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
                to generate browser fingerprints together with consistent headers.

        Raises:
            ValueError: If `fingerprint_generator` is combined with a non-default `header_generator`.
        """
        # The identity check distinguishes "argument left at its default" from "explicitly provided".
        if fingerprint_generator and header_generator is not self._DEFAULT_HEADER_GENERATOR:
            raise ValueError(
                'Do not use `header_generator` and `fingerprint_generator` arguments at the same time. '
                'Choose only one. `fingerprint_generator` generates headers as well.'
            )
        self._browser = browser
        self._max_open_pages_per_browser = max_open_pages_per_browser
        self._header_generator = header_generator
        self._fingerprint_generator = fingerprint_generator
        self._use_incognito_pages = use_incognito_pages

        # Reuse a context the browser already has; otherwise the shared context is created on first use.
        self._browser_context: BrowserContext | None = (
            self._browser.contexts[0] if len(self._browser.contexts) > 0 else None
        )
        self._pages = list[Page]()
        self._last_page_opened_at = datetime.now(timezone.utc)

        self._total_opened_pages = 0
        # Pages whose creation is in flight; counted against the capacity limit in `has_free_capacity`.
        self._opening_pages_count = 0

        self._context_creation_lock: Lock | None = None

    async def _get_context_creation_lock(self) -> Lock:
        """Get context checking and creation lock.

        It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
        memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
        The lock is created on first use and then reused.
        """
        if self._context_creation_lock:
            return self._context_creation_lock
        self._context_creation_lock = Lock()
        return self._context_creation_lock

    @property
    @override
    def pages(self) -> list[Page]:
        """Return the list of currently open pages tracked by this controller."""
        return self._pages

    @property
    @override
    def total_opened_pages(self) -> int:
        """Return the number of pages successfully opened over the controller's lifetime."""
        return self._total_opened_pages

    @property
    @override
    def pages_count(self) -> int:
        """Return the number of currently open pages."""
        return len(self._pages)

    @property
    @override
    def last_page_opened_at(self) -> datetime:
        """Return the UTC time the last page was opened (initially the controller's creation time)."""
        return self._last_page_opened_at

    @property
    @override
    def idle_time(self) -> timedelta:
        """Return the time elapsed since the last page was opened."""
        return datetime.now(timezone.utc) - self._last_page_opened_at

    @property
    @override
    def has_free_capacity(self) -> bool:
        """Return whether another page may be opened, counting pages that are still being created."""
        return (self.pages_count + self._opening_pages_count) < self._max_open_pages_per_browser

    @property
    @override
    def is_browser_connected(self) -> bool:
        """Return whether the underlying browser reports itself as connected."""
        return self._browser.is_connected()

    @property
    @override
    def browser_type(self) -> BrowserType:
        """Return the name of the underlying Playwright browser type."""
        return cast('BrowserType', self._browser.browser_type.name)

    @override
    async def new_page(
        self,
        browser_new_context_options: Mapping[str, Any] | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> Page:
        """Create a new page with the given context options.

        Args:
            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
            proxy_info: The proxy configuration to use for the new page.

        Returns:
            Page: The newly created page.

        Raises:
            ValueError: If the browser has reached the maximum number of open pages.
        """
        if not self.has_free_capacity:
            raise ValueError('Cannot open more pages in this browser.')

        self._opening_pages_count += 1

        try:
            if self._use_incognito_pages:
                # In incognito there is exactly one context per one page. Create new context for each new page.
                new_context = await self._create_browser_context(
                    browser_new_context_options=browser_new_context_options,
                    proxy_info=proxy_info,
                )
                try:
                    page = await new_context.new_page()
                except BaseException:
                    # Nothing else references this context, so it would stay open until the whole browser
                    # is closed. Close it here, but never let a cleanup failure mask the original error.
                    try:
                        await new_context.close()
                    except Exception:
                        logger.debug('Failed to close the browser context after page creation failed.', exc_info=True)
                    raise
            else:
                async with await self._get_context_creation_lock():
                    if not self._browser_context:
                        self._browser_context = await self._create_browser_context(
                            browser_new_context_options=browser_new_context_options,
                            proxy_info=proxy_info,
                        )
                page = await self._browser_context.new_page()

            # Handle page close event
            page.on(event='close', f=self._on_page_close)

            # Update internal state
            self._pages.append(page)
            self._last_page_opened_at = datetime.now(timezone.utc)

            self._total_opened_pages += 1
        finally:
            self._opening_pages_count -= 1
        return page

    @override
    async def close(self, *, force: bool = False) -> None:
        """Close the browser.

        Args:
            force: Whether to force close all open pages before closing the browser.

        Raises:
            ValueError: If there are still open pages when trying to close the browser.
        """
        if self.pages_count > 0 and not force:
            raise ValueError('Cannot close the browser while there are open pages.')

        try:
            if self._browser_context:
                await self._browser_context.close()
        finally:
            # Close the browser even when closing the shared context fails, so it is never left running.
            await self._browser.close()

    def _on_page_close(self, page: Page) -> None:
        """Handle actions after a page is closed."""
        self._pages.remove(page)

    async def _create_browser_context(
        self,
        browser_new_context_options: Mapping[str, Any] | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> BrowserContext:
        """Create a new browser context with the specified proxy settings.

        Create context with fingerprints and headers using `self._fingerprint_generator` if available.
        Create context without fingerprints, but with headers based on `self._header_generator` if available.
        Create context without headers and without fingerprints if neither `self._header_generator` nor
        `self._fingerprint_generator` is available.

        Args:
            browser_new_context_options: Options for the new context. The mapping is copied, not mutated.
            proxy_info: Proxy to use; when given it replaces any `proxy` entry in the options.

        Returns:
            The newly created browser context.
        """
        # Work on a copy so the caller's mapping is never modified.
        browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
        if proxy_info:
            if browser_new_context_options.get('proxy'):
                logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

            browser_new_context_options['proxy'] = ProxySettings(
                server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
                username=proxy_info.username,
                password=proxy_info.password,
            )

        if self._fingerprint_generator:
            return await AsyncNewContext(
                browser=self._browser,
                fingerprint=self._fingerprint_generator.generate(),
                **browser_new_context_options,
            )

        if self._header_generator:
            extra_http_headers = dict(
                self._header_generator.get_specific_headers(
                    header_names={
                        'Accept',
                        'Accept-Language',
                        'User-Agent',
                        'sec-ch-ua',
                        'sec-ch-ua-mobile',
                        'sec-ch-ua-platform',
                    },
                    browser_type=fingerprint_browser_type_from_playwright_browser_type(self.browser_type),
                )
            )
        else:
            extra_http_headers = None

        # Explicitly provided `extra_http_headers` win over the generated ones.
        browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
            'extra_http_headers', extra_http_headers
        )
        return await self._browser.new_context(**browser_new_context_options)


================================================
FILE: src/crawlee/browsers/_playwright_browser_plugin.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.10.1/packages/browser-pool/src/playwright/playwright-plugin.ts

from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING, Any

from playwright.async_api import Playwright, async_playwright
from typing_extensions import override

from crawlee import service_locator
from crawlee._utils.context import ensure_context
from crawlee._utils.docs import docs_group
from crawlee.browsers._browser_plugin import BrowserPlugin
from crawlee.browsers._playwright_browser import PlaywrightPersistentBrowser
from crawlee.browsers._playwright_browser_controller import PlaywrightBrowserController

if TYPE_CHECKING:
    from collections.abc import Mapping
    from pathlib import Path
    from types import TracebackType

    from playwright.async_api._generated import Browser

    from crawlee.browsers._types import BrowserType
    from crawlee.fingerprint_suite import FingerprintGenerator

logger = getLogger(__name__)


@docs_group('Browser management')
class PlaywrightBrowserPlugin(BrowserPlugin):
    """A plugin for managing Playwright automation library.

    It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
    for creating new browser instances and provides a unified interface for interacting with different browser types
    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
    browser instance, ensuring that resource limits are respected.
    """

    AUTOMATION_LIBRARY = 'playwright'

    def __init__(
        self,
        *,
        browser_type: BrowserType = 'chromium',
        user_data_dir: str | Path | None = None,
        browser_launch_options: dict[str, Any] | None = None,
        browser_new_context_options: dict[str, Any] | None = None,
        max_open_pages_per_browser: int = 20,
        use_incognito_pages: bool = False,
        fingerprint_generator: FingerprintGenerator | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            browser_type: The type of browser to launch:
                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
                    the system.
            user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                storage.
            browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
                documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
            max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.
                Once reached, a new browser instance will be launched to handle the excess.
            use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                own context that is destroyed once the page is closed or crashes.
            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
                to generate browser fingerprints together with consistent headers.
        """
        config = service_locator.get_configuration()

        # Default browser launch options are based on the configuration.
        default_launch_browser_options: dict[str, Any] = {
            'headless': config.headless,
            'executable_path': config.default_browser_path,
            'chromium_sandbox': not config.disable_browser_sandbox,
        }
        explicit_browser_launch_options = browser_launch_options or {}

        # Map 'chrome' to 'chromium' with the 'chrome' channel.
        if browser_type == 'chrome':
            browser_type = 'chromium'
            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
            default_launch_browser_options['channel'] = 'chrome'

            if executable_path := explicit_browser_launch_options.get(
                'executable_path', default_launch_browser_options.get('executable_path')
            ):
                logger.debug(
                    f"Using browser executable from {executable_path}, which takes precedence over 'chrome' channel."
                )

        self._browser_type: BrowserType = browser_type
        # Explicit options override the configuration-derived defaults on key collisions.
        self._browser_launch_options: dict[str, Any] = default_launch_browser_options | explicit_browser_launch_options
        self._browser_new_context_options = browser_new_context_options or {}
        self._max_open_pages_per_browser = max_open_pages_per_browser
        self._use_incognito_pages = use_incognito_pages
        self._user_data_dir = user_data_dir

        self._playwright_context_manager = async_playwright()
        self._playwright: Playwright | None = None

        # Flag to indicate the context state.
        self._active = False

        self._fingerprint_generator = fingerprint_generator

    @property
    @override
    def active(self) -> bool:
        """Return whether the plugin is currently inside its async context."""
        return self._active

    @property
    @override
    def browser_type(self) -> BrowserType:
        """Return the browser type; 'chrome' is reported as 'chromium' because of the mapping in `__init__`."""
        return self._browser_type

    @property
    @override
    def browser_launch_options(self) -> Mapping[str, Any]:
        """Return the options for the `browser.launch` method.

        Keyword arguments to pass to the browser launch method. These options are provided directly to Playwright's
        `browser_type.launch` method. For more details, refer to the Playwright documentation:
        https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
        """
        return self._browser_launch_options

    @property
    @override
    def browser_new_context_options(self) -> Mapping[str, Any]:
        """Return the options for the `browser.new_context` method.

        Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's
        `browser.new_context` method. For more details, refer to the Playwright documentation:
        https://playwright.dev/python/docs/api/class-browser#browser-new-context.
        """
        return self._browser_new_context_options

    @property
    @override
    def max_open_pages_per_browser(self) -> int:
        """Return the maximum number of pages allowed per browser instance."""
        return self._max_open_pages_per_browser

    @override
    async def __aenter__(self) -> PlaywrightBrowserPlugin:
        """Start Playwright and mark the plugin as active.

        Raises:
            RuntimeError: If the plugin is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        # Set the flag before awaiting so that a concurrent second entry is rejected.
        self._active = True
        try:
            self._playwright = await self._playwright_context_manager.__aenter__()
        except BaseException:
            # Startup failed: roll the state back so the plugin is not stuck "active" and can be entered again.
            self._playwright_context_manager = async_playwright()
            self._active = False
            raise
        return self

    @override
    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Stop Playwright and mark the plugin as inactive.

        Raises:
            RuntimeError: If the plugin is not active.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        try:
            await self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback)
        finally:
            # Reset the state even if Playwright shutdown raised, so the plugin can be reused and no stale
            # Playwright handle is kept around. A fresh context manager is prepared for the next entry.
            self._playwright_context_manager = async_playwright()
            self._playwright = None
            self._active = False

    @override
    @ensure_context
    async def new_browser(self) -> PlaywrightBrowserController:
        """Create a new browser wrapped in a `PlaywrightBrowserController`.

        With `use_incognito_pages` enabled a regular browser is launched right away using the launch options.
        Otherwise a `PlaywrightPersistentBrowser` is created, which launches its persistent context later, when
        the first context is requested.

        Raises:
            RuntimeError: If Playwright has not been started.
            ValueError: If the configured browser type is not supported.
        """
        if not self._playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        if self._browser_type == 'chromium':
            browser_type = self._playwright.chromium
        elif self._browser_type == 'firefox':
            browser_type = self._playwright.firefox
        elif self._browser_type == 'webkit':
            browser_type = self._playwright.webkit
        else:
            raise ValueError(f'Invalid browser type: {self._browser_type}')

        if self._use_incognito_pages:
            browser: Browser | PlaywrightPersistentBrowser = await browser_type.launch(**self._browser_launch_options)
        else:
            browser = PlaywrightPersistentBrowser(browser_type, self._user_data_dir, self._browser_launch_options)

        return PlaywrightBrowserController(
            browser,
            use_incognito_pages=self._use_incognito_pages,
            max_open_pages_per_browser=self._max_open_pages_per_browser,
            fingerprint_generator=self._fingerprint_generator,
        )


================================================
FILE: src/crawlee/browsers/_types.py
================================================
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Literal

if TYPE_CHECKING:
    from playwright.async_api import Page

BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome']


@dataclass
class CrawleePage:
    """Represents a page object within a browser, with additional metadata for tracking and management.

    Attributes:
        id: Identifier string of the page.
        browser_type: Browser the page belongs to; one of the `BrowserType` literal values.
        page: The underlying Playwright `Page` object.
    """

    id: str
    browser_type: BrowserType
    page: Page


================================================
FILE: src/crawlee/browsers/py.typed
================================================


================================================
FILE: src/crawlee/configuration.py
================================================
from __future__ import annotations

from datetime import timedelta
from typing import TYPE_CHECKING, Annotated

from pydantic import AliasChoices, BeforeValidator, Field
from pydantic_settings import BaseSettings, SettingsConfigDict

from crawlee._types import LogLevel
from crawlee._utils.docs import docs_group
from crawlee._utils.models import timedelta_ms

if TYPE_CHECKING:
    from typing_extensions import Self

__all__ = ['Configuration']


@docs_group('Configuration')
class Configuration(BaseSettings):
    """Configuration settings for the Crawlee project.

    This class stores common configurable parameters for Crawlee. Default values are provided for all settings,
    so typically, no adjustments are necessary. However, you may modify settings for specific use cases,
    such as changing the default storage directory, the default storage IDs, the timeout for internal
    operations, and more.

    Settings can also be configured via environment variables, prefixed with `CRAWLEE_`. Most fields additionally
    declare an `apify_`-prefixed alias (and `memory_mbytes` an `actor_`-prefixed one); the accepted names are listed
    in each field's `validation_alias`.
    """

    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
    model_config = SettingsConfigDict(populate_by_name=True)

    # Unlike the other fields, this one declares a plain `alias` rather than a `validation_alias`.
    internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
    """Timeout for the internal asynchronous operations."""

    default_browser_path: Annotated[
        str | None,
        Field(
            validation_alias=AliasChoices(
                'apify_default_browser_path',
                'crawlee_default_browser_path',
            )
        ),
    ] = None
    """Specifies the path to the browser executable. Currently primarily for Playwright-based features. This option
    is passed directly to Playwright's `browser_type.launch` method as `executable_path` argument. For more details,
    refer to the Playwright documentation:
    https://playwright.dev/docs/api/class-browsertype#browser-type-launch.
    """

    disable_browser_sandbox: Annotated[
        bool,
        Field(
            validation_alias=AliasChoices(
                'apify_disable_browser_sandbox',
                'crawlee_disable_browser_sandbox',
            )
        ),
    ] = False
    """Disables the sandbox for the browser. Currently primarily for Playwright-based features. This option
    is passed directly to Playwright's `browser_type.launch` method as `chromium_sandbox`. For more details,
    refer to the Playwright documentation:
    https://playwright.dev/docs/api/class-browsertype#browser-type-launch."""

    # The `BeforeValidator` converts the incoming value to `str` and upper-cases it before it is validated
    # against `LogLevel`, so lower-case input such as 'debug' is accepted.
    log_level: Annotated[
        LogLevel,
        Field(
            validation_alias=AliasChoices(
                'apify_log_level',
                'crawlee_log_level',
            )
        ),
        BeforeValidator(lambda value: str(value).upper()),
    ] = 'INFO'
    """The logging level."""

    purge_on_start: Annotated[
        bool,
        Field(
            validation_alias=AliasChoices(
                'apify_purge_on_start',
                'crawlee_purge_on_start',
            )
        ),
    ] = True
    """Whether to purge the storage on the start. This option is utilized by the storage clients."""

    # NOTE(review): fields typed `timedelta_ms` presumably take their raw value in milliseconds (the alias
    # names end in `_millis`) - confirm against `crawlee._utils.models.timedelta_ms`.
    persist_state_interval: Annotated[
        timedelta_ms,
        Field(
            validation_alias=AliasChoices(
                'apify_persist_state_interval_millis',
                'crawlee_persist_state_interval_millis',
            )
        ),
    ] = timedelta(minutes=1)
    """Interval at which `PersistState` events are emitted. The event ensures the state persistence during
    the crawler run. This option is utilized by the `EventManager`."""

    system_info_interval: Annotated[
        timedelta_ms,
        Field(
            validation_alias=AliasChoices(
                'apify_system_info_interval_millis',
                'crawlee_system_info_interval_millis',
            )
        ),
    ] = timedelta(seconds=1)
    """Interval at which `SystemInfo` events are emitted. The event represents the current status of the system.
    This option is utilized by the `LocalEventManager`."""

    # No range constraint is declared on this ratio (contrast with `available_memory_ratio` below).
    max_used_cpu_ratio: Annotated[
        float,
        Field(
            validation_alias=AliasChoices(
                'apify_max_used_cpu_ratio',
                'crawlee_max_used_cpu_ratio',
            )
        ),
    ] = 0.95
    """The maximum CPU usage ratio. If the CPU usage exceeds this value, the system is considered overloaded.
    This option is used by the `Snapshotter`."""

    # No range constraint is declared on this ratio either.
    max_used_memory_ratio: Annotated[
        float,
        Field(
            validation_alias=AliasChoices(
                'apify_max_used_memory_ratio',
                'crawlee_max_used_memory_ratio',
            )
        ),
    ] = 0.9
    """The maximum memory usage ratio. If the memory usage exceeds this ratio, it is considered overloaded.
    This option is used by the `Snapshotter`."""

    max_event_loop_delay: Annotated[
        timedelta_ms,
        Field(
            validation_alias=AliasChoices(
                'apify_max_event_loop_delay_millis',
                'crawlee_max_event_loop_delay_millis',
            )
        ),
    ] = timedelta(milliseconds=50)
    """The maximum event loop delay. If the event loop delay exceeds this value, it is considered overloaded.
    This option is used by the `Snapshotter`."""

    max_client_errors: Annotated[
        int,
        Field(
            validation_alias=AliasChoices(
                'apify_max_client_errors',
                'crawlee_max_client_errors',
            )
        ),
    ] = 1
    """The maximum number of client errors (HTTP 429) allowed before the system is considered overloaded.
    This option is used by the `Snapshotter`."""

    # The only field with three aliases: `actor_`, `apify_` and `crawlee_` prefixed names are all accepted.
    memory_mbytes: Annotated[
        int | None,
        Field(
            validation_alias=AliasChoices(
                'actor_memory_mbytes',
                'apify_memory_mbytes',
                'crawlee_memory_mbytes',
            )
        ),
    ] = None
    """The maximum used memory in megabytes. This option is utilized by the `Snapshotter`."""

    # Validated to lie in the half-open interval (0.0, 1.0]: `gt=0.0` excludes zero, `le=1.0` allows one.
    available_memory_ratio: Annotated[
        float,
        Field(
            validation_alias=AliasChoices(
                'apify_available_memory_ratio',
                'crawlee_available_memory_ratio',
            ),
            gt=0.0,
            le=1.0,
        ),
    ] = 0.25
    """The maximum proportion of system memory to use. If `memory_mbytes` is not provided, this ratio is used to
    calculate the maximum memory. This option is utilized by the `Snapshotter` and supports the dynamic system memory
    scaling."""

    # Note the asymmetric alias names: `apify_local_storage_dir` versus `crawlee_storage_dir`.
    storage_dir: Annotated[
        str,
        Field(
            validation_alias=AliasChoices(
                'apify_local_storage_dir',
                'crawlee_storage_dir',
            ),
        ),
    ] = './storage'
    """The path to the storage directory. This option is utilized by the storage clients."""

    headless: Annotated[
        bool,
        Field(
            validation_alias=AliasChoices(
                'apify_headless',
                'crawlee_headless',
            )
        ),
    ] = True
    """Whether to run the browser in headless mode. Currently primarily for Playwright-based features. This option
    is passed directly to Playwright's `browser_type.launch` method as `headless`. For more details,
    refer to the Playwright documentation:
    https://playwright.dev/docs/api/class-browsertype#browser-type-launch.
    """

    @classmethod
    def get_global_configuration(cls) -> Self:
        """Retrieve the global instance of the configuration.

        Mostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`
        instead.

        Returns:
            The configuration object held by the service locator.

        Raises:
            TypeError: If the configuration held by the service locator is not an instance of the class this
                method was called on.
        """
        # Import here to avoid circular imports.
        from crawlee import service_locator  # noqa: PLC0415

        config = service_locator.get_configuration()

        # Calling this on a subclass must not silently hand back an instance of a different class.
        if not isinstance(config, cls):
            raise TypeError(f'Requested global configuration object of type {cls}, but {config.__class__} was found')

        return config


================================================
FILE: src/crawlee/crawlers/__init__.py
================================================
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

_install_import_hook(__name__)

# The following imports use try_import to handle optional dependencies, as they may not always be available.

with _try_import(__name__, 'BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType'):
    from ._beautifulsoup import BeautifulSoupCrawler, BeautifulSoupCrawlingContext, BeautifulSoupParserType

with _try_import(__name__, 'ParselCrawler', 'ParselCrawlingContext'):
    from ._parsel import ParselCrawler, ParselCrawlingContext

with _try_import(
    __name__,
    'PlaywrightCrawler',
    'PlaywrightCrawlingContext',
    'PlaywrightPostNavCrawlingContext',
    'PlaywrightPreNavCrawlingContext',
):
    from ._playwright import (
        PlaywrightCrawler,
        PlaywrightCrawlingContext,
        PlaywrightPostNavCrawlingContext,
        PlaywrightPreNavCrawlingContext,
    )

with _try_import(
    __name__,
    'AdaptivePlaywrightCrawler',
    'AdaptivePlaywrightCrawlingContext',
    'AdaptivePlaywrightPostNavCrawlingContext',
    'AdaptivePlaywrightPreNavCrawlingContext',
    'AdaptivePlaywrightCrawlerStatisticState',
    'RenderingType',
    'RenderingTypePrediction',
    'RenderingTypePredictor',
):
    from ._adaptive_playwright import (
        AdaptivePlaywrightCrawler,
        AdaptivePlaywrightCrawlerStatisticState,
        AdaptivePlaywrightCrawlingContext,
        AdaptivePlaywrightPostNavCrawlingContext,
        AdaptivePlaywrightPreNavCrawlingContext,
        RenderingType,
        RenderingTypePrediction,
        RenderingTypePredictor,
    )


__all__ = [
    'AbstractHttpCrawler',
    'AbstractHttpParser',
    'AdaptivePlaywrightCrawler',
    'AdaptivePlaywrightCrawlerStatisticState',
    'AdaptivePlaywrightCrawlingContext',
    'AdaptivePlaywrightPostNavCrawlingContext',
    'AdaptivePlaywrightPreNavCrawlingContext',
    'BasicCrawler',
    'BasicCrawlerOptions',
    'BasicCrawlingContext',
    'BeautifulSoupCrawler',
    'BeautifulSoupCrawlingContext',
    'BeautifulSoupParserType',
    'ContextPipeline',
    'HttpCrawler',
    'HttpCrawlerOptions',
    'HttpCrawlingContext',
    'HttpCrawlingResult',
    'ParsedHttpCrawlingContext',
    'ParselCrawler',
    'ParselCrawlingContext',
    'PlaywrightCrawler',
    'PlaywrightCrawlingContext',
    'PlaywrightPostNavCrawlingContext',
    'PlaywrightPreNavCrawlingContext',
    'RenderingType',
    'RenderingTypePrediction',
    'RenderingTypePredictor',
]


================================================
FILE: src/crawlee/crawlers/_abstract_http/__init__.py
================================================
from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
from ._abstract_http_parser import AbstractHttpParser
from ._http_crawling_context import ParsedHttpCrawlingContext

__all__ = [
    'AbstractHttpCrawler',
    'AbstractHttpParser',
    'HttpCrawlerOptions',
    'ParsedHttpCrawlingContext',
]


================================================
FILE: src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
================================================
from __future__ import annotations

import asyncio
import logging
from abc import ABC
from datetime import timedelta
from typing import TYPE_CHECKING, Any, Generic

from more_itertools import partition
from pydantic import ValidationError
from typing_extensions import NotRequired, TypeVar

from crawlee._request import Request, RequestOptions, RequestState
from crawlee._utils.docs import docs_group
from crawlee._utils.time import SharedTimeout
from crawlee._utils.urls import to_absolute_url_iterator
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.errors import SessionError
from crawlee.statistics import StatisticsState

from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator

    from typing_extensions import Unpack

    from crawlee import RequestTransformAction
    from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction

    from ._abstract_http_parser import AbstractHttpParser

TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


class HttpCrawlerOptions(
    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
    Generic[TCrawlingContext, TStatisticsState],
):
    """Arguments for the `AbstractHttpCrawler` constructor.

    Extends `BasicCrawlerOptions` with the HTTP-specific `navigation_timeout` option. It is intended for typing
    forwarded `__init__` arguments in the subclasses.
    """

    # Optional key. When it is omitted or `None`, `AbstractHttpCrawler.__init__` falls back to one minute.
    # NOTE(review): the crawler wraps both the pre-navigation hooks and the HTTP request in the same
    # `SharedTimeout`, so this budget appears to cover the hooks as well - confirm against `SharedTimeout`.
    navigation_timeout: NotRequired[timedelta | None]
    """Timeout for the HTTP request."""


@docs_group('Crawlers')
class AbstractHttpCrawler(
    BasicCrawler[TCrawlingContext, StatisticsState],
    ABC,
    Generic[TCrawlingContext, TParseResult, TSelectResult],
):
    """A web crawler for performing HTTP requests.

    The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,
    it implements HTTP communication using HTTP clients. The class allows integration with any HTTP client
    that implements the `HttpClient` interface, provided as an input parameter to the constructor.

    `AbstractHttpCrawler` is a generic class intended to be used with a specific parser for parsing HTTP responses
    and the expected type of `TCrawlingContext` available to the user function. Examples of specific versions include
    `BeautifulSoupCrawler`, `ParselCrawler`, and `HttpCrawler`.

    HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. For websites that
    require client-side JavaScript execution, consider using a browser-based crawler like the `PlaywrightCrawler`.
    """

    def __init__(
        self,
        *,
        parser: AbstractHttpParser[TParseResult, TSelectResult],
        navigation_timeout: timedelta | None = None,
        **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
    ) -> None:
        """Set up the HTTP-specific crawler state and hand the remaining options over to `BasicCrawler`.

        Args:
            parser: Parser used to process the HTTP responses.
            navigation_timeout: Navigation time budget. Falls back to one minute when not provided.
            kwargs: Additional keyword arguments forwarded to the `BasicCrawler` constructor. They have to
                contain a `_context_pipeline` entry.

        Raises:
            ValueError: If `_context_pipeline` is missing from `kwargs`.
        """
        fallback_navigation_timeout = timedelta(minutes=1)

        self._parser = parser
        self._navigation_timeout = navigation_timeout if navigation_timeout else fallback_navigation_timeout
        self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
        self._post_navigation_hooks: list[Callable[[HttpCrawlingContext], Awaitable[None]]] = []
        # One shared timeout per in-flight crawling context, keyed by `id(context)`.
        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

        pipeline_provided = '_context_pipeline' in kwargs
        if not pipeline_provided:
            raise ValueError(
                'Please pass in a `_context_pipeline`. You should use the '
                'AbstractHttpCrawler._create_static_content_crawler_pipeline() method to initialize it.'
            )

        # A logger named after the concrete class is used unless the caller supplied one.
        class_logger = logging.getLogger(self.__class__.__name__)
        kwargs.setdefault('_logger', class_logger)
        super().__init__(**kwargs)

    @classmethod
    def create_parsed_http_crawler_class(
        cls,
        static_parser: AbstractHttpParser[TParseResult, TSelectResult],
    ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]:
        """Build an `AbstractHttpCrawler` subclass that is bound to the given parser.

        Convenience factory for the common case in which `TParseResult` serves for both generic parameters,
        even though `AbstractHttpCrawler` itself allows them to be independent. The returned class uses
        `static_parser` as the default parser and installs the static content pipeline on its own.

        Args:
            static_parser: Parser used by default by instances of the created class.

        Returns:
            The newly created crawler class.
        """

        class _ParsedHttpCrawler(AbstractHttpCrawler):
            def __init__(
                self,
                parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,  # ty: ignore[invalid-parameter-default]
                **kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
            ) -> None:
                # The pipeline is always (re)created here, replacing any value passed by the caller.
                static_pipeline = self._create_static_content_crawler_pipeline()
                kwargs['_context_pipeline'] = static_pipeline
                super().__init__(parser=parser, **kwargs)

        return _ParsedHttpCrawler

    def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]:
        """Assemble the context pipeline used for crawling static content.

        The steps are composed in this order: pre-navigation hooks, HTTP request, post-navigation hooks,
        status code validation, response parsing and content-based blocking detection.
        """
        empty_pipeline = ContextPipeline()
        after_pre_hooks = empty_pipeline.compose(self._execute_pre_navigation_hooks)
        after_request = after_pre_hooks.compose(self._make_http_request)
        after_post_hooks = after_request.compose(self._execute_post_navigation_hooks)
        after_status_check = after_post_hooks.compose(self._handle_status_code_response)
        after_parsing = after_status_check.compose(self._parse_http_response)
        return after_parsing.compose(self._handle_blocked_request_by_content)

    async def _execute_pre_navigation_hooks(
        self, context: BasicCrawlingContext
    ) -> AsyncGenerator[BasicCrawlingContext, None]:
        """Run the registered pre-navigation hooks and yield the context unchanged.

        A `SharedTimeout` built from the navigation timeout is registered under `id(context)` so that
        `_make_http_request` can pick it up later. The registration is removed once the pipeline step finishes,
        whether it succeeded or not.

        Args:
            context: The current crawling context.

        Yields:
            The same crawling context.
        """
        timeout_key = id(context)
        shared_timeout = SharedTimeout(self._navigation_timeout)
        self._shared_navigation_timeouts[timeout_key] = shared_timeout

        try:
            for pre_hook in self._pre_navigation_hooks:
                # Each hook runs inside the shared timeout.
                async with shared_timeout:
                    await pre_hook(context)

            yield context
        finally:
            self._shared_navigation_timeouts.pop(timeout_key, None)

    async def _execute_post_navigation_hooks(
        self, context: HttpCrawlingContext
    ) -> AsyncGenerator[HttpCrawlingContext, None]:
        """Await every registered post-navigation hook in registration order, then yield the context.

        Args:
            context: The current crawling context, already carrying the HTTP response.

        Yields:
            The same crawling context.
        """
        for post_hook in self._post_navigation_hooks:
            await post_hook(context)

        yield context

    async def _parse_http_response(
        self, context: HttpCrawlingContext
    ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:
        """Run the parser over the HTTP response and yield a context carrying the result.

        Args:
            context: Crawling context that already holds the HTTP response.

        Yields:
            A `ParsedHttpCrawlingContext` built from `context`, extended with the parsed content and the
            `extract_links` and `enqueue_links` helpers.
        """
        parse_result = await self._parser.parse(context.http_response)
        link_extractor = self._create_extract_links_function(context, parse_result)
        link_enqueuer = self._create_enqueue_links_function(context, link_extractor)

        yield ParsedHttpCrawlingContext.from_http_crawling_context(
            context=context,
            parsed_content=parse_result,
            enqueue_links=link_enqueuer,
            extract_links=link_extractor,
        )

    def _create_extract_links_function(
        self, context: HttpCrawlingContext, parsed_content: TParseResult
    ) -> ExtractLinksFunction:
        """Create a callback function for extracting links from parsed content.

        Args:
            context: The current crawling context.
            parsed_content: The parsed http response.

        Returns:
            Awaitable that is used for extracting links from parsed content.
        """
        # Standard library only; imported locally so the helper stays self-contained.
        from urllib.parse import urljoin  # noqa: PLC0415

        async def extract_links(
            *,
            selector: str = 'a',
            attribute: str = 'href',
            label: str | None = None,
            user_data: dict[str, Any] | None = None,
            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
            | None = None,
            **kwargs: Unpack[EnqueueLinksKwargs],
        ) -> list[Request]:
            """Extract links from the parsed content and turn them into `Request` objects.

            Args:
                selector: Selector of the elements that carry the links.
                attribute: Element attribute the link is read from.
                label: Label assigned to every created request.
                user_data: User data copied (shallowly) into every created request.
                transform_request_function: Optional callback that may alter the request options, or return
                    `'skip'` to drop the link or `'unchanged'` to keep the options as they are.
                kwargs: Filtering options; `strategy` defaults to `'same-hostname'`.

            Returns:
                Requests created from the links that passed all the filters.
            """
            requests = list[Request]()

            base_user_data = user_data or {}

            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

            # `setdefault` stores the default for the downstream filter and returns the effective value.
            strategy = kwargs.setdefault('strategy', 'same-hostname')

            links_iterator: Iterator[str] = iter(
                self._parser.find_links(parsed_content, selector=selector, attribute=attribute)
            )

            # URL of the document itself. It is the fallback base URL and also the reference against which
            # a `<base href>` value is resolved.
            document_url = context.request.loaded_url or context.request.url

            # Get base URL from <base> tag if present. The `href` of `<base>` may itself be relative,
            # protocol-relative or empty, so it is resolved against the document URL; an absolute `href`
            # wins over the document URL as before.
            extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]', 'href'))
            base_url: str = (
                urljoin(document_url, str(extracted_base_urls[0])) if extracted_base_urls else document_url
            )
            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

            if robots_txt_file:
                # `partition` returns the items failing the predicate first: (disallowed, allowed).
                skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
            else:
                skipped = iter([])

            for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
                request_options = RequestOptions(
                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
                )

                if transform_request_function:
                    transform_request_options = transform_request_function(request_options)
                    if transform_request_options == 'skip':
                        continue
                    if transform_request_options != 'unchanged':
                        request_options = transform_request_options

                try:
                    request = Request.from_url(**request_options)
                except ValidationError as exc:
                    context.log.debug(
                        f'Skipping URL "{url}" due to invalid format: {exc}. '
                        'This may be caused by a malformed URL or unsupported URL scheme. '
                        'Please ensure the URL is correct and retry.'
                    )
                    continue

                requests.append(request)

            # `skipped` is lazy and shares its source with `links_iterator`, so it is consumed only after
            # the loop above. It yields the URLs disallowed by robots.txt.
            skipped_tasks = [
                asyncio.create_task(self._handle_skipped_request(skipped_url, 'robots_txt'))
                for skipped_url in skipped
            ]
            await asyncio.gather(*skipped_tasks)

            return requests

        return extract_links

    async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
        """Perform the HTTP request and yield a context that carries the response.

        The request runs inside the shared timeout registered for this context by the pre-navigation step, and
        the value that timeout provides is passed on to the HTTP client. Afterwards the request state is set to
        `RequestState.AFTER_NAV`.

        Args:
            context: The current crawling context.

        Yields:
            An `HttpCrawlingContext` built from `context` and the received HTTP response.
        """
        shared_timeout = self._shared_navigation_timeouts[id(context)]

        async with shared_timeout as remaining_timeout:
            crawling_result = await self._http_client.crawl(
                request=context.request,
                session=context.session,
                proxy_info=context.proxy_info,
                statistics=self._statistics,
                timeout=remaining_timeout,
            )

        context.request.state = RequestState.AFTER_NAV

        response_context = HttpCrawlingContext.from_basic_crawling_context(
            context=context, http_response=crawling_result.http_response
        )
        yield response_context

    async def _handle_status_code_response(
        self, context: HttpCrawlingContext
    ) -> AsyncGenerator[HttpCrawlingContext, None]:
        """Check the HTTP status code of the response and raise if it signals a problem.

        The session-blocked check is performed only when retrying on blocked requests is enabled; the generic
        error status check is performed always.

        Args:
            context: The current crawling context containing the HTTP response.

        Raises:
            SessionError: If the status code indicates the session is blocked.
            HttpStatusCodeError: If the status code represents a server error or is explicitly configured as an error.
            HttpClientStatusCodeError: If the status code represents a client error.

        Yields:
            The unchanged crawling context when no check raised.
        """
        response_status = context.http_response.status_code

        if self._retry_on_blocked:
            self._raise_for_session_blocked_status_code(context.session, response_status)

        self._raise_for_error_status_code(response_status)

        yield context

    async def _handle_blocked_request_by_content(
        self, context: ParsedHttpCrawlingContext[TParseResult]
    ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:
        """Inspect the parsed content for signs of blocking.

        The parser is consulted only when retrying on blocked requests is enabled.

        Args:
            context: The current crawling context.

        Raises:
            SessionError: If the parser reports the content as blocked; the reported reason becomes the message.

        Yields:
            The unchanged crawling context when no blocking was detected.
        """
        if self._retry_on_blocked:
            blocked_info = self._parser.is_blocked(context.parsed_content)
            if blocked_info:
                raise SessionError(blocked_info.reason)

        yield context

    def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], Awaitable[None]]) -> None:
        """Add a hook to the list of hooks executed ahead of every navigation.

        Hooks are executed in the order in which they were registered.

        Args:
            hook: Coroutine function invoked with the crawling context before each navigation.
        """
        registered_hooks = self._pre_navigation_hooks
        registered_hooks.append(hook)

    def post_navigation_hook(self, hook: Callable[[HttpCrawlingContext], Awaitable[None]]) -> None:
        """Add a hook to the list of hooks executed once every navigation has finished.

        Hooks are executed in the order in which they were registered.

        Args:
            hook: Coroutine function invoked with the HTTP crawling context after each navigation.
        """
        registered_hooks = self._post_navigation_hooks
        registered_hooks.append(hook)


================================================
FILE: src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Generic

from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.docs import docs_group
from crawlee.crawlers._types import BlockedInfo

from ._http_crawling_context import TParseResult, TSelectResult

if TYPE_CHECKING:
    from collections.abc import Iterable, Sequence

    from crawlee.http_clients import HttpResponse


@docs_group('HTTP parsers')
class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
    """Interface of a parser that processes HTTP responses, selects elements, finds links and detects blocking."""

    @abstractmethod
    async def parse(self, response: HttpResponse) -> TParseResult:
        """Turn an HTTP response into its parsed representation.

        Args:
            response: The HTTP response to parse.

        Returns:
            The parsed form of the response.
        """

    @abstractmethod
    async def parse_text(self, text: str) -> TParseResult:
        """Turn a string with html into its parsed representation.

        Args:
            text: The html source as a string.

        Returns:
            The parsed form of the text.
        """

    @abstractmethod
    async def select(self, parsed_content: TParseResult, selector: str) -> Sequence[TSelectResult]:
        """Locate page elements with a css selector.

        Args:
            parsed_content: Parsed content in which the elements are searched for.
            selector: Css selector identifying the wanted html elements.

        Returns:
            Sequence of the selected elements.
        """

    def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo:
        """Decide whether the parsed content looks like a blocked response.

        The default implementation tests every selector from `RETRY_CSS_SELECTORS` through the abstract
        `is_matching_selector` method. Parsers with a different way of detecting blockage should override it.

        Args:
            parsed_content: Parsed HTTP response. Result of `parse` method.

        Returns:
            `BlockedInfo` whose reason is a non-empty description when blockage was detected, and an empty string
            otherwise.
        """
        # Nothing was parsed, so there is nothing to match against.
        if parsed_content is None:
            return BlockedInfo(reason='')

        matched_selectors = [
            candidate for candidate in RETRY_CSS_SELECTORS if self.is_matching_selector(parsed_content, candidate)
        ]

        if not matched_selectors:
            return BlockedInfo(reason='')

        return BlockedInfo(
            reason=(
                f'Assuming the session is blocked - HTTP response matched the following selectors: '
                f'{"; ".join(matched_selectors)}'
            )
        )

    @abstractmethod
    def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> bool:
        """Tell whether the selector matches anything in the parsed content.

        Args:
            parsed_content: Parsed HTTP response. Result of `parse` method.
            selector: String defining the pattern to match.

        Returns:
            True when the parsed content contains a match for the selector.
        """

    @abstractmethod
    def find_links(self, parsed_content: TParseResult, selector: str, attribute: str) -> Iterable[str]:
        """Collect the links found in the parsed content.

        Args:
            parsed_content: Parsed HTTP response. Result of `parse` method.
            selector: String defining the pattern of the nodes that carry links.
            attribute: Name of the node attribute from which the link is read.

        Returns:
            Iterable with the links that were found, as strings.
        """


================================================
FILE: src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
================================================
from __future__ import annotations

from dataclasses import dataclass, fields
from typing import Generic

from typing_extensions import Self, TypeVar

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction, PageSnapshot
from crawlee._utils.docs import docs_group
from crawlee.http_clients import HttpCrawlingResult, HttpResponse

TParseResult = TypeVar('TParseResult')
TSelectResult = TypeVar('TSelectResult')


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):
    """Crawling context of the `AbstractHttpCrawler`, combining the basic context with the HTTP crawling result."""

    @classmethod
    def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_response: HttpResponse) -> Self:
        """Build a new instance by copying every field of `context` and attaching `http_response`."""
        copied_fields = {}
        for spec in fields(context):
            copied_fields[spec.name] = getattr(context, spec.name)

        return cls(http_response=http_response, **copied_fields)

    async def get_snapshot(self) -> PageSnapshot:
        """Read the response body and return it as a page snapshot with the UTF-8 decoded html."""
        raw_body = await self.http_response.read()
        return PageSnapshot(html=raw_body.decode('utf-8'))


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResult]):
    """Crawling context handed to the request handlers of `AbstractHttpCrawler` based crawlers.

    On top of the HTTP crawling context it exposes the parsed content and the helpers for working with links.
    """

    parsed_content: TParseResult
    enqueue_links: EnqueueLinksFunction
    extract_links: ExtractLinksFunction

    @classmethod
    def from_http_crawling_context(
        cls,
        context: HttpCrawlingContext,
        parsed_content: TParseResult,
        enqueue_links: EnqueueLinksFunction,
        extract_links: ExtractLinksFunction,
    ) -> Self:
        """Build a new instance by copying every field of `context` and adding the parsing related members."""
        copied_fields = {}
        for spec in fields(context):
            copied_fields[spec.name] = getattr(context, spec.name)

        return cls(
            parsed_content=parsed_content,
            enqueue_links=enqueue_links,
            extract_links=extract_links,
            **copied_fields,
        )


================================================
FILE: src/crawlee/crawlers/_abstract_http/py.typed
================================================


================================================
FILE: src/crawlee/crawlers/_adaptive_playwright/__init__.py
================================================
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

# These imports have only mandatory dependencies, so they are imported directly.
from ._adaptive_playwright_crawling_context import (
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPostNavCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)

_install_import_hook(__name__)

# The following imports are wrapped in try_import to handle optional dependencies,
# ensuring the module can still function even if these dependencies are missing.
with _try_import(__name__, 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor'):
    from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
with _try_import(__name__, 'AdaptivePlaywrightCrawler'):
    from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
with _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'):
    from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState

__all__ = [
    'AdaptivePlaywrightCrawler',
    'AdaptivePlaywrightCrawlerStatisticState',
    'AdaptivePlaywrightCrawlingContext',
    'AdaptivePlaywrightPostNavCrawlingContext',
    'AdaptivePlaywrightPreNavCrawlingContext',
    'RenderingType',
    'RenderingTypePrediction',
    'RenderingTypePredictor',
]


================================================
FILE: src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
================================================
from __future__ import annotations

import logging
from collections.abc import Awaitable, Callable, Coroutine
from copy import deepcopy
from dataclasses import dataclass
from logging import getLogger
from random import random
from typing import TYPE_CHECKING, Any, Generic, get_args

from bs4 import BeautifulSoup, Tag
from parsel import Selector
from typing_extensions import Self, TypeVar, override

from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
from crawlee._utils.docs import docs_group
from crawlee._utils.wait import wait_for
from crawlee.crawlers import (
    AbstractHttpCrawler,
    AbstractHttpParser,
    BasicCrawler,
    BeautifulSoupParserType,
    HttpCrawlingContext,
    ParsedHttpCrawlingContext,
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
    PlaywrightPostNavCrawlingContext,
    PlaywrightPreNavCrawlingContext,
)
from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
from crawlee.crawlers._parsel._parsel_parser import ParselParser
from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
from crawlee.statistics import Statistics, StatisticsState

from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
from ._adaptive_playwright_crawling_context import (
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPostNavCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)
from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
from ._result_comparator import create_default_comparator

if TYPE_CHECKING:
    from types import TracebackType

    from typing_extensions import Unpack

    from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions


TStaticParseResult = TypeVar('TStaticParseResult')
TStaticSelectResult = TypeVar('TStaticSelectResult')
TStaticCrawlingContext = TypeVar('TStaticCrawlingContext', bound=ParsedHttpCrawlingContext)


class _NonPersistentStatistics(Statistics):
    """Statistics compliant object that is not supposed to do anything when entering/exiting context.

    To be used in sub crawlers.
    """

    def __init__(self) -> None:
        """Create the statistics with the plain `StatisticsState` as its state model."""
        super().__init__(state_model=StatisticsState)

    async def __aenter__(self) -> Self:
        """Mark the statistics as active and initialize its state.

        The parent's `__aenter__` is intentionally not called, so whatever else it would set up is skipped.
        """
        self._active = True
        await self._state.initialize()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Mark the statistics as inactive; the parent's `__aexit__` is not called and exceptions propagate."""
        self._active = False


@docs_group('Crawlers')
class AdaptivePlaywrightCrawler(
    BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],
    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
):
    """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.

    It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects
    that it may bring a performance benefit.
    It uses specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`.

    ### Usage
    ```python
    from datetime import timedelta

    from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
        playwright_crawler_specific_kwargs={'browser_type': 'chromium'},
    )

    @crawler.router.default_handler
    async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:
        # Do some processing using `parsed_content`
        context.log.info(context.parsed_content.title)

        # Locate element h2 within 5 seconds
        h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))
        # Do stuff with element found by the selector
        context.log.info(h2)

        # Find more links and enqueue them.
        await context.enqueue_links()
        # Save some data.
        await context.push_data({'Visited url': context.request.url})

    await crawler.run(['https://crawlee.dev/'])
    ```
    """

    def __init__(
        self,
        *,
        static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult],
        rendering_type_predictor: RenderingTypePredictor | None = None,
        result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
        result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
        playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
        statistics: Statistics[AdaptivePlaywrightCrawlerStatisticState] | None = None,
        **kwargs: Unpack[_BasicCrawlerOptions],
    ) -> None:
        """Initialize a new instance. Recommended way to create instance is to call factory methods.

        Recommended factory methods: `with_beautifulsoup_static_parser`, `with_parsel_static_parser`.

        Two sub crawlers (a static one built around `static_parser` and a `PlaywrightCrawler`) are created here, but
        only their context pipelines are kept; requests are dispatched to those pipelines by this crawler.

        Args:
            static_parser: Implementation of `AbstractHttpParser`. Parser that will be used for static crawling.
            rendering_type_predictor: Object that implements RenderingTypePredictor and is capable of predicting which
                rendering method should be used. If None, then `DefaultRenderingTypePredictor` is used.
            result_checker: Function that evaluates whether crawling result is valid or not. If None, every result
                is considered valid.
            result_comparator: Function that compares two crawling results and decides whether they are equivalent.
                If None, the comparator returned by `create_default_comparator(result_checker)` is used.
            playwright_crawler_specific_kwargs: `PlaywrightCrawler` only kwargs that are passed to the sub crawler.
            statistics: A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of
                non-default configuration.
            kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. The same keyword arguments
                are also forwarded to both sub crawlers. When `concurrency_settings` is missing or None,
                `ConcurrencySettings(desired_concurrency=1)` is used.
        """
        # Adaptive crawling related.
        self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
        self.result_checker = result_checker or (lambda _: True)
        # The raw `result_checker` argument (possibly None) is passed on purpose here, not the defaulted
        # `self.result_checker`; `create_default_comparator` receives the information whether a checker was supplied.
        self.result_comparator = result_comparator or create_default_comparator(result_checker)

        # Set default concurrency settings for browser crawlers if not provided
        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

        adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)

        # Must run before `self._additional_context_managers` is extended below.
        super().__init__(statistics=adaptive_statistics, **kwargs)

        # Sub crawlers related.
        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()

        # Each sub crawler uses its own named logger limited to the ERROR level. `**kwargs` is unpacked after the
        # `_logger` entry, so a `_logger` supplied by the caller takes precedence over the sub crawler logger.
        static_logger = getLogger('Subcrawler_static')
        static_logger.setLevel(logging.ERROR)
        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

        pw_logger = getLogger('Subcrawler_playwright')
        pw_logger.setLevel(logging.ERROR)
        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

        # Initialize sub crawlers to create their pipelines.
        static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)

        # Both sub crawlers get their own `_NonPersistentStatistics`; the adaptive statistics stay on this crawler.
        static_crawler = static_crawler_class(
            parser=static_parser,
            statistics=_NonPersistentStatistics(),
            **basic_crawler_kwargs_for_static_crawler,
        )
        playwright_crawler = PlaywrightCrawler(
            statistics=_NonPersistentStatistics(),
            **playwright_crawler_specific_kwargs,
            **basic_crawler_kwargs_for_pw_crawler,
        )

        # Register pre navigation hooks on sub crawlers
        # Hooks in the first list run for both sub crawlers, hooks in the `_pw_only` list only for Playwright.
        self._pre_navigation_hooks = list[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]]()
        self._pre_navigation_hooks_pw_only = list[
            Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]
        ]()

        # The wrappers read the hook lists at call time, so hooks registered after construction are picked up.
        async def adaptive_pre_navigation_hook_static(context: BasicCrawlingContext) -> None:
            for hook in self._pre_navigation_hooks:
                await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context))

        async def adaptive_pre_navigation_hook_pw(context: PlaywrightPreNavCrawlingContext) -> None:
            for hook in self._pre_navigation_hooks + self._pre_navigation_hooks_pw_only:
                await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context))

        static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_static)
        playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_pw)

        # Register post navigation hooks on sub crawlers
        self._post_navigation_hooks = list[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]]()
        self._post_navigation_hooks_pw_only = list[
            Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]
        ]()

        # Unlike the pre navigation wrappers, the adaptive context is built once and shared by all hooks of a call.
        async def adaptive_post_navigation_hook_static(context: HttpCrawlingContext) -> None:
            adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)
            for hook in self._post_navigation_hooks:
                await hook(adaptive_context)

        async def adaptive_post_navigation_hook_pw(context: PlaywrightPostNavCrawlingContext) -> None:
            adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)
            for hook in self._post_navigation_hooks + self._post_navigation_hooks_pw_only:
                await hook(adaptive_context)

        static_crawler.post_navigation_hook(adaptive_post_navigation_hook_static)
        playwright_crawler.post_navigation_hook(adaptive_post_navigation_hook_pw)

        # Context managers owned by the sub crawlers are appended to the list already held by this crawler.
        self._additional_context_managers = [
            *self._additional_context_managers,
            self.rendering_type_predictor,
            static_crawler.statistics,
            playwright_crawler.statistics,
            playwright_crawler._browser_pool,  # noqa: SLF001 # Intentional access to private member.
        ]

        # Sub crawler pipeline related
        # Only the pipelines and the parser are stored; the sub crawler objects themselves are not kept as attributes.
        self._pw_context_pipeline = playwright_crawler._context_pipeline  # noqa:SLF001  # Intentional access to private member.
        self._static_context_pipeline = static_crawler._context_pipeline  # noqa:SLF001  # Intentional access to private member.
        self._static_parser = static_parser

    @classmethod
    def with_beautifulsoup_static_parser(
        cls,
        rendering_type_predictor: RenderingTypePredictor | None = None,
        result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
        result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
        parser_type: BeautifulSoupParserType = 'lxml',
        playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
        statistics: Statistics[StatisticsState] | None = None,
        **kwargs: Unpack[_BasicCrawlerOptions],
    ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag]:
        """Build an `AdaptivePlaywrightCrawler` whose static sub crawler parses pages with `BeautifulSoup`.

        Args:
            rendering_type_predictor: Forwarded to the constructor unchanged.
            result_checker: Forwarded to the constructor unchanged.
            result_comparator: Forwarded to the constructor unchanged.
            parser_type: Parser backend handed to `BeautifulSoupParser`.
            playwright_crawler_specific_kwargs: Forwarded to the constructor unchanged.
            statistics: Optional statistics object. When given, its state model is swapped for
                `AdaptivePlaywrightCrawlerStatisticState` through `replace_state_model`; otherwise fresh statistics
                with that state model are created.
            kwargs: Additional keyword arguments forwarded to the constructor.
        """
        # The adaptive crawler always works with the adaptive state model, whatever statistics were passed in.
        adaptive_statistics = (
            Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
            if statistics is None
            else statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState)
        )
        bs_parser = BeautifulSoupParser(parser=parser_type)
        return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag](
            static_parser=bs_parser,
            statistics=adaptive_statistics,
            rendering_type_predictor=rendering_type_predictor,
            result_checker=result_checker,
            result_comparator=result_comparator,
            playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs,
            **kwargs,
        )

    @classmethod
    def with_parsel_static_parser(
        cls,
        rendering_type_predictor: RenderingTypePredictor | None = None,
        result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
        result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
        playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
        statistics: Statistics[StatisticsState] | None = None,
        **kwargs: Unpack[_BasicCrawlerOptions],
    ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector]:
        """Build an `AdaptivePlaywrightCrawler` whose static sub crawler parses pages with `Parsel`.

        Args:
            rendering_type_predictor: Forwarded to the constructor unchanged.
            result_checker: Forwarded to the constructor unchanged.
            result_comparator: Forwarded to the constructor unchanged.
            playwright_crawler_specific_kwargs: Forwarded to the constructor unchanged.
            statistics: Optional statistics object. When given, its state model is swapped for
                `AdaptivePlaywrightCrawlerStatisticState` through `replace_state_model`; otherwise fresh statistics
                with that state model are created.
            kwargs: Additional keyword arguments forwarded to the constructor.
        """
        # The adaptive crawler always works with the adaptive state model, whatever statistics were passed in.
        adaptive_statistics = (
            Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
            if statistics is None
            else statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState)
        )
        parsel_parser = ParselParser()
        return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector](
            static_parser=parsel_parser,
            statistics=adaptive_statistics,
            rendering_type_predictor=rendering_type_predictor,
            result_checker=result_checker,
            result_comparator=result_comparator,
            playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs,
            **kwargs,
        )

    async def _crawl_one(
        self,
        rendering_type: RenderingType,
        context: BasicCrawlingContext,
        state: dict[str, JsonSerializable] | None = None,
    ) -> SubCrawlerRun:
        """Perform a one request crawl with specific context pipeline and return `SubCrawlerRun`.

        `SubCrawlerRun` contains either result of the crawl or the exception that was thrown during the crawl.
        Sub crawler pipeline call is dynamically created based on the `rendering_type`.
        New copy-like context is created from passed `context` and `state` and is passed to sub crawler pipeline.

        Args:
            rendering_type: Selects the sub crawler pipeline that processes the request.
            context: Context of the adaptive crawler. Its request, session, proxy info, `send_request` and logger
                are reused; result-producing helpers are replaced by the ones of a fresh `RequestHandlerRunResult`.
            state: If given, `use_state` of the sub crawler context returns this object regardless of its
                arguments. If None, `use_state` of the passed `context` is used.

        Returns:
            `SubCrawlerRun` holding the result on success, or the raised exception otherwise. Exceptions derived
            from `Exception` are never propagated by this method.
        """
        if state is not None:

            async def get_input_state(
                default_value: dict[str, JsonSerializable] | None = None,  # noqa:ARG001  # Intentionally unused arguments. Closure, that generates same output regardless of inputs.
            ) -> dict[str, JsonSerializable]:
                return state

            use_state_function = get_input_state
        else:
            use_state_function = context.use_state

        # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
        result = RequestHandlerRunResult(
            key_value_store_getter=self.get_key_value_store,
            request=context.request,
        )
        context_linked_to_result = BasicCrawlingContext(
            request=result.request,
            session=context.session,
            proxy_info=context.proxy_info,
            send_request=context.send_request,
            add_requests=result.add_requests,
            push_data=result.push_data,
            get_key_value_store=result.get_key_value_store,
            use_state=use_state_function,
            log=context.log,
        )

        try:
            # The lambda defers creation of the pipeline coroutine to `wait_for`.
            await wait_for(
                lambda: self._pipeline_call_factory(
                    rendering_type=rendering_type, context_linked_to_result=context_linked_to_result
                ),
                timeout=self._request_handler_timeout,
                timeout_message=(
                    f'{rendering_type=!s} timed out after {self._request_handler_timeout.total_seconds()} seconds'
                ),
                logger=self._logger,
            )
            return SubCrawlerRun(result=result)
        except Exception as e:
            # Failures are reported to the caller through the returned object; the caller decides what to do.
            return SubCrawlerRun(exception=e)

    def _pipeline_call_factory(
        self, rendering_type: RenderingType, context_linked_to_result: BasicCrawlingContext
    ) -> Coroutine[Any, Any, None]:
        """Create sub crawler pipeline call."""
        if rendering_type == 'static':

            async def from_static_pipeline_to_top_router(
                context: ParsedHttpCrawlingContext[TStaticParseResult],
            ) -> None:
                adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context(
                    context=context, parser=self._static_parser
                )
                await self.router(adaptive_crawling_context)

            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

        if rendering_type == 'client only':

            async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> None:
                adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context(
                    context=context, parser=self._static_parser
                )
                await self.router(adaptive_crawling_context)

            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

        raise RuntimeError(
            f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'
        )

    @override
    async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
        """Override BasicCrawler method that delegates request processing to sub crawlers.

        To decide which sub crawler should process the request it runs `rendering_type_predictor`.
        To check if results are valid it uses `result_checker`.
        To compare results of both sub crawlers it uses `result_comparator`.

        Flow:
            1. With probability `detection_probability_recommendation` the request is used for rendering type
               detection. Otherwise the prediction is trusted and, if it is `'static'`, only the static sub crawler
               runs. Its result is accepted when it exists and passes `result_checker`.
            2. In every other case (detection, predicted `'client only'`, failed or suspicious static run) the
               browser sub crawler runs. Its exception, if any, is re-raised.
            3. During detection the static sub crawler runs as well, and the outcome of the comparison with the
               browser result is stored in the predictor. The browser result is the one that is kept.

        Reference implementation: https://github.com/apify/crawlee/blob/master/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts
        """
        rendering_type_prediction = self.rendering_type_predictor.predict(context.request)
        # Random sampling decides whether this request is used to (re)evaluate the rendering type.
        should_detect_rendering_type = random() < rendering_type_prediction.detection_probability_recommendation

        if not should_detect_rendering_type:
            self.log.debug(
                f'Predicted rendering type {rendering_type_prediction.rendering_type} for {context.request.url}'
            )
            if rendering_type_prediction.rendering_type == 'static':
                context.log.debug(f'Running static request for {context.request.url}')
                self.track_http_only_request_handler_runs()

                static_run = await self._crawl_one(rendering_type='static', context=context)
                if static_run.result and self.result_checker(static_run.result):
                    # Static result accepted; the browser sub crawler is not needed for this request.
                    self._context_result_map[context] = static_run.result
                    return
                if static_run.exception:
                    context.log.exception(
                        msg=f'Static crawler: failed for {context.request.url}', exc_info=static_run.exception
                    )
                else:
                    # Only a result rejected by `result_checker` counts as a misprediction, not a failed run.
                    context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}')
                    self.track_rendering_type_mispredictions()

        # Reached for detection runs, for predicted 'client only' and as a fallback after an unusable static run.
        context.log.debug(f'Running browser request handler for {context.request.url}')

        old_state_copy = None

        if should_detect_rendering_type:
            # Save copy of global state from `use_state` before it can be mutated by browser crawl.
            # This copy will be used in the static crawl to make sure they both run with same conditions and to
            # avoid static crawl to modify the state.
            # (This static crawl is performed only to evaluate rendering type detection.)
            kvs = await context.get_key_value_store()
            default_value = dict[str, JsonSerializable]()
            old_state: dict[str, JsonSerializable] = await kvs.get_value(self._CRAWLEE_STATE_KEY, default_value)
            old_state_copy = deepcopy(old_state)

        pw_run = await self._crawl_one('client only', context=context)
        # The browser run is counted even when it ended with an exception.
        self.track_browser_request_handler_runs()

        if pw_run.exception is not None:
            raise pw_run.exception

        if pw_run.result:
            if should_detect_rendering_type:
                detection_result: RenderingType
                static_run = await self._crawl_one('static', context=context, state=old_state_copy)
                # A failed static run (no result) is treated the same way as a differing result.
                if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                    detection_result = 'static'
                else:
                    detection_result = 'client only'

                context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                self.rendering_type_predictor.store_result(context.request, detection_result)

            # The browser result is always the one that is kept, regardless of the detection outcome.
            self._context_result_map[context] = pw_run.result

    def pre_navigation_hook(
        self,
        hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
        *,
        playwright_only: bool = False,
    ) -> Callable[[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]], None]:
        """Register a pre navigation hook; the hooks are executed by the sub crawlers.

        Works both as a plain decorator (`hook` is passed directly and registered right away) and as a parametrized
        decorator (`hook` is omitted and the returned function performs the registration).
        Hooks receive a context that raises `AdaptiveContextError` when the possibly missing `page` is accessed.

        Args:
            hook: The hook to register immediately, if any.
            playwright_only: When True the hook is stored among the hooks that run only in the Playwright sub crawler.

        Returns:
            The registering function. It accepts a hook, stores it and returns None.
        """

        def register_hooks(hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None:
            target_hooks = self._pre_navigation_hooks_pw_only if playwright_only else self._pre_navigation_hooks
            target_hooks.append(hook)

        if not hook:
            # Parametrized usage: registration happens later, through the returned function.
            return register_hooks

        # Plain usage: the hook is already known, register it now.
        register_hooks(hook)
        return register_hooks

    def post_navigation_hook(
        self,
        hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]] | None = None,
        *,
        playwright_only: bool = False,
    ) -> Callable[[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]], None]:
        """Register a post navigation hook; the hooks are executed by the sub crawlers.

        Works both as a plain decorator (`hook` is passed directly and registered right away) and as a parametrized
        decorator (`hook` is omitted and the returned function performs the registration).
        Hooks receive a context that raises `AdaptiveContextError` when the possibly missing `page` or `response`
        is accessed.

        Args:
            hook: The hook to register immediately, if any.
            playwright_only: When True the hook is stored among the hooks that run only in the Playwright sub crawler.

        Returns:
            The registering function. It accepts a hook, stores it and returns None.
        """

        def register_hooks(hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None:
            target_hooks = self._post_navigation_hooks_pw_only if playwright_only else self._post_navigation_hooks
            target_hooks.append(hook)

        if not hook:
            # Parametrized usage: registration happens later, through the returned function.
            return register_hooks

        # Plain usage: the hook is already known, register it now.
        register_hooks(hook)
        return register_hooks

    def track_http_only_request_handler_runs(self) -> None:
        """Add one to the `http_only_request_handler_runs` counter of the crawler statistics."""
        stats_state = self.statistics.state
        stats_state.http_only_request_handler_runs += 1

    def track_browser_request_handler_runs(self) -> None:
        """Add one to the `browser_request_handler_runs` counter of the crawler statistics."""
        stats_state = self.statistics.state
        stats_state.browser_request_handler_runs += 1

    def track_rendering_type_mispredictions(self) -> None:
        """Add one to the `rendering_type_mispredictions` counter of the crawler statistics."""
        stats_state = self.statistics.state
        stats_state.rendering_type_mispredictions += 1


@dataclass(frozen=True)
class SubCrawlerRun:
    """Outcome of a single sub crawler run, as returned by `AdaptivePlaywrightCrawler._crawl_one`.

    `_crawl_one` fills in exactly one of the fields: `result` when the run finished, `exception` when it raised.
    """

    # Result collected during the run; stays None when the run raised.
    result: RequestHandlerRunResult | None = None
    # Exception raised by the run; stays None when the run finished.
    exception: Exception | None = None


================================================
FILE: src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py
================================================
from __future__ import annotations

from typing import Annotated

from pydantic import ConfigDict, Field

from crawlee._utils.docs import docs_group
from crawlee.statistics import StatisticsState


@docs_group('Statistics')
class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
    """Statistic data about a crawler run with additional information related to adaptive crawling."""

    # Per pydantic's `ConfigDict` documentation: input is accepted under both the field name and the alias, and
    # infinite/NaN floats are emitted as constants in JSON output.
    # NOTE(review): presumably this replaces the configuration inherited from `StatisticsState` - confirm.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')

    # Every alias below is identical to its field name, so the serialized keys stay in snake_case.
    http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0
    """Number representing how many times static http based crawling was used."""

    browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0
    """Number representing how many times browser based crawling was used."""

    rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0
    """Number representing how many times the predictor gave incorrect prediction."""


================================================
FILE: src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
================================================
from __future__ import annotations

from dataclasses import dataclass, fields
from datetime import timedelta
from typing import TYPE_CHECKING, Generic, TypeVar

from playwright.async_api import TimeoutError as PlaywrightTimeoutError

from crawlee._types import BasicCrawlingContext
from crawlee._utils.docs import docs_group
from crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext
from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext
from crawlee.crawlers._playwright._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext
from crawlee.crawlers._playwright._types import PlaywrightHttpResponse

if TYPE_CHECKING:
    from collections.abc import Awaitable, Callable, Sequence

    from playwright.async_api import Page, Response
    from typing_extensions import Self

    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


# Type the static parser produces for a parsed document (first type argument of `AbstractHttpParser`).
TStaticParseResult = TypeVar('TStaticParseResult')
# Type of the items the static parser returns from selector queries (second type argument of `AbstractHttpParser`).
TStaticSelectResult = TypeVar('TStaticSelectResult')


class AdaptiveContextError(RuntimeError):
    """Error raised when an adaptive crawling context is asked for something it cannot provide.

    Typical case: Playwright-only members such as `page` are accessed while the request is processed by the static
    sub crawler.
    """


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class AdaptivePlaywrightCrawlingContext(
    ParsedHttpCrawlingContext[TStaticParseResult],
    Generic[TStaticParseResult, TStaticSelectResult],
):
    """The crawling context used by `AdaptivePlaywrightCrawler`.

    It provides access to key objects as well as utility functions for handling crawling tasks.
    """

    # Parser of the static sub crawler. It is used for all parsing and selecting, including content that was
    # rendered by the browser.
    _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult]

    # Playwright specific objects. They keep the default None when the context comes from the static sub crawler
    # and are exposed through the properties below.
    _response: Response | None = None
    _infinite_scroll: Callable[[], Awaitable[None]] | None = None
    _page: Page | None = None

    @property
    def page(self) -> Page:
        """The Playwright `Page` object for the current page.

        Raises `AdaptiveContextError` if accessed during static crawling.
        """
        if not self._page:
            raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.')
        return self._page

    @property
    def infinite_scroll(self) -> Callable[[], Awaitable[None]]:
        """A function to perform infinite scrolling on the page.

        This scrolls to the bottom, triggering the loading of additional content if present.
        Raises `AdaptiveContextError` if accessed during static crawling.
        """
        if not self._infinite_scroll:
            raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.')
        return self._infinite_scroll

    @property
    def response(self) -> Response:
        """The Playwright `Response` object containing the response details for the current URL.

        Raises `AdaptiveContextError` if accessed during static crawling.
        """
        if not self._response:
            raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.')
        return self._response

    async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> None:
        """Locate element by css selector and return `None` once it is found.

        The currently available content is checked with the static parser first. Only when the selector does not
        match there, the Playwright locator is awaited.

        Args:
            selector: Css selector to be used to locate specific element on page.
            timeout: Timeout that defines how long the function wait for the selector to appear.

        Raises:
            AdaptiveContextError: If the selector is not in the static content and no Playwright page is available.
            playwright.async_api.TimeoutError: If the element does not appear in the browser within `timeout`.
        """
        if await self._static_parser.select(await self.parse_with_static_parser(), selector):
            return
        # Playwright expects the timeout in milliseconds.
        await self.page.locator(selector).wait_for(timeout=timeout.total_seconds() * 1000)

    async def query_selector_one(
        self, selector: str, timeout: timedelta = timedelta(seconds=5)
    ) -> TStaticSelectResult | None:
        """Locate element by css selector and return first element found.

        Args:
            selector: Css selector to be used to locate specific element on page.
            timeout: Timeout that defines how long the function wait for the selector to appear.

        Returns:
            First item of the result of used static parser `select` method, or `None` when nothing was found
            (including the case when the element did not appear in the browser within `timeout`).

        Raises:
            AdaptiveContextError: If the selector is not in the static content and no Playwright page is available.
        """
        if matches := await self.query_selector_all(selector=selector, timeout=timeout):
            return matches[0]
        return None

    async def query_selector_all(
        self, selector: str, timeout: timedelta = timedelta(seconds=5)
    ) -> Sequence[TStaticSelectResult]:
        """Locate element by css selector and return all elements found.

        The currently available content is checked with the static parser first. Only when the selector does not
        match there, the Playwright locator is awaited and the located element is parsed by the static parser.

        Args:
            selector: Css selector to be used to locate specific element on page.
            timeout: Timeout that defines how long the function wait for the selector to appear.

        Returns:
            List of results of used static parser `select` method. An empty tuple is returned when the element does
            not appear in the browser within `timeout`.

        Raises:
            AdaptiveContextError: If the selector is not in the static content and no Playwright page is available,
                or if the static parser returns `None` for an element located by Playwright.
        """
        if static_content := await self._static_parser.select(await self.parse_with_static_parser(), selector):
            # Selector found in static content.
            return static_content

        # NOTE(review): Playwright locators are strict by default; confirm the behavior of `wait_for` and `evaluate`
        # when `selector` matches more than one element.
        locator = self.page.locator(selector)
        try:
            # Playwright expects the timeout in milliseconds.
            await locator.wait_for(timeout=timeout.total_seconds() * 1000)
        except PlaywrightTimeoutError:
            # Selector not found at all.
            return ()

        parsed_selector = await self._static_parser.select(
            await self._static_parser.parse_text(await locator.evaluate('el => el.outerHTML')), selector
        )
        # NOTE(review): an empty (but not None) selection is returned as-is; confirm whether `select` can
        # return None at all.
        if parsed_selector is not None:
            # Selector found by browser after some wait time and selected by static parser.
            return parsed_selector

        # Selector found by browser after some wait time, but could not be selected by static parser.
        raise AdaptiveContextError(
            'Element exists on the page and Playwright was able to locate it, but the static content parser of '
            'selected static crawler does not support such selector.'
        )

    async def parse_with_static_parser(
        self, selector: str | None = None, timeout: timedelta = timedelta(seconds=5)
    ) -> TStaticParseResult:
        """Parse whole page with static parser. If `selector` argument is used, wait for selector first.

        When a Playwright page is available, its current content is parsed. Otherwise the already parsed content of
        the static crawl is returned.

        Args:
            selector: css selector to be used to locate specific element on page.
            timeout: timeout that defines how long the function wait for the selector to appear.

        Returns:
            Result of used static parser `parse_text` method, or `parsed_content` of this context during static
            crawling.

        Raises:
            AdaptiveContextError: If `selector` is not in the static content and no Playwright page is available.
            playwright.async_api.TimeoutError: If `selector` does not appear in the browser within `timeout`.
        """
        if selector:
            await self.wait_for_selector(selector, timeout)
        if self._page:
            return await self._static_parser.parse_text(await self.page.content())
        return self.parsed_content

    @classmethod
    def from_parsed_http_crawling_context(
        cls,
        context: ParsedHttpCrawlingContext[TStaticParseResult],
        parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult],
    ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]:
        """Initialize a new instance from an existing `ParsedHttpCrawlingContext`.

        All dataclass fields of `context` are copied. The Playwright specific fields keep their `None` defaults.
        """
        return cls(_static_parser=parser, **{field.name: getattr(context, field.name) for field in fields(context)})

    @classmethod
    async def from_playwright_crawling_context(
        cls,
        context: PlaywrightCrawlingContext,
        parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult],
    ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]:
        """Initialize a new instance from an existing `PlaywrightCrawlingContext`.

        The Playwright response is wrapped to `PlaywrightHttpResponse` and parsed with `parser`, so that the new
        context offers the same `http_response` and `parsed_content` members as a context of a static crawl.
        """
        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
        # Remove playwright specific attributes and pass them as private instead to be available as property.
        context_kwargs['_response'] = context_kwargs.pop('response')
        context_kwargs['_page'] = context_kwargs.pop('page')
        context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll')
        # This might not be always available.
        protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0]?.nextHopProtocol')
        http_response = await PlaywrightHttpResponse.from_playwright_response(
            response=context.response, protocol=protocol_guess or ''
        )
        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
        context_kwargs.pop('block_requests')
        context_kwargs.pop('goto_options')
        return cls(
            parsed_content=await parser.parse(http_response),
            http_response=http_response,
            _static_parser=parser,
            **context_kwargs,
        )


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
    """A wrapper around the pre-navigation context of either sub crawler of `AdaptivePlaywrightCrawler`.

    The wrapped context is a plain `BasicCrawlingContext` for the static sub crawler, or a Playwright pre-navigation
    context (which additionally carries `page`, `block_requests` and `goto_options`) for the Playwright sub crawler.

    Trying to access `page` on this context will raise AdaptiveContextError if wrapped context is BasicCrawlingContext.
    """

    # Playwright page of the wrapped context; None when the wrapped context has no `page` field.
    _page: Page | None = None
    block_requests: BlockRequestsFunction | None = None
    """Blocks network requests matching specified URL patterns."""

    goto_options: GotoOptions | None = None
    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""

    @property
    def page(self) -> Page:
        """The Playwright `Page` object for the current page.

        Raises `AdaptiveContextError` if accessed during static crawling.
        """
        if self._page is not None:
            return self._page
        raise AdaptiveContextError(
            'Page was crawled with static sub crawler and not with PlaywrightCrawler. For Playwright only '
            'hooks please use `playwright_only=True` when registering the hook. '
            'For example: @crawler.pre_navigation_hook(playwright_only=True)'
        )

    @classmethod
    def from_pre_navigation_context(cls, context: BasicCrawlingContext) -> Self:
        """Initialize a new instance from an existing pre-navigation `BasicCrawlingContext`.

        All dataclass fields of `context` are copied. A `page` field, when present, is stored privately and exposed
        through the `page` property. A missing `block_requests` field is replaced by a function that does nothing.
        """
        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
        context_kwargs['_page'] = context_kwargs.pop('page', None)

        # For static sub crawler replace block requests by function doing nothing.
        async def dummy_block_requests(
            url_patterns: list[str] | None = None,  # noqa:ARG001
            extra_url_patterns: list[str] | None = None,  # noqa:ARG001
        ) -> None:
            return

        context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests)
        return cls(**context_kwargs)


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class AdaptivePlaywrightPostNavCrawlingContext(HttpCrawlingContext):
    """Post-navigation context used by the adaptive crawler for both of its sub crawlers.

    It is created from an `HttpCrawlingContext` or a `PlaywrightPostNavCrawlingContext`. The `page` and `response`
    properties raise `AdaptiveContextError` when the source context did not carry the Playwright objects.
    """

    _page: Page | None = None
    _response: Response | None = None

    @property
    def page(self) -> Page:
        """The Playwright `Page` object for the current page.

        Raises `AdaptiveContextError` if accessed during static crawling.
        """
        if self._page:
            return self._page
        raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.')

    @property
    def response(self) -> Response:
        """The Playwright `Response` object containing the response details for the current URL.

        Raises `AdaptiveContextError` if accessed during static crawling.
        """
        if self._response:
            return self._response
        raise AdaptiveContextError('Response was not crawled with PlaywrightCrawler.')

    @classmethod
    async def from_post_navigation_context(
        cls, context: HttpCrawlingContext | PlaywrightPostNavCrawlingContext
    ) -> Self:
        """Build a new instance out of an existing post-navigation context.

        For a `PlaywrightPostNavCrawlingContext` the Playwright response is additionally wrapped into
        a `PlaywrightHttpResponse` and stored as `http_response`.
        """
        init_kwargs = {source_field.name: getattr(context, source_field.name) for source_field in fields(context)}

        # Playwright objects are kept under private names and exposed through the properties.
        for public_name in ('page', 'response'):
            init_kwargs[f'_{public_name}'] = init_kwargs.pop(public_name, None)

        # These two are useful only on pre-navigation contexts.
        for pre_navigation_only_name in ('block_requests', 'goto_options'):
            init_kwargs.pop(pre_navigation_only_name, None)

        if not isinstance(context, PlaywrightPostNavCrawlingContext):
            return cls(**init_kwargs)

        playwright_page = init_kwargs['_page']
        protocol_guess = await playwright_page.evaluate('() => performance.getEntries()[0]?.nextHopProtocol')
        init_kwargs['http_response'] = await PlaywrightHttpResponse.from_playwright_response(
            response=context.response,
            protocol=protocol_guess or '',
        )
        return cls(**init_kwargs)


================================================
FILE: src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass
from itertools import zip_longest
from logging import getLogger
from statistics import mean
from typing import TYPE_CHECKING, Annotated, Literal
from urllib.parse import urlparse

from jaro import jaro_winkler_metric
from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
from sklearn.linear_model import LogisticRegression
from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee._utils.recoverable_state import RecoverableState

from ._utils import sklearn_model_serializer, sklearn_model_validator

if TYPE_CHECKING:
    from types import TracebackType

    from crawlee import Request

# Module level logger, it is also handed over to the `RecoverableState` of `DefaultRenderingTypePredictor`.
logger = getLogger(__name__)

# URL split into components. `get_url_components` puts the network location first, followed by the path segments.
UrlComponents = list[str]
# The two rendering types the predictor distinguishes.
RenderingType = Literal['static', 'client only']
# Pair of mean similarities: (to known 'static' URLs, to known 'client only' URLs).
FeatureVector = tuple[float, float]


class RenderingTypePredictorState(BaseModel):
    """State of `DefaultRenderingTypePredictor` that is kept in its `RecoverableState`.

    It consists of the logistic regression model and of the per-label coefficients.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # The model is validated by `sklearn_model_validator` and serialized by `sklearn_model_serializer`.
    # NOTE(review): `Field(LogisticRegression)` passes the class itself as the first positional argument of `Field`,
    # which looks like it sets the class (not an instance) as the default value - confirm this is intended.
    model: Annotated[
        LogisticRegression,
        Field(LogisticRegression),
        PlainValidator(sklearn_model_validator),
        PlainSerializer(sklearn_model_serializer),
    ]

    # Coefficient per request label, `DefaultRenderingTypePredictor.predict` multiplies the detection ratio by it.
    # Available under the `labelsCoefficients` alias as well.
    labels_coefficients: Annotated[defaultdict[str, float], Field(alias='labelsCoefficients')]


@docs_group('Other')
@dataclass(frozen=True)
class RenderingTypePrediction:
    """Rendering type recommendation together with the recommended probability of running a rendering type detection.

    It is the return value of `RenderingTypePredictor.predict`. Instances are frozen, so they can not be modified
    after creation.
    """

    rendering_type: RenderingType
    """Recommended rendering type."""
    detection_probability_recommendation: float
    """Recommended rendering detection probability. Expected values between 0-1.

    Zero represents absolute confidence in `rendering_type` recommendation.
    One represents no confidence in `rendering_type` recommendation."""


@docs_group('Other')
class RenderingTypePredictor(ABC):
    """Base class of predictors that store rendering types of crawled URLs and predict them for unvisited URLs.

    Subclasses implement `predict` and `store_result`. The predictor can be used as an async context manager,
    entering it calls `initialize` and leaving it calls `clear`.
    """

    def __init__(self) -> None:
        """Initialize a new instance."""
        # Set by `initialize` and reset by `clear`.
        self._active = False

    @abstractmethod
    def predict(self, request: Request) -> RenderingTypePrediction:
        """Get `RenderingTypePrediction` for the given request.

        Args:
            request: `Request` instance for which the prediction is made.
        """

    @abstractmethod
    def store_result(self, request: Request, rendering_type: RenderingType) -> None:
        """Store the known rendering type of a request and retrain the model.

        Args:
            request: Used request.
            rendering_type: Known suitable `RenderingType`.
        """

    async def initialize(self) -> None:
        """Initialize additional resources required for the predictor operation.

        Raises:
            RuntimeError: If the predictor is already active.
        """
        if not self._active:
            self._active = True
            return
        raise RuntimeError(f'The {self.__class__.__name__} is already active.')

    async def clear(self) -> None:
        """Clear and release additional resources used by the predictor.

        Raises:
            RuntimeError: If the predictor is not active.
        """
        if self._active:
            self._active = False
            return
        raise RuntimeError(f'The {self.__class__.__name__} is not active.')

    async def __aenter__(self) -> RenderingTypePredictor:
        """Initialize the predictor upon entering the context manager."""
        await self.initialize()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Clear the predictor upon exiting the context manager."""
        await self.clear()


@docs_group('Other')
class DefaultRenderingTypePredictor(RenderingTypePredictor):
    """Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls.

    `RenderingTypePredictor` implementation based on logistic regression:
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

    Each URL is described by two features: its mean similarity to the known 'static' URLs and its mean similarity
    to the known 'client only' URLs stored under the same request label.
    """

    def __init__(
        self,
        detection_ratio: float = 0.1,
        *,
        persistence_enabled: bool = False,
        persist_state_key: str = 'rendering-type-predictor-state',
    ) -> None:
        """Initialize a new instance.

        Args:
            detection_ratio: A number between 0 and 1 that determines the desired ratio of rendering type detections.
                Values outside of this range are clamped to it.
            persistence_enabled: Whether to enable persistence of the trained model parameters for reuse.
            persist_state_key: Key in the key-value storage where the trained model parameters will be saved.
        """
        super().__init__()

        # Known URLs (as components) grouped by the detected rendering type and by the request label.
        self._rendering_type_detection_results: dict[RenderingType, dict[str, list[UrlComponents]]] = {
            'static': defaultdict(list),
            'client only': defaultdict(list),
        }
        self._detection_ratio = max(0, min(1, detection_ratio))

        # Used to increase detection probability recommendation for initial recommendations of each label.
        # The coefficient of a label starts at `n + 2` and `store_result` decreases it by one for each stored result
        # of that label until it reaches 1 (no additional increase).
        n = 3

        self._state = RecoverableState(
            default_state=RenderingTypePredictorState(
                model=LogisticRegression(max_iter=1000), labels_coefficients=defaultdict(lambda: n + 2)
            ),
            persist_state_key=persist_state_key,
            persistence_enabled=persistence_enabled,
            logger=logger,
        )

    @override
    async def initialize(self) -> None:
        """Activate the predictor and initialize its recoverable state if it is not initialized yet."""
        await super().initialize()

        if not self._state.is_initialized:
            await self._state.initialize()

    @override
    async def clear(self) -> None:
        """Deactivate the predictor and tear down its recoverable state if it is initialized."""
        await super().clear()

        if self._state.is_initialized:
            await self._state.teardown()

    @override
    def predict(self, request: Request) -> RenderingTypePrediction:
        """Get `RenderingTypePrediction` based on the input request.

        Until the model is fitted, 'client only' with detection probability recommendation 1 is returned.

        Args:
            request: `Request` instance for which the prediction is made.

        Returns:
            The predicted rendering type and a detection probability recommendation that never exceeds 1.
        """
        similarity_threshold = 0.1  #  Prediction probability difference threshold to consider prediction unreliable.
        label = request.label or ''

        # Check that the model has already been fitted.
        if hasattr(self._state.current_value.model, 'coef_'):
            url_feature = self._calculate_feature_vector(get_url_components(request.url), label)
            # Are both calls expensive?
            prediction = self._state.current_value.model.predict([url_feature])[0]
            probability = self._state.current_value.model.predict_proba([url_feature])[0]

            if abs(probability[0] - probability[1]) < similarity_threshold:
                # Prediction not reliable.
                detection_probability_recommendation = 1.0
            else:
                # Increase recommendation for uncommon labels. The product can get above 1 for a high detection ratio
                # combined with a fresh label, so it is capped to keep the documented 0-1 range.
                detection_probability_recommendation = min(
                    1.0, self._detection_ratio * self._state.current_value.labels_coefficients[label]
                )

            return RenderingTypePrediction(
                # The model is trained with 0 for 'client only' and 1 for 'static', see `_retrain`.
                rendering_type=('client only', 'static')[int(prediction)],
                detection_probability_recommendation=detection_probability_recommendation,
            )
        # No data available yet.
        return RenderingTypePrediction(rendering_type='client only', detection_probability_recommendation=1)

    @override
    def store_result(self, request: Request, rendering_type: RenderingType) -> None:
        """Store prediction results and retrain the model.

        Args:
            request: Used `Request` instance.
            rendering_type: Known suitable `RenderingType` for the used `Request` instance.
        """
        label = request.label or ''
        self._rendering_type_detection_results[rendering_type][label].append(get_url_components(request.url))
        # The label got one more sample, lower its coefficient (but never below 1).
        if self._state.current_value.labels_coefficients[label] > 1:
            self._state.current_value.labels_coefficients[label] -= 1
        self._retrain()

    def _retrain(self) -> None:
        """Fit the model again on the feature vectors of all stored URLs."""
        # Two fixed samples, one per class, so that the training data always contain both classes.
        x: list[FeatureVector] = [(0, 1), (1, 0)]
        y: list[float] = [0, 1]

        for rendering_type, urls_by_label in self._rendering_type_detection_results.items():
            encoded_rendering_type = 1 if rendering_type == 'static' else 0
            for label, urls in urls_by_label.items():
                for url_components in urls:
                    x.append(self._calculate_feature_vector(url_components, label))
                    y.append(encoded_rendering_type)

        self._state.current_value.model.fit(x, y)

    def _calculate_mean_similarity(self, url: UrlComponents, label: str, rendering_type: RenderingType) -> float:
        """Get mean similarity of `url` to the stored URLs of given label and rendering type, 0 if there are none."""
        if not self._rendering_type_detection_results[rendering_type][label]:
            return 0
        return mean(
            calculate_url_similarity(url, known_url_components)
            for known_url_components in self._rendering_type_detection_results[rendering_type][label]
        )

    def _calculate_feature_vector(self, url: UrlComponents, label: str) -> tuple[float, float]:
        """Get the feature vector of `url`: mean similarity to 'static' and to 'client only' URLs of the label."""
        return (
            self._calculate_mean_similarity(url, label, 'static'),
            self._calculate_mean_similarity(url, label, 'client only'),
        )


def get_url_components(url: str) -> UrlComponents:
    """Split the url into a list of components, the host name (network location) always comes first.

    The path is stripped of leading and trailing slashes and split by `/`. Without a path only the host name
    is returned.
    """
    parts = urlparse(url)
    components = [parts.netloc]
    if parts.path:
        components.extend(parts.path.strip('/').split('/'))
    return components


def calculate_url_similarity(url_1: UrlComponents, url_2: UrlComponents) -> float:
    """Calculate url similarity based on host name and path components similarity.

    Return 0 if any of the urls has no components or if the host names are different.
    Return 1 if the urls are identical.
    Otherwise compare path components using the Jaro-Winkler method and assign 1 or 0 value based on
    similarity_cutoff for each path component. Return their average.

    Args:
        url_1: Components of the first url, host name first.
        url_2: Components of the second url, host name first.
    """
    # Anything with jaro_winkler_metric less than this value is considered completely different,
    # otherwise considered the same.
    similarity_cutoff = 0.8

    # Emptiness has to be checked before the host names are read, indexing an empty list raises `IndexError`.
    if not url_1 or not url_2 or url_1[0] != url_2[0]:
        return 0
    if url_1 == url_2:
        return 1

    # Each additional path component from longer path is compared to empty string.
    return mean(
        1 if jaro_winkler_metric(path_1, path_2) > similarity_cutoff else 0
        for path_1, path_2 in zip_longest(url_1[1:], url_2[1:], fillvalue='')
    )


================================================
FILE: src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Callable

    from crawlee._types import RequestHandlerRunResult


def create_default_comparator(
    result_checker: Callable[[RequestHandlerRunResult], bool] | None,
) -> Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool]:
    """Create the comparator that is used when no explicit comparator is available.

    With a `result_checker` the comparator accepts a pair of results only if the checker accepts both of them.
    Without it, `push_data_only_comparator` is returned.
    """
    if not result_checker:
        # Fallback default comparator.
        return push_data_only_comparator

    checker = result_checker

    # Fallback comparator if only user-specific checker is defined.
    def both_results_accepted(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool:
        return checker(result_1) and checker(result_2)

    return both_results_accepted


def full_result_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool:
    """Compare results part by part, all of the parts have to be equal.

    The compared parts are `push_data_calls`, `add_requests_calls` and `key_value_store_changes`.

    Comparison of `add_requests_calls` will consider same url requests with different parameters as different.
    For example following two requests will be considered as different requests:
    https://sdk.apify.com/docs/guides/getting-started
    https://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.173549427712
    """
    compared_parts = (
        (result_1.push_data_calls, result_2.push_data_calls),
        (result_1.add_requests_calls, result_2.add_requests_calls),
        (result_1.key_value_store_changes, result_2.key_value_store_changes),
    )
    return all(first_part == second_part for first_part, second_part in compared_parts)


def push_data_only_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool:
    """Compare only the `push_data_calls` of the results, the remaining parts of the results are ignored."""
    first_calls, second_calls = result_1.push_data_calls, result_2.push_data_calls
    return first_calls == second_calls


================================================
FILE: src/crawlee/crawlers/_adaptive_playwright/_utils.py
================================================
from typing import Any

import numpy as np
from sklearn.linear_model import LogisticRegression


def sklearn_model_validator(v: LogisticRegression | dict[str, Any]) -> LogisticRegression:
    """Convert the serialized form of a model back to a `LogisticRegression` instance.

    Args:
        v: Either a model instance (returned as it is) or a dictionary in the format produced by
            `sklearn_model_serializer`.

    Returns:
        The model. Its fitted attributes are restored only if the dictionary is marked with `is_fitted`.
    """
    if isinstance(v, LogisticRegression):
        return v

    # The serializer stores `max_iter` and `solver`, restore them so the model survives the round trip unchanged.
    # The fallbacks keep dictionaries without these keys working: 1000 was the fixed value used here before and
    # 'lbfgs' is the default solver of `LogisticRegression`.
    model = LogisticRegression(max_iter=v.get('max_iter', 1000), solver=v.get('solver', 'lbfgs'))
    if v.get('is_fitted', False):
        model.coef_ = np.array(v['coef'])
        model.intercept_ = np.array(v['intercept'])
        model.classes_ = np.array(v['classes'])
        model.n_iter_ = np.array(v.get('n_iter', [1000]))

    return model


def sklearn_model_serializer(model: LogisticRegression) -> dict[str, Any]:
    """Convert the model to a dictionary of plain values.

    `max_iter`, `solver` and the `is_fitted` flag are always included. For a fitted model (one that has `coef_`)
    the coefficients, intercept, classes and number of iterations are included as lists too.
    """
    hyper_parameters = {'max_iter': model.max_iter, 'solver': model.solver}

    if not hasattr(model, 'coef_'):
        return {'is_fitted': False, **hyper_parameters}

    if hasattr(model, 'n_iter_'):
        n_iter = model.n_iter_.tolist()
    else:
        n_iter = [1000]

    return {
        'coef': np.asarray(model.coef_).tolist(),
        'intercept': model.intercept_.tolist(),
        'classes': model.classes_.tolist(),
        'n_iter': n_iter,
        'is_fitted': True,
        **hyper_parameters,
    }


================================================
FILE: src/crawlee/crawlers/_basic/__init__.py
================================================
from ._basic_crawler import BasicCrawler, BasicCrawlerOptions
from ._basic_crawling_context import BasicCrawlingContext
from ._context_pipeline import ContextPipeline

# Names imported above that make up the public interface of this package.
__all__ = [
    'BasicCrawler',
    'BasicCrawlerOptions',
    'BasicCrawlingContext',
    'ContextPipeline',
]


================================================
FILE: src/crawlee/crawlers/_basic/_basic_crawler.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/basic-crawler/src/internals/basic-crawler.ts
from __future__ import annotations

import asyncio
import functools
import logging
import signal
import sys
import tempfile
import threading
import traceback
from asyncio import CancelledError
from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence
from contextlib import AsyncExitStack, suppress
from datetime import timedelta
from functools import partial
from io import StringIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
from urllib.parse import ParseResult, urlparse
from weakref import WeakKeyDictionary

from cachetools import LRUCache
from tldextract import TLDExtract
from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never
from yarl import URL

from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locator
from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
from crawlee._request import Request, RequestOptions, RequestState
from crawlee._service_locator import ServiceLocator
from crawlee._types import (
    BasicCrawlingContext,
    EnqueueLinksKwargs,
    ExportDataCsvKwargs,
    ExportDataJsonKwargs,
    GetKeyValueStoreFromRequestHandlerFunction,
    HttpHeaders,
    HttpPayload,
    LogLevel,
    RequestHandlerRunResult,
    SendRequestFunction,
    SkippedReason,
)
from crawlee._utils.docs import docs_group
from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
from crawlee._utils.recurring_task import RecurringTask
from crawlee._utils.robots import RobotsTxtFile
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee._utils.wait import wait_for
from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
from crawlee.errors import (
    ContextPipelineInitializationError,
    ContextPipelineInterruptedError,
    HttpClientStatusCodeError,
    HttpStatusCodeError,
    RequestCollisionError,
    RequestHandlerError,
    SessionError,
    UserDefinedErrorHandlerError,
    UserHandlerTimeoutError,
)
from crawlee.events._types import Event, EventCrawlerStatusData
from crawlee.http_clients import ImpitHttpClient
from crawlee.router import Router
from crawlee.sessions import SessionPool
from crawlee.statistics import Statistics, StatisticsState
from crawlee.storages import Dataset, KeyValueStore, RequestQueue

from ._context_pipeline import ContextPipeline
from ._context_utils import swapped_context
from ._logging_utils import (
    get_one_line_error_summary_if_possible,
    reduce_asyncio_timeout_error_to_relevant_traceback_parts,
)

if TYPE_CHECKING:
    import re
    from collections.abc import Iterator
    from contextlib import AbstractAsyncContextManager

    from crawlee._types import (
        ConcurrencySettings,
        EnqueueLinksFunction,
        ExtractLinksFunction,
        GetDataKwargs,
        HttpMethod,
        JsonSerializable,
        PushDataKwargs,
    )
    from crawlee.configuration import Configuration
    from crawlee.events import EventManager
    from crawlee.http_clients import HttpClient, HttpResponse
    from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
    from crawlee.request_loaders import RequestManager
    from crawlee.sessions import Session
    from crawlee.statistics import FinalStatistics
    from crawlee.storage_clients import StorageClient
    from crawlee.storage_clients.models import DatasetItemsListPage

# Type of the crawling context, bound to `BasicCrawlingContext` which is also the default.
TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
# Type of the statistics state, bound to `StatisticsState` which is also the default.
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
# Constrained type variable, it is either `str` or `Request`.
TRequestIterator = TypeVar('TRequestIterator', str, Request)
# Generic helpers for typing of wrapped callables: their parameters and an arbitrary type.
TParams = ParamSpec('TParams')
T = TypeVar('T')

# Async callable that gets the crawling context and an exception, it returns a `Request` or `None`.
ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
# Async callable that gets the crawling context and an exception, it returns nothing.
FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
# Async callable that gets a string (presumably the URL of the skipped request - confirm) and a `SkippedReason`.
SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]


class _BasicCrawlerOptions(TypedDict):
    """Non-generic options for the `BasicCrawler` constructor.

    Every key is declared as `NotRequired`, so any subset of the options may be provided. The options that depend
    on the type parameters of the crawler are declared separately in `_BasicCrawlerOptionsGeneric`.
    """

    configuration: NotRequired[Configuration]
    """The `Configuration` instance. Some of its properties are used as defaults for the crawler."""

    event_manager: NotRequired[EventManager]
    """The event manager for managing events for the crawler and all its components."""

    storage_client: NotRequired[StorageClient]
    """The storage client for managing storages for the crawler and all its components."""

    request_manager: NotRequired[RequestManager]
    """Manager of requests that should be processed by the crawler."""

    session_pool: NotRequired[SessionPool]
    """A custom `SessionPool` instance, allowing the use of non-default configuration."""

    proxy_configuration: NotRequired[ProxyConfiguration]
    """HTTP proxy configuration used when making requests."""

    http_client: NotRequired[HttpClient]
    """HTTP client used by `BasicCrawlingContext.send_request` method."""

    max_request_retries: NotRequired[int]
    """Specifies the maximum number of retries allowed for a request if its processing fails.
    This includes retries due to navigation errors or errors thrown from user-supplied functions
    (`request_handler`, `pre_navigation_hooks` etc.).

    This limit does not apply to retries triggered by session rotation (see `max_session_rotations`)."""

    max_requests_per_crawl: NotRequired[int | None]
    """Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.
    Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.
    Due to concurrency settings, the actual number of pages visited may slightly exceed this value."""

    max_session_rotations: NotRequired[int]
    """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
    or if the website blocks the request.

    The session rotations are not counted towards the `max_request_retries` limit.
    """

    max_crawl_depth: NotRequired[int | None]
    """Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.
    The crawl depth starts at 0 for initial requests and increases with each subsequent level of links.
    Requests at the maximum depth will still be processed, but no new links will be enqueued from those requests.
    If not set, crawling continues without depth restrictions.
    """

    use_session_pool: NotRequired[bool]
    """Enable the use of a session pool for managing sessions during crawling."""

    retry_on_blocked: NotRequired[bool]
    """If True, the crawler attempts to bypass bot protections automatically."""

    concurrency_settings: NotRequired[ConcurrencySettings]
    """Settings to fine-tune concurrency levels."""

    request_handler_timeout: NotRequired[timedelta]
    """Maximum duration allowed for a single request handler to run."""

    abort_on_error: NotRequired[bool]
    """If True, the crawler stops immediately when any request handler error occurs."""

    configure_logging: NotRequired[bool]
    """If True, the crawler will set up logging infrastructure automatically."""

    statistics_log_format: NotRequired[Literal['table', 'inline']]
    """If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain
    text log messages.
    """

    keep_alive: NotRequired[bool]
    """Flag that can keep crawler running even when there are no requests in queue."""

    additional_http_error_status_codes: NotRequired[Iterable[int]]
    """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

    ignore_http_error_status_codes: NotRequired[Iterable[int]]
    """HTTP status codes that are typically considered errors but should be treated as successful responses."""

    _additional_context_managers: NotRequired[Sequence[AbstractAsyncContextManager]]
    """Additional context managers used throughout the crawler lifecycle. Intended for use by
    subclasses rather than direct instantiation of `BasicCrawler`."""

    _logger: NotRequired[logging.Logger]
    """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by
    subclasses rather than direct instantiation of `BasicCrawler`."""

    respect_robots_txt_file: NotRequired[bool]
    """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,
    and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`."""

    status_message_logging_interval: NotRequired[timedelta]
    """Interval for logging the crawler status messages."""

    status_message_callback: NotRequired[
        Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]]
    ]
    """Allows overriding the default status message. The default status message is provided in the parameters.
    Returning `None` suppresses the status message."""

    id: NotRequired[int]
    """Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between
    them."""


class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):
    """Generic options for the `BasicCrawler` constructor.

    These are the options whose types depend on the crawling context type or on the statistics state type.
    Every key is declared as `NotRequired`.
    """

    request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
    """A callable responsible for handling requests."""

    _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
    """Enables extending the request lifecycle and modifying the crawling context. Intended for use by
    subclasses rather than direct instantiation of `BasicCrawler`."""

    statistics: NotRequired[Statistics[TStatisticsState]]
    """A custom `Statistics` instance, allowing the use of non-default configuration."""


class BasicCrawlerOptions(
    _BasicCrawlerOptions,
    _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],
    Generic[TCrawlingContext, TStatisticsState],
):
    """Arguments for the `BasicCrawler` constructor.

    It combines the keys of `_BasicCrawlerOptions` and `_BasicCrawlerOptionsGeneric` and declares no keys of its own.
    It is intended for typing forwarded `__init__` arguments in the subclasses.
    """


@docs_group('Crawlers')
class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
    """A basic web crawler providing a framework for crawling websites.

    The `BasicCrawler` provides a low-level functionality for crawling websites, allowing users to define their
    own page download and data extraction logic. It is designed mostly to be subclassed by crawlers with specific
    purposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawler`,
    `BeautifulSoupCrawler`, `ParselCrawler`, or `PlaywrightCrawler`. If you are an advanced user and want full
    control over the crawling process, you can subclass the `BasicCrawler` and implement the request-handling logic
    yourself.

    The crawling process begins with URLs provided by a `RequestProvider` instance. Each request is then
    handled by a user-defined `request_handler` function, which processes the page and extracts the data.

    The `BasicCrawler` includes several common features for crawling, such as:
        - automatic scaling based on the system resources,
        - retries for failed requests,
        - session management,
        - statistics tracking,
        - request routing via labels,
        - proxy rotation,
        - direct storage interaction helpers,
        - and more.
    """

    _CRAWLEE_STATE_KEY = 'CRAWLEE_STATE'
    _request_handler_timeout_text = 'Request handler timed out after'
    __next_id = 0

    def __init__(
        self,
        *,
        configuration: Configuration | None = None,
        event_manager: EventManager | None = None,
        storage_client: StorageClient | None = None,
        request_manager: RequestManager | None = None,
        session_pool: SessionPool | None = None,
        proxy_configuration: ProxyConfiguration | None = None,
        http_client: HttpClient | None = None,
        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
        max_request_retries: int = 3,
        max_requests_per_crawl: int | None = None,
        max_session_rotations: int = 10,
        max_crawl_depth: int | None = None,
        use_session_pool: bool = True,
        retry_on_blocked: bool = True,
        additional_http_error_status_codes: Iterable[int] | None = None,
        ignore_http_error_status_codes: Iterable[int] | None = None,
        concurrency_settings: ConcurrencySettings | None = None,
        request_handler_timeout: timedelta = timedelta(minutes=1),
        statistics: Statistics[TStatisticsState] | None = None,
        abort_on_error: bool = False,
        keep_alive: bool = False,
        configure_logging: bool = True,
        statistics_log_format: Literal['table', 'inline'] = 'table',
        respect_robots_txt_file: bool = False,
        status_message_logging_interval: timedelta = timedelta(seconds=10),
        status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]]
        | None = None,
        id: int | None = None,
        _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
        _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
        _logger: logging.Logger | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            configuration: The `Configuration` instance. Some of its properties are used as defaults for the crawler.
            event_manager: The event manager for managing events for the crawler and all its components.
            storage_client: The storage client for managing storages for the crawler and all its components.
            request_manager: Manager of requests that should be processed by the crawler.
            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
            proxy_configuration: HTTP proxy configuration used when making requests.
            http_client: HTTP client used by `BasicCrawlingContext.send_request` method.
            request_handler: A callable responsible for handling requests.
            max_request_retries: Specifies the maximum number of retries allowed for a request if its processing fails.
                This includes retries due to navigation errors or errors thrown from user-supplied functions
                (`request_handler`, `pre_navigation_hooks` etc.).
                This limit does not apply to retries triggered by session rotation (see `max_session_rotations`).
            max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
                no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
                this value. If used together with `keep_alive`, then the crawler will be kept alive only until
                `max_requests_per_crawl` is achieved.
            max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                if a proxy error occurs or if the website blocks the request.
                The session rotations are not counted towards the `max_request_retries` limit.
            max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond
                this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level
                of links. Requests at the maximum depth will still be processed, but no new links will be enqueued
                from those requests. If not set, crawling continues without depth restrictions.
            use_session_pool: Enable the use of a session pool for managing sessions during crawling.
            retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
            additional_http_error_status_codes: Additional HTTP status codes to treat as errors,
                triggering automatic retries when encountered.
            ignore_http_error_status_codes: HTTP status codes that are typically considered errors but should be treated
                as successful responses.
            concurrency_settings: Settings to fine-tune concurrency levels.
            request_handler_timeout: Maximum duration allowed for a single request handler to run.
            statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
            abort_on_error: If True, the crawler stops immediately when any request handler error occurs.
            keep_alive: If True, it will keep crawler alive even if there are no requests in queue.
                Use `crawler.stop()` to exit the crawler.
            configure_logging: If True, the crawler will set up logging infrastructure automatically.
            statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
                outputs statistics as plain text log messages.
            respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file
                for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added
                via `EnqueueLinksFunction`
            status_message_logging_interval: Interval for logging the crawler status messages.
            status_message_callback: Allows overriding the default status message. The default status message is
                provided in the parameters. Returning `None` suppresses the status message.
            id: Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state
                between them.
            _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
            _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
            _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
        """
        # Without an explicit id, take the next value of the class-wide counter so each crawler gets its own id.
        if id is None:
            self._id = BasicCrawler.__next_id
            BasicCrawler.__next_id += 1
        else:
            self._id = id

        # Remember the "explicit configuration + implicit event manager" combination; the warning about it
        # is emitted later, once the logger has been set up.
        implicit_event_manager_with_explicit_config = False
        if not configuration:
            configuration = service_locator.get_configuration()
        elif not event_manager:
            implicit_event_manager_with_explicit_config = True

        if not storage_client:
            storage_client = service_locator.get_storage_client()

        if not event_manager:
            event_manager = service_locator.get_event_manager()

        # Crawler-local service locator holding the resolved services.
        self._service_locator = ServiceLocator(
            configuration=configuration, storage_client=storage_client, event_manager=event_manager
        )

        config = self._service_locator.get_configuration()

        # Core components
        self._request_manager = request_manager
        self._session_pool = session_pool or SessionPool()
        self._proxy_configuration = proxy_configuration

        self._additional_http_error_status_codes = (
            set(additional_http_error_status_codes) if additional_http_error_status_codes else set()
        )
        self._ignore_http_error_status_codes = (
            set(ignore_http_error_status_codes) if ignore_http_error_status_codes else set()
        )

        self._http_client = http_client or ImpitHttpClient()

        # Request router setup
        self._router: Router[TCrawlingContext] | None = None
        if isinstance(cast('Router', request_handler), Router):
            # A `Router` passed as the request handler is used directly.
            self._router = cast('Router[TCrawlingContext]', request_handler)
        elif request_handler is not None:
            # A plain callable becomes the default handler of a lazily created router.
            self._router = None
            self.router.default_handler(request_handler)

        # Error, failed & skipped request handlers
        self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
        self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None
        self._on_skipped_request: SkippedRequestCallback | None = None
        self._abort_on_error = abort_on_error

        # Crawler callbacks
        self._status_message_callback = status_message_callback

        # Context of each request with matching result of request handler.
        # Inheritors can use this to override the result of individual request handler runs in `_run_request_handler`.
        self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()

        # Context pipeline
        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]

        # Crawl settings
        self._max_request_retries = max_request_retries
        self._max_requests_per_crawl = max_requests_per_crawl
        self._max_session_rotations = max_session_rotations
        self._max_crawl_depth = max_crawl_depth
        self._respect_robots_txt_file = respect_robots_txt_file

        # Timeouts
        self._request_handler_timeout = request_handler_timeout
        # Without a configured internal timeout, use twice the handler timeout, but at least five minutes.
        self._internal_timeout = (
            config.internal_timeout
            if config.internal_timeout is not None
            else max(2 * request_handler_timeout, timedelta(minutes=5))
        )

        # Retry and session settings
        self._use_session_pool = use_session_pool
        self._retry_on_blocked = retry_on_blocked

        # Logging setup
        if configure_logging:
            root_logger = logging.getLogger()
            configure_logger(root_logger, remove_old_handlers=True)
            httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
            httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
        self._logger = _logger or logging.getLogger(__name__)
        if implicit_event_manager_with_explicit_config:
            # The two literals are concatenated, so the first one must end with a space.
            self._logger.warning(
                'No event manager set, implicitly using event manager from global service_locator. '
                'It is advised to explicitly set the event manager if explicit configuration is used as well.'
            )
        self._statistics_log_format = statistics_log_format

        # Statistics
        if statistics:
            self._statistics = statistics
        else:

            # Resolved lazily so that the default statistics persist into this crawler's key-value store.
            async def persist_state_factory() -> KeyValueStore:
                return await self.get_key_value_store()

            self._statistics = cast(
                'Statistics[TStatisticsState]',
                Statistics.with_default_state(
                    persistence_enabled=True,
                    periodic_message_logger=self._logger,
                    statistics_log_format=self._statistics_log_format,
                    log_message='Current request statistics:',
                    persist_state_kvs_factory=persist_state_factory,
                ),
            )

        # Additional context managers to enter and exit
        self._additional_context_managers = _additional_context_managers or []

        # Internal, not explicitly configurable components
        self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000)
        self._robots_txt_lock = asyncio.Lock()
        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
        self._snapshotter = Snapshotter.from_config(config)
        self._autoscaled_pool = AutoscaledPool(
            system_status=SystemStatus(self._snapshotter),
            concurrency_settings=concurrency_settings,
            is_finished_function=self.__is_finished_function,
            is_task_ready_function=self.__is_task_ready_function,
            run_task_function=self.__run_task_function,
        )
        self._crawler_state_rec_task = RecurringTask(
            func=self._crawler_state_task, delay=status_message_logging_interval
        )
        self._previous_crawler_state: TStatisticsState | None = None

        # State flags
        self._keep_alive = keep_alive
        self._running = False
        self._has_finished_before = False

        self._failed = False

        self._unexpected_stop = False

    @property
    def log(self) -> logging.Logger:
        """The logger used by the crawler."""
        return self._logger

    @property
    def router(self) -> Router[TCrawlingContext]:
        """The `Router` used to handle each individual crawling request."""
        if self._router is None:
            self._router = Router[TCrawlingContext]()

        return self._router

    @router.setter
    def router(self, router: Router[TCrawlingContext]) -> None:
        if self._router is not None:
            raise RuntimeError('A router is already set')

        self._router = router

    @property
    def statistics(self) -> Statistics[TStatisticsState]:
        """Statistics about the current (or last) crawler run."""
        return self._statistics

    def stop(self, reason: str = 'Stop was called externally.') -> None:
        """Set flag to stop crawler.

        This stops current crawler run regardless of whether all requests were finished.

        Args:
            reason: Reason for stopping that will be used in logs.
        """
        self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
        self._unexpected_stop = True

    def _wrap_handler_with_error_context(
        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
        """Decorate error handlers to make their context helpers usable."""

        @functools.wraps(handler)
        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
            # failed. Modified context provides context helpers with direct access to the storages.
            error_context = context.create_modified_copy(
                push_data=self._push_data,
                get_key_value_store=self.get_key_value_store,
                add_requests=functools.partial(self._add_requests, context),
            )
            return await handler(error_context, exception)

        return wrapped_handler

    def _stop_if_max_requests_count_exceeded(self) -> None:
        """Call `stop` when the maximum number of requests to crawl has been reached."""
        if self._max_requests_per_crawl is None:
            return

        if self._statistics.state.requests_total >= self._max_requests_per_crawl:
            self.stop(
                reason=f'The crawler has reached its limit of {self._max_requests_per_crawl} requests per crawl. '
            )

    async def _get_session(self) -> Session | None:
        """If session pool is being used, try to take a session from it."""
        if not self._use_session_pool:
            return None

        return await wait_for(
            self._session_pool.get_session,
            timeout=self._internal_timeout,
            timeout_message='Fetching a session from the pool timed out after '
            f'{self._internal_timeout.total_seconds()} seconds',
            max_retries=3,
            logger=self._logger,
        )

    async def _get_session_by_id(self, session_id: str | None) -> Session | None:
        """If session pool is being used, try to take a session by id from it."""
        if not self._use_session_pool or not session_id:
            return None

        return await wait_for(
            partial(self._session_pool.get_session_by_id, session_id),
            timeout=self._internal_timeout,
            timeout_message='Fetching a session from the pool timed out after '
            f'{self._internal_timeout.total_seconds()} seconds',
            max_retries=3,
            logger=self._logger,
        )

    async def _get_proxy_info(self, request: Request, session: Session | None) -> ProxyInfo | None:
        """Retrieve a new ProxyInfo object based on crawler configuration and the current request and session."""
        if not self._proxy_configuration:
            return None

        return await self._proxy_configuration.new_proxy_info(
            session_id=session.id if session else None,
            request=request,
            proxy_tier=None,
        )

    async def get_request_manager(self) -> RequestManager:
        """Return the configured request manager. If none is configured, open and return the default request queue."""
        if not self._request_manager:
            self._request_manager = await RequestQueue.open(
                storage_client=self._service_locator.get_storage_client(),
                configuration=self._service_locator.get_configuration(),
            )

        return self._request_manager

    async def get_dataset(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
    ) -> Dataset:
        """Open the `Dataset` selected by ID, name or alias. With no selector given, the default one is returned."""
        locator = self._service_locator
        storage_client = locator.get_storage_client()
        configuration = locator.get_configuration()

        dataset = await Dataset.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=storage_client,
            configuration=configuration,
        )
        return dataset

    async def get_key_value_store(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
    ) -> KeyValueStore:
        """Open the `KeyValueStore` selected by ID, name or alias. With no selector given, the default KVS is returned."""
        locator = self._service_locator
        storage_client = locator.get_storage_client()
        configuration = locator.get_configuration()

        store = await KeyValueStore.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=storage_client,
            configuration=configuration,
        )
        return store

    def error_handler(
        self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
    ) -> ErrorHandler[TCrawlingContext]:
        """Register a function to handle errors occurring in request handlers.

        The error handler is invoked after a request handler error occurs and before a retry attempt.
        """
        self._error_handler = self._wrap_handler_with_error_context(handler)
        return handler

    def failed_request_handler(
        self, handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]
    ) -> FailedRequestHandler[TCrawlingContext]:
        """Register a function to handle requests that exceed the maximum retry limit.

        The failed request handler is invoked when a request has failed all retry attempts.
        """
        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
        return handler

    def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
        """Register a function to handle skipped requests.

        The skipped request handler is invoked when a request is skipped due to a collision or other reasons.
        """
        self._on_skipped_request = callback
        return callback

    async def run(
        self,
        requests: Sequence[str | Request] | None = None,
        *,
        purge_request_queue: bool = True,
    ) -> FinalStatistics:
        """Run the crawler until all requests are processed.

        Args:
            requests: The requests to be enqueued before the crawler starts.
            purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default
                request queue will be purged.
        """
        if self._running:
            raise RuntimeError(
                'This crawler instance is already running, you can add more requests to it via `crawler.add_requests()`'
            )

        self._running = True

        if self._has_finished_before:
            await self._statistics.reset()

            if self._use_session_pool:
                await self._session_pool.reset_store()

            request_manager = await self.get_request_manager()
            if purge_request_queue and isinstance(request_manager, RequestQueue):
                await request_manager.drop()
                self._request_manager = await RequestQueue.open(
                    storage_client=self._service_locator.get_storage_client(),
                    configuration=self._service_locator.get_configuration(),
                )

        if requests is not None:
            await self.add_requests(requests)

        interrupted = False

        def sigint_handler() -> None:
            nonlocal interrupted

            if not interrupted:
                interrupted = True
                self._logger.info('Pausing... Press CTRL+C again to force exit.')

            run_task.cancel()

        run_task = asyncio.create_task(self._run_crawler(), name='run_crawler_task')

        if threading.current_thread() is threading.main_thread():  # `add_signal_handler` works only in the main thread
            with suppress(NotImplementedError):  # event loop signal handlers are not supported on Windows
                asyncio.get_running_loop().add_signal_handler(signal.SIGINT, sigint_handler)

        try:
            await run_task
        except CancelledError:
            pass
        finally:
            if threading.current_thread() is threading.main_thread():
                with suppress(NotImplementedError):
                    asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)

        if self._statistics.error_tracker.total > 0:
            self._logger.info(
                'Error analysis:'
                f' total_errors={self._statistics.error_tracker.total}'
                f' unique_errors={self._statistics.error_tracker.unique_error_count}'
            )

        if interrupted:
            self._logger.info(
                f'The crawl was interrupted. To resume, do: CRAWLEE_PURGE_ON_START=0 python {sys.argv[0]}'
            )

        self._running = False
        self._has_finished_before = True

        await self._save_crawler_state()

        final_statistics = self._statistics.calculate()
        if self._statistics_log_format == 'table':
            self._logger.info(f'Final request statistics:\n{final_statistics.to_table()}')
        else:
            self._logger.info('Final request statistics:', extra=final_statistics.to_dict())
        return final_statistics

    async def _run_crawler(self) -> None:
        event_manager = self._service_locator.get_event_manager()

        # Collect the context managers to be entered. Context managers that are already active are excluded,
        # as they were likely entered by the caller, who will also be responsible for exiting them.
        contexts_to_enter = [
            cm
            for cm in (
                event_manager,
                self._snapshotter,
                self._statistics,
                self._session_pool if self._use_session_pool else None,
                self._http_client,
                self._crawler_state_rec_task,
                *self._additional_context_managers,
            )
            if cm and getattr(cm, 'active', False) is False
        ]

        async with AsyncExitStack() as exit_stack:
            for context in contexts_to_enter:
                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]

            await self._autoscaled_pool.run()

    async def add_requests(
        self,
        requests: Sequence[str | Request],
        *,
        forefront: bool = False,
        batch_size: int = 1000,
        wait_time_between_batches: timedelta = timedelta(0),
        wait_for_all_requests_to_be_added: bool = False,
        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,
    ) -> None:
        """Add requests to the underlying request manager in batches.

        Requests that are disallowed based on the robots.txt file are not added; they are passed to the skipped
        request handling instead.

        Args:
            requests: A list of requests to add to the queue.
            forefront: If True, add requests to the forefront of the queue.
            batch_size: The number of requests to add in one batch.
            wait_time_between_batches: Time to wait between adding batches.
            wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.
            wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.
        """
        allowed_requests = []
        skipped = []

        # Split the input into allowed and disallowed requests, keeping the original order in both groups.
        for request in requests:
            if isinstance(request, Request):
                check_url = request.url
            else:
                check_url = request

            is_allowed = await self._is_allowed_based_on_robots_txt_file(check_url)
            target = allowed_requests if is_allowed else skipped
            target.append(request)

        if skipped:
            skipped_tasks = []
            for skipped_request in skipped:
                skipped_tasks.append(asyncio.create_task(self._handle_skipped_request(skipped_request, 'robots_txt')))
            await asyncio.gather(*skipped_tasks)
            self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file')

        request_manager = await self.get_request_manager()

        await request_manager.add_requests(
            requests=allowed_requests,
            forefront=forefront,
            batch_size=batch_size,
            wait_time_between_batches=wait_time_between_batches,
            wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,
            wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout,
        )

    async def use_state(
        self,
        default_value: dict[str, JsonSerializable] | None = None,
    ) -> dict[str, JsonSerializable]:
        kvs = await self.get_key_value_store()
        return await kvs.get_auto_saved_value(f'{self._CRAWLEE_STATE_KEY}_{self._id}', default_value)

    async def _save_crawler_state(self) -> None:
        store = await self.get_key_value_store()
        await store.persist_autosaved_values()

    async def get_data(
        self,
        dataset_id: str | None = None,
        dataset_name: str | None = None,
        dataset_alias: str | None = None,
        **kwargs: Unpack[GetDataKwargs],
    ) -> DatasetItemsListPage:
        """Retrieve data from a `Dataset`.

        A convenience helper: the selected `Dataset` is opened and its data are fetched according to the given
        parameters.

        Args:
            dataset_id: The ID of the `Dataset`.
            dataset_name: The name of the `Dataset` (global scope, named storage).
            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
            kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.

        Returns:
            The retrieved data.
        """
        locator = self._service_locator
        storage_client = locator.get_storage_client()
        configuration = locator.get_configuration()

        dataset = await Dataset.open(
            id=dataset_id,
            name=dataset_name,
            alias=dataset_alias,
            storage_client=storage_client,
            configuration=configuration,
        )
        page = await dataset.get_data(**kwargs)
        return page

    async def export_data(
        self,
        path: str | Path,
        dataset_id: str | None = None,
        dataset_name: str | None = None,
        dataset_alias: str | None = None,
        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
    ) -> None:
        """Export all items from a Dataset to a JSON or CSV file.

        This method simplifies the process of exporting data collected during crawling. It automatically
        determines the export format based on the file extension (`.json` or `.csv`) and handles
        the conversion of `Dataset` items to the appropriate format.

        Args:
            path: The destination file path. Must end with '.json' or '.csv'.
            dataset_id: The ID of the Dataset to export from.
            dataset_name: The name of the Dataset to export from (global scope, named storage).
            dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
        """
        dataset = await Dataset.open(
            id=dataset_id,
            name=dataset_name,
            alias=dataset_alias,
            storage_client=self._service_locator.get_storage_client(),
            configuration=self._service_locator.get_configuration(),
        )

        path = Path(path)

        if path.suffix == '.csv':
            dst = StringIO()
            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
            await atomic_write(path, dst.getvalue())
        elif path.suffix == '.json':
            dst = StringIO()
            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
            await atomic_write(path, dst.getvalue())
        else:
            raise ValueError(f'Unsupported file extension: {path.suffix}')

    async def _push_data(
        self,
        data: list[dict[str, Any]] | dict[str, Any],
        dataset_id: str | None = None,
        dataset_name: str | None = None,
        dataset_alias: str | None = None,
        **kwargs: Unpack[PushDataKwargs],
    ) -> None:
        """Push data to a `Dataset`.

        This helper method simplifies the process of pushing data to a `Dataset`. It opens the specified
        one and then pushes the provided data to it.

        Args:
            data: The data to push to the `Dataset`.
            dataset_id: The ID of the `Dataset`.
            dataset_name: The name of the `Dataset` (global scope, named storage).
            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
            kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
        """
        dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
        await dataset.push_data(data, **kwargs)

    def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
        if context.request.no_retry:
            return False

        # Do not retry on client errors.
        if isinstance(error, HttpClientStatusCodeError):
            return False

        if isinstance(error, SessionError):
            return ((context.request.session_rotation_count or 0) + 1) < self._max_session_rotations

        max_request_retries = context.request.max_retries
        if max_request_retries is None:
            max_request_retries = self._max_request_retries

        return context.request.retry_count < max_request_retries

    async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGenerator[TCrawlingContext, None]:
        """Ensure that the `loaded_url` still matches the enqueue strategy after redirects.

        Filter out links that redirect outside of the crawled domain.
        """
        if context.request.loaded_url is not None and not self._check_enqueue_strategy(
            context.request.enqueue_strategy,
            origin_url=urlparse(context.request.url),
            target_url=urlparse(context.request.loaded_url),
        ):
            raise ContextPipelineInterruptedError(
                f'Skipping URL {context.request.loaded_url} (redirected from {context.request.url})'
            )

        yield context

    def _create_enqueue_links_function(
        self, context: BasicCrawlingContext, extract_links: ExtractLinksFunction
    ) -> EnqueueLinksFunction:
        """Create a callback function for extracting links from parsed content and enqueuing them to the crawl.

        Args:
            context: The current crawling context.
            extract_links: Function used to extract links from the page.

        Returns:
            Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl.
        """

        async def enqueue_links(
            *,
            selector: str | None = None,
            attribute: str | None = None,
            label: str | None = None,
            user_data: dict[str, Any] | None = None,
            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
            | None = None,
            requests: Sequence[str | Request] | None = None,
            rq_id: str | None = None,
            rq_name: str | None = None,
            rq_alias: str | None = None,
            **kwargs: Unpack[EnqueueLinksKwargs],
        ) -> None:
            kwargs.setdefault('strategy', 'same-hostname')

            if requests:
                if any((selector, attribute, label, user_data, transform_request_function)):
                    raise ValueError(
                        'You cannot provide `selector`, `attribute`, `label`, `user_data` or '
                        '`transform_request_function` arguments when `requests` is provided.'
                    )
                # Add directly passed requests.
                await context.add_requests(
                    requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
                )
            else:
                # Add requests from extracted links.
                await context.add_requests(
                    await extract_links(
                        selector=selector or 'a',
                        attribute=attribute or 'href',
                        label=label,
                        user_data=user_data,
                        transform_request_function=transform_request_function,
                        **kwargs,
                    ),
                    rq_id=rq_id,
                    rq_name=rq_name,
                    rq_alias=rq_alias,
                    **kwargs,
                )

        return enqueue_links

    def _enqueue_links_filter_iterator(
        self, request_iterator: Iterator[TRequestIterator], origin_url: str, **kwargs: Unpack[EnqueueLinksKwargs]
    ) -> Iterator[TRequestIterator]:
        """Filter requests based on the enqueue strategy and URL patterns."""
        limit = kwargs.get('limit')
        parsed_origin_url = urlparse(origin_url)
        strategy = kwargs.get('strategy', 'all')

        if strategy == 'all' and not parsed_origin_url.hostname:
            self.log.warning(f'Skipping enqueue: Missing hostname in origin_url = {origin_url}.')
            return

        # Emit a `warning` message to the log, only once per call
        warning_flag = True

        for request in request_iterator:
            if isinstance(request, Request):
                if request.enqueue_strategy != strategy:
                    request.enqueue_strategy = strategy
                target_url = request.url
            else:
                target_url = request
            parsed_target_url = urlparse(target_url)

            if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
                self.log.warning(f'Skipping enqueue url: Missing hostname in target_url = {target_url}.')
                warning_flag = False

            if self._check_enqueue_strategy(
                strategy, target_url=parsed_target_url, origin_url=parsed_origin_url
            ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
                yield request

                if limit is not None:
                    limit -= 1
                    if limit <= 0:
                        break

    def _check_enqueue_strategy(
        self,
        strategy: EnqueueStrategy,
        *,
        target_url: ParseResult,
        origin_url: ParseResult,
    ) -> bool:
        """Check if a URL matches the enqueue_strategy."""
        if strategy == 'all':
            return True

        if origin_url.hostname is None or target_url.hostname is None:
            self.log.debug(
                f'Skipping enqueue: Missing hostname in origin_url = {origin_url.geturl()} or '
                f'target_url = {target_url.geturl()}'
            )
            return False

        if strategy == 'same-hostname':
            return target_url.hostname == origin_url.hostname

        if strategy == 'same-domain':
            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
            return origin_domain == target_domain

        if strategy == 'same-origin':
            return (
                target_url.hostname == origin_url.hostname
                and target_url.scheme == origin_url.scheme
                and target_url.port == origin_url.port
            )

        assert_never(strategy)

    def _check_url_patterns(
        self,
        target_url: str,
        include: Sequence[re.Pattern[Any] | Glob] | None,
        exclude: Sequence[re.Pattern[Any] | Glob] | None,
    ) -> bool:
        """Check if a URL matches configured include/exclude patterns."""
        # If the URL matches any `exclude` pattern, reject it
        for pattern in exclude or ():
            if isinstance(pattern, Glob):
                pattern = pattern.regexp  # noqa: PLW2901

            if pattern.match(target_url) is not None:
                return False

        # If there are no `include` patterns and the URL passed all `exclude` patterns, accept the URL
        if include is None:
            return True

        # If the URL matches any `include` pattern, accept it
        for pattern in include:
            if isinstance(pattern, Glob):
                pattern = pattern.regexp  # noqa: PLW2901

            if pattern.match(target_url) is not None:
                return True

        # The URL does not match any `include` pattern - reject it
        return False

    async def _handle_request_retries(
        self,
        context: TCrawlingContext | BasicCrawlingContext,
        error: Exception,
    ) -> None:
        request_manager = await self.get_request_manager()
        request = context.request

        if self._abort_on_error:
            self._logger.exception('Aborting crawler run due to error (abort_on_error=True)', exc_info=error)
            self._failed = True

        if self._should_retry_request(context, error):
            request.retry_count += 1
            reduced_error = str(error).split('\n')[0]
            self.log.warning(
                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                f'{get_one_line_error_summary_if_possible(error)}'
            )
            await self._statistics.error_tracker.add(error=error, context=context)

            if self._error_handler:
                try:
                    new_request = await self._error_handler(context, error)
                except Exception as e:
                    raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                else:
                    if new_request is not None and new_request != request:
                        await request_manager.add_request(new_request)
                        await self._mark_request_as_handled(request)
                        return

            await request_manager.reclaim_request(request)
        else:
            request.state = RequestState.ERROR
            await self._mark_request_as_handled(request)
            await self._handle_failed_request(context, error)
            self._statistics.record_request_processing_failure(request.unique_key)

    async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
        try:
            context.request.state = RequestState.ERROR_HANDLER

            await wait_for(
                partial(self._handle_request_retries, context, error),
                timeout=self._internal_timeout,
                timeout_message='Handling request failure timed out after '
                f'{self._internal_timeout.total_seconds()} seconds',
                logger=self._logger,
            )
        except UserDefinedErrorHandlerError:
            context.request.state = RequestState.ERROR
            raise
        except Exception as secondary_error:
            self._logger.exception(
                'An exception occurred during handling of failed request. This places the crawler '
                'and its underlying storages into an unknown state and crawling will be terminated.',
                exc_info=secondary_error,
            )
            context.request.state = RequestState.ERROR
            raise

        if context.session:
            context.session.mark_bad()

    async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
        self._logger.error(
            f'Request to {context.request.url} failed and reached maximum retries\n '
            f'{self._get_message_from_error(error)}'
        )
        await self._statistics.error_tracker.add(error=error, context=context)

        if self._failed_request_handler:
            try:
                await self._failed_request_handler(context, error)
            except Exception as e:
                raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e

    async def _handle_skipped_request(
        self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
    ) -> None:
        if need_mark and isinstance(request, Request):
            request.state = RequestState.SKIPPED
            await self._mark_request_as_handled(request)

        url = request.url if isinstance(request, Request) else request

        if self._on_skipped_request:
            try:
                await self._on_skipped_request(url, reason)
            except Exception as e:
                raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e

    def _get_message_from_error(self, error: Exception) -> str:
        """Get error message summary from exception.

        Custom processing to reduce the irrelevant traceback clutter in some cases.
        """
        traceback_parts = traceback.format_exception(type(error), value=error, tb=error.__traceback__, chain=True)
        used_traceback_parts = traceback_parts

        if (
            isinstance(error, asyncio.exceptions.TimeoutError)
            and traceback_parts
            and self._request_handler_timeout_text in traceback_parts[-1]
        ) or isinstance(error, UserHandlerTimeoutError):
            used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
            used_traceback_parts.extend(traceback_parts[-1:])

        return ''.join(used_traceback_parts).strip('\n')

    def _get_only_inner_most_exception(self, error: BaseException) -> BaseException:
        """Get innermost exception by following __cause__ and __context__ attributes of exception."""
        if error.__cause__:
            return self._get_only_inner_most_exception(error.__cause__)
        if error.__context__:
            return self._get_only_inner_most_exception(error.__context__)
        # No __cause__ and no __context__, this is as deep as it can get.
        return error

    def _prepare_send_request_function(
        self,
        session: Session | None,
        proxy_info: ProxyInfo | None,
    ) -> SendRequestFunction:
        async def send_request(
            url: str,
            *,
            method: HttpMethod = 'GET',
            payload: HttpPayload | None = None,
            headers: HttpHeaders | dict[str, str] | None = None,
        ) -> HttpResponse:
            return await self._http_client.send_request(
                url=url,
                method=method,
                payload=payload,
                headers=headers,
                session=session,
                proxy_info=proxy_info,
            )

        return send_request

    def _convert_url_to_request_iterator(self, urls: Sequence[str | Request], base_url: str) -> Iterator[Request]:
        """Convert a sequence of URLs or Request objects to an iterator of Request objects."""
        for url in urls:
            # If the request is a Request object, keep it as it is
            if isinstance(url, Request):
                yield url
            # If the request is a string, convert it to Request object with absolute_url.
            elif isinstance(url, str) and not is_url_absolute(url):
                absolute_url = convert_to_absolute_url(base_url, url)
                yield Request.from_url(absolute_url)
            else:
                yield Request.from_url(url)

    async def _add_requests(
        self,
        context: BasicCrawlingContext,
        requests: Sequence[str | Request],
        rq_id: str | None = None,
        rq_name: str | None = None,
        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> None:
        """Add requests method aware of the crawling context."""
        if rq_id or rq_name or rq_alias:
            request_manager: RequestManager = await RequestQueue.open(
                id=rq_id,
                name=rq_name,
                alias=rq_alias,
                storage_client=self._service_locator.get_storage_client(),
                configuration=self._service_locator.get_configuration(),
            )
        else:
            request_manager = await self.get_request_manager()

        context_aware_requests = list[Request]()
        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
        for dst_request in filter_requests_iterator:
            # Update the crawl depth of the request.
            dst_request.crawl_depth = context.request.crawl_depth + 1

            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
                context_aware_requests.append(dst_request)

        return await request_manager.add_requests(context_aware_requests)

    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
        result = self._context_result_map[context]

        for add_requests_call in result.add_requests_calls:
            await self._add_requests(context, **add_requests_call)

        for push_data_call in result.push_data_calls:
            await self._push_data(**push_data_call)

        await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)

        result.apply_request_changes(target=context.request)

    @staticmethod
    async def _commit_key_value_store_changes(
        result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
    ) -> None:
        """Store key value store changes recorded in result."""
        for (id, name, alias), changes in result.key_value_store_changes.items():
            store = await get_kvs(id=id, name=name, alias=alias)
            for key, value in changes.updates.items():
                await store.set_value(key, value.content, value.content_type)

    async def __is_finished_function(self) -> bool:
        self._stop_if_max_requests_count_exceeded()
        if self._unexpected_stop:
            self._logger.info('The crawler will finish any remaining ongoing requests and shut down.')
            return True

        if self._abort_on_error and self._failed:
            self._failed = False
            return True

        if self._keep_alive:
            return False

        request_manager = await self.get_request_manager()
        return await request_manager.is_finished()

    async def __is_task_ready_function(self) -> bool:
        self._stop_if_max_requests_count_exceeded()
        if self._unexpected_stop:
            self._logger.info(
                'No new requests are allowed because crawler `stop` method was called. '
                'Ongoing requests will be allowed to complete.'
            )
            return False

        request_manager = await self.get_request_manager()
        return not await request_manager.is_empty()

    async def __run_task_function(self) -> None:
        """Fetch the next request from the request manager and process it.

        The request is skipped when robots.txt disallows it. Otherwise a session and proxy are obtained,
        a `BasicCrawlingContext` is built around a `RequestHandlerRunResult`, and the request handler is
        run. On success the recorded result is committed and the request is marked as handled. Failures
        are routed to the matching `except` branch below; unexpected exceptions are logged and re-raised.
        """
        request_manager = await self.get_request_manager()

        # Fetching is guarded by the internal timeout (`max_retries=3` is passed to `wait_for`).
        request = await wait_for(
            request_manager.fetch_next_request,
            timeout=self._internal_timeout,
            timeout_message=f'Fetching next request failed after {self._internal_timeout.total_seconds()} seconds',
            logger=self._logger,
            max_retries=3,
        )

        # Nothing was fetched, so there is nothing to process.
        if request is None:
            return

        if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
            self._logger.warning(
                f'Skipping request {request.url} ({request.unique_key}) because it is disallowed based on robots.txt'
            )

            # `need_mark=True` also sets the request state to SKIPPED and marks the request as handled.
            await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
            return

        # A request bound to a specific session asks for that session; otherwise any session is used.
        if request.session_id:
            session = await self._get_session_by_id(request.session_id)
        else:
            session = await self._get_session()
        proxy_info = await self._get_proxy_info(request, session)
        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)

        # `add_requests`, `push_data` and `get_key_value_store` come from `result`; what it records is
        # applied later by `_commit_request_handler_result`.
        context = BasicCrawlingContext(
            request=result.request,
            session=session,
            proxy_info=proxy_info,
            send_request=self._prepare_send_request_function(session, proxy_info),
            add_requests=result.add_requests,
            push_data=result.push_data,
            get_key_value_store=result.get_key_value_store,
            use_state=self.use_state,
            log=self._logger,
        )
        self._context_result_map[context] = result

        self._statistics.record_request_processing_start(request.unique_key)

        try:
            request.state = RequestState.REQUEST_HANDLER

            try:
                with swapped_context(context, request):
                    self._check_request_collision(request, session)
                    await self._run_request_handler(context=context)
            except asyncio.TimeoutError as e:
                # Timeouts are wrapped so that they are handled by the `RequestHandlerError` branch below.
                raise RequestHandlerError(e, context) from e

            await self._commit_request_handler_result(context)

            request.state = RequestState.DONE

            await self._mark_request_as_handled(request)

            if session and session.is_usable:
                session.mark_good()

            self._statistics.record_request_processing_finish(request.unique_key)

        except RequestCollisionError as request_error:
            # The session bound to the request is unavailable; disable retries before error handling.
            request.no_retry = True
            await self._handle_request_error(context, request_error)

        except RequestHandlerError as primary_error:
            primary_error = cast(
                'RequestHandlerError[TCrawlingContext]', primary_error
            )  # valid thanks to ContextPipeline

            self._logger.debug(
                'An exception occurred in the user-defined request handler',
                exc_info=primary_error.wrapped_exception,
            )
            # Error handling receives the context and the original exception carried by the wrapper.
            await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)

        except SessionError as session_error:
            if not session:
                raise RuntimeError('SessionError raised in a crawling context without a session') from session_error

            if self._error_handler:
                await self._error_handler(context, session_error)

            if self._should_retry_request(context, session_error):
                exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
                self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)

                # NOTE(review): `session` is always truthy here because of the guard at the top of this branch.
                if session:
                    session.retire()

                # Increment session rotation count.
                request.session_rotation_count = (request.session_rotation_count or 0) + 1

                await request_manager.reclaim_request(request)
                await self._statistics.error_tracker_retry.add(error=session_error, context=context)
            else:
                # No retry allowed: finalize the request as failed.
                await self._mark_request_as_handled(request)

                await self._handle_failed_request(context, session_error)
                self._statistics.record_request_processing_failure(request.unique_key)

        except ContextPipelineInterruptedError as interrupted_error:
            # An interrupted pipeline only marks the request as handled; no error handling is invoked.
            self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

            await self._mark_request_as_handled(request)

        except ContextPipelineInitializationError as initialization_error:
            self._logger.debug(
                'An exception occurred during the initialization of crawling context',
                exc_info=initialization_error,
            )
            await self._handle_request_error(context, initialization_error.wrapped_exception)

        except Exception as internal_error:
            # Anything else is unexpected: log it and let it propagate to the caller.
            self._logger.exception(
                'An exception occurred during handling of a request. This places the crawler '
                'and its underlying storages into an unknown state and crawling will be terminated.',
                exc_info=internal_error,
            )
            raise

    async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
        context.request.state = RequestState.BEFORE_NAV
        await self._context_pipeline(
            context,
            lambda final_context: wait_for(
                lambda: self.router(final_context),
                timeout=self._request_handler_timeout,
                timeout_message=f'{self._request_handler_timeout_text}'
                f' {self._request_handler_timeout.total_seconds()} seconds',
                logger=self._logger,
            ),
        )

    def _raise_for_error_status_code(self, status_code: int) -> None:
        """Raise an exception if the given status code is considered an error.

        Args:
            status_code: The HTTP status code to check.

        Raises:
            HttpStatusCodeError: If the status code represents a server error or is explicitly configured as an error.
            HttpClientStatusCodeError: If the status code represents a client error.
        """
        is_ignored_status = status_code in self._ignore_http_error_status_codes
        is_explicit_error = status_code in self._additional_http_error_status_codes

        if is_explicit_error:
            raise HttpStatusCodeError('Error status code (user-configured) returned.', status_code)

        if is_status_code_client_error(status_code) and not is_ignored_status:
            raise HttpClientStatusCodeError('Client error status code returned', status_code)

        if is_status_code_server_error(status_code) and not is_ignored_status:
            raise HttpStatusCodeError('Error status code returned', status_code)

    def _raise_for_session_blocked_status_code(self, session: Session | None, status_code: int) -> None:
        """Raise an exception if the given status code indicates the session is blocked.

        Args:
            session: The session used for the request. If None, no check is performed.
            status_code: The HTTP status code to check.

        Raises:
            SessionError: If the status code indicates the session is blocked.
        """
        if session is not None and session.is_blocked_status_code(
            status_code=status_code,
            ignore_http_error_status_codes=self._ignore_http_error_status_codes,
        ):
            raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')

    def _check_request_collision(self, request: Request, session: Session | None) -> None:
        """Raise an exception if a request cannot access required resources.

        Args:
            request: The `Request` that might require specific resources (like a session).
            session: The `Session` that was retrieved for the request, or `None` if not available.

        Raises:
            RequestCollisionError: If the `Session` referenced by the `Request` is not available.
        """
        if self._use_session_pool and request.session_id and not session:
            raise RequestCollisionError(
                f'The Session (id: {request.session_id}) bound to the Request is no longer available in SessionPool'
            )

    async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool:
        """Check if the URL is allowed based on the robots.txt file.

        Args:
            url: The URL to verify against robots.txt rules. Returns True if crawling this URL is permitted.
        """
        if not self._respect_robots_txt_file:
            return True
        robots_txt_file = await self._get_robots_txt_file_for_url(url)
        return not robots_txt_file or robots_txt_file.is_allowed(url)

    async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None:
        """Get the RobotsTxtFile for a given URL.

        Args:
            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
        """
        if not self._respect_robots_txt_file:
            return None
        origin_url = str(URL(url).origin())
        robots_txt_file = self._robots_txt_file_cache.get(origin_url)
        if robots_txt_file:
            return robots_txt_file

        async with self._robots_txt_lock:
            # Check again if the robots.txt file is already cached after acquiring the lock
            robots_txt_file = self._robots_txt_file_cache.get(origin_url)
            if robots_txt_file:
                return robots_txt_file

            # If not cached, fetch the robots.txt file
            robots_txt_file = await self._find_txt_file_for_url(url)
            self._robots_txt_file_cache[origin_url] = robots_txt_file
            return robots_txt_file

    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
        """Find the robots.txt file for a given URL.

        Args:
            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
        """
        return await RobotsTxtFile.find(url, self._http_client)

    def _log_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None:
        """Log a status message for the crawler.

        Args:
            message: The status message to log.
            level: The logging level for the message.
        """
        log_level = string_to_log_level(level)
        self.log.log(log_level, message)

    async def _crawler_state_task(self) -> None:
        """Emit a persist state event with the given migration status."""
        event_manager = self._service_locator.get_event_manager()

        current_state = self.statistics.state

        if (
            failed_requests := (
                current_state.requests_failed - (self._previous_crawler_state or current_state).requests_failed
            )
            > 0
        ):
            message = f'Experiencing problems, {failed_requests} failed requests since last status update.'
        else:
            request_manager = await self.get_request_manager()
            total_count = await request_manager.get_total_count()
            if total_count is not None and total_count > 0:
                pages_info = f'{self._statistics.state.requests_finished}/{total_count}'
            else:
                pages_info = str(self._statistics.state.requests_finished)

            message = (
                f'Crawled {pages_info} pages, {self._statistics.state.requests_failed} failed requests, '
                f'desired concurrency {self._autoscaled_pool.desired_concurrency}.'
            )

        if self._status_message_callback:
            new_message = await self._status_message_callback(current_state, self._previous_crawler_state, message)
            if new_message:
                message = new_message
                self._log_status_message(message, level='INFO')
        else:
            self._log_status_message(message, level='INFO')

        event_manager.emit(
            event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message, crawler_id=id(self))
        )

        self._previous_crawler_state = current_state

    async def _mark_request_as_handled(self, request: Request) -> None:
        request_manager = await self.get_request_manager()
        await wait_for(
            lambda: request_manager.mark_request_as_handled(request),
            timeout=self._internal_timeout,
            timeout_message='Marking request as handled timed out after '
            f'{self._internal_timeout.total_seconds()} seconds',
            logger=self._logger,
            max_retries=3,
        )


================================================
FILE: src/crawlee/crawlers/_basic/_basic_crawling_context.py
================================================
from __future__ import annotations

# Re-export only; the class itself lives in `crawlee._types` to avoid circular imports.
from crawlee._types import BasicCrawlingContext  # noqa: F401


================================================
FILE: src/crawlee/crawlers/_basic/_context_pipeline.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Generic, cast

from typing_extensions import TypeVar

from crawlee._types import BasicCrawlingContext
from crawlee._utils.docs import docs_group
from crawlee.errors import (
    ContextPipelineFinalizationError,
    ContextPipelineInitializationError,
    ContextPipelineInterruptedError,
    RequestHandlerError,
    SessionError,
)

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, Awaitable, Callable, Generator

# Context type a pipeline hands to its final consumer (and a middleware receives as input).
# Falls back to `BasicCrawlingContext` when not parametrized explicitly.
TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
# Context type yielded by a middleware, i.e. the (optionally) extended context passed further down the chain.
TMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext)


class _Middleware(Generic[TMiddlewareCrawlingContext, TCrawlingContext]):
    """Wrap one middleware invocation so that its two phases are exposed as separate methods.

    Having `action` and `cleanup` as distinct methods makes the middleware easily observable by open telemetry
    instrumentation.
    """

    def __init__(
        self,
        middleware: Callable[
            [TCrawlingContext],
            AsyncGenerator[TMiddlewareCrawlingContext, Exception | None],
        ],
        input_context: TCrawlingContext,
    ) -> None:
        """Create the middleware's generator for `input_context`; nothing is awaited here."""
        self.generator = middleware(input_context)
        self.input_context = input_context
        self.output_context: TMiddlewareCrawlingContext | None = None

    async def action(self) -> TMiddlewareCrawlingContext:
        """Advance the generator to its first yield and return (and remember) the yielded context."""
        produced_context = await self.generator.__anext__()
        self.output_context = produced_context
        return produced_context

    async def cleanup(self, final_consumer_exception: Exception | None) -> None:
        """Resume the generator after its yield, sending it the consumer's exception (or `None`).

        Raises:
            RuntimeError: If the generator yields a second time, or if it raises
                `ContextPipelineInterruptedError` during this step.
            ContextPipelineFinalizationError: If the generator raises any other exception.
        """
        try:
            await self.generator.asend(final_consumer_exception)
        except StopAsyncIteration:
            # The generator ran to completion after its single yield - the expected outcome.
            return
        except ContextPipelineInterruptedError as interruption:
            raise RuntimeError('Invalid state - pipeline interrupted in the finalization step') from interruption
        except Exception as cleanup_error:
            failed_context = self.output_context or self.input_context
            raise ContextPipelineFinalizationError(cleanup_error, failed_context) from cleanup_error

        # `asend` returned normally, which means the generator produced another value.
        raise RuntimeError('The middleware yielded more than once')


@docs_group('Other')
class ContextPipeline(Generic[TCrawlingContext]):
    """Encapsulates the logic of gradually enhancing the crawling context with additional information and utilities.

    The enhancement is done by a chain of middlewares that are added to the pipeline after its creation.

    Each pipeline object holds at most one middleware and a reference to its parent pipeline; `compose` returns
    a new pipeline linked to the current one instead of modifying it.
    """

    def __init__(
        self,
        *,
        _middleware: Callable[
            [TCrawlingContext],
            AsyncGenerator[TMiddlewareCrawlingContext, Exception | None],
        ]
        | None = None,
        _parent: ContextPipeline[BasicCrawlingContext] | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            _middleware: The middleware held by this link of the chain, or `None` for a link without one.
            _parent: The pipeline this one extends, or `None` if this is the first link.
        """
        self._middleware = _middleware
        self._parent = _parent

    def _middleware_chain(self) -> Generator[ContextPipeline[Any], None, None]:
        """Yield this pipeline followed by all of its ancestors (most recently composed link first)."""
        yield self

        if self._parent is not None:
            yield from self._parent._middleware_chain()  # noqa: SLF001

    async def __call__(
        self,
        crawling_context: BasicCrawlingContext,
        final_context_consumer: Callable[[TCrawlingContext], Awaitable[None]],
    ) -> None:
        """Run a crawling context through the middleware chain and pipe it into a consumer function.

        Exceptions from the consumer function are wrapped together with the final crawling context.

        Args:
            crawling_context: The initial context passed to the first middleware.
            final_context_consumer: Function awaited with the context produced by the last middleware.

        Raises:
            SessionError: Re-raised unchanged, whether it comes from a middleware or from the consumer.
            ContextPipelineInterruptedError: Re-raised unchanged when raised by a middleware before its yield.
            ContextPipelineInitializationError: When a middleware raises any other exception before its yield.
            RequestHandlerError: When the consumer raises any exception other than `SessionError`.
            RuntimeError: When a middleware finishes without yielding.
        """
        chain = list(self._middleware_chain())
        # Middlewares whose initialization succeeded; only these are finalized.
        cleanup_stack: list[_Middleware[Any]] = []
        # Forwarded to every middleware's cleanup step; stays `None` unless the consumer raised an `Exception`.
        final_consumer_exception: Exception | None = None

        try:
            # The chain is listed newest-first, so it is reversed to run the oldest middleware first.
            for member in reversed(chain):
                if member._middleware:  # noqa: SLF001
                    middleware_instance = _Middleware(middleware=member._middleware, input_context=crawling_context)  # noqa: SLF001
                    try:
                        result = await middleware_instance.action()
                    except SessionError:  # Session errors get special treatment
                        raise
                    except StopAsyncIteration as e:
                        raise RuntimeError('The middleware did not yield') from e
                    except ContextPipelineInterruptedError:
                        raise
                    except Exception as e:
                        raise ContextPipelineInitializationError(e, crawling_context) from e

                    # The output of this middleware becomes the input of the next one.
                    crawling_context = result
                    cleanup_stack.append(middleware_instance)

            try:
                await final_context_consumer(cast('TCrawlingContext', crawling_context))
            except SessionError as e:  # Session errors get special treatment
                final_consumer_exception = e
                raise
            except Exception as e:
                final_consumer_exception = e
                raise RequestHandlerError(e, crawling_context) from e
        finally:
            # Finalize in the opposite order of initialization. An exception raised by one cleanup propagates
            # immediately, so the cleanups of the remaining (earlier) middlewares are not run in that case.
            for middleware_instance in reversed(cleanup_stack):
                await middleware_instance.cleanup(final_consumer_exception)

    def compose(
        self,
        middleware: Callable[
            [TCrawlingContext],
            AsyncGenerator[TMiddlewareCrawlingContext, None],
        ],
    ) -> ContextPipeline[TMiddlewareCrawlingContext]:
        """Add a middleware to the pipeline.

        The middleware should yield exactly once, and it should yield an (optionally) extended crawling context object.
        The part before the yield can be used for initialization and the part after it for cleanup.

        Args:
            middleware: Async generator function taking the current context and yielding the extended one.

        Returns:
            The extended pipeline instance, providing a fluent interface
        """
        # The casts only adjust static types; a new pipeline object is created with `self` as its parent.
        return ContextPipeline[TMiddlewareCrawlingContext](
            _middleware=cast(
                'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None]]',
                middleware,
            ),
            _parent=cast('ContextPipeline[BasicCrawlingContext]', self),
        )


================================================
FILE: src/crawlee/crawlers/_basic/_context_utils.py
================================================
from __future__ import annotations

from contextlib import contextmanager
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Iterator

    from crawlee._request import Request

    from ._basic_crawling_context import BasicCrawlingContext


@contextmanager
def swapped_context(
    context: BasicCrawlingContext,
    request: Request,
) -> Iterator[None]:
    """Run the managed block and afterwards force `context.request` back to `request`.

    The restoration happens even when the managed block raises, so a handler that worked with a different
    request object on the context cannot leak it to whatever uses the context next.

    Args:
        context: The crawling context whose `request` attribute is restored on exit.
        request: The original request to put back on the context.
    """
    original_request = request
    try:
        yield
    finally:
        # `object.__setattr__` is used instead of plain assignment - presumably because the context class
        # forbids regular attribute assignment (e.g. a frozen dataclass); confirm against `BasicCrawlingContext`.
        object.__setattr__(context, 'request', original_request)


================================================
FILE: src/crawlee/crawlers/_basic/_logging_utils.py
================================================
import asyncio
import re
import traceback

import crawlee.errors


def _get_only_innermost_exception(error: BaseException) -> BaseException:
    """Walk the exception chain down to its innermost member.

    At every step the explicit `__cause__` is preferred over the implicit `__context__`. The walk stops early
    when it reaches an exception whose exact type is `UserHandlerTimeoutError`: in that case the exception
    directly linked to it is returned (or the timeout error itself when nothing is linked).
    """
    current = error
    while True:
        linked = current.__cause__ or current.__context__

        if type(current) is crawlee.errors.UserHandlerTimeoutError:
            # Do not descend any further; report what triggered the timeout error if that is known.
            return linked or current

        if not linked:
            # Neither `__cause__` nor `__context__` is set, this is as deep as the chain goes.
            return current

        current = linked


def _get_filtered_traceback_parts_for_asyncio_timeout_error(traceback_parts: list[str]) -> list[str]:
    """Extract only the most relevant traceback parts from stack trace."""
    ignore_pattern = (
        r'([\\/]{1}asyncio[\\/]{1})|'  # internal asyncio parts
        r'(Traceback \(most recent call last\))|'  # common part of the stack trace formatting
        r'(asyncio\.exceptions\.CancelledError)'  # internal asyncio exception
    )
    return [
        _strip_pep657_highlighting(traceback_part)
        for traceback_part in traceback_parts
        if not re.findall(ignore_pattern, traceback_part)
    ]


def _strip_pep657_highlighting(traceback_part: str) -> str:
    """Remove PEP 657 highlighting from the traceback."""
    highlight_pattern = r'(\n\s*~*\^+~*\n)$'
    return re.sub(highlight_pattern, '\n', traceback_part)


def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
) -> list[str]:
    """Format the innermost exception behind `timeout_error` and keep only the relevant traceback parts.

    Returns:
        Traceback parts of the innermost exception with asyncio-internal parts removed.
    """
    return _get_filtered_traceback_parts_for_asyncio_timeout_error(
        _get_traceback_parts_for_innermost_exception(timeout_error)
    )


def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
    """Format the traceback of the innermost exception in the chain of `error`.

    Chained exceptions are not included in the output (`chain=False`), only the innermost one is formatted.
    """
    root = _get_only_innermost_exception(error)
    formatted_parts = traceback.format_exception(type(root), value=root, tb=root.__traceback__, chain=False)
    return formatted_parts


def get_one_line_error_summary_if_possible(error: Exception) -> str:
    """Condense `error` into a single line pointing to the most relevant traceback part.

    Args:
        error: The exception to summarize.

    Returns:
        The selected traceback part with surrounding newlines/spaces stripped and inner newlines replaced by
        `', '`. An empty string is returned when no suitable part is found, and always for Playwright errors.
    """
    if isinstance(error, asyncio.exceptions.TimeoutError):
        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
        most_relevant_part = (',' + relevant_part[-1]) if relevant_part else ''
    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
        # code and third line the topmost user error
        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
        relevant_index_from_start = 3
        most_relevant_part = (
            traceback_parts[relevant_index_from_start - 1] if len(traceback_parts) >= relevant_index_from_start else ''
        )
    elif 'playwright._impl._errors.Error' in str(error.__class__):
        # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
        # point to deep internals.
        return ''
    else:
        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
        # Commonly last traceback part is type of the error, and the second last part is the relevant file.
        # If there are not enough traceback parts, then we are not sure how to summarize the error.
        relevant_traceback_part_index_from_end = 2
        # Reuse the parts computed above instead of walking and formatting the exception chain a second time.
        most_relevant_part = _strip_pep657_highlighting(
            traceback_parts[-relevant_traceback_part_index_from_end]
            if len(traceback_parts) >= relevant_traceback_part_index_from_end
            else ''
        )

    return most_relevant_part.strip('\n ').replace('\n', ', ')


================================================
FILE: src/crawlee/crawlers/_basic/py.typed
================================================


================================================
FILE: src/crawlee/crawlers/_beautifulsoup/__init__.py
================================================
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

_install_import_hook(__name__)

# The following imports are wrapped in try_import to handle optional dependencies,
# ensuring the module can still function even if these dependencies are missing.
with _try_import(__name__, 'BeautifulSoupCrawler'):
    from ._beautifulsoup_crawler import BeautifulSoupCrawler
with _try_import(__name__, 'BeautifulSoupCrawlingContext'):
    from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
with _try_import(__name__, 'BeautifulSoupParserType'):
    from ._beautifulsoup_parser import BeautifulSoupParserType

__all__ = [
    'BeautifulSoupCrawler',
    'BeautifulSoupCrawlingContext',
    'BeautifulSoupParserType',
]


================================================
FILE: src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from bs4 import BeautifulSoup, Tag

from crawlee._utils.docs import docs_group
from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from typing_extensions import Unpack

    from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext


@docs_group('Crawlers')
class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup, Tag]):
    """A web crawler that performs HTTP requests and parses the HTML/XML responses with BeautifulSoup.

    The `BeautifulSoupCrawler` is built on top of the `AbstractHttpCrawler` and therefore inherits all of its
    features. It plugs in its own parser, `BeautifulSoupParser`, which is used to parse the `HttpResponse`.
    `BeautifulSoupParser` relies on the following library for parsing: https://pypi.org/project/beautifulsoup4/

    HTTP client-based crawlers are a good fit for websites that do not require JavaScript execution. If you need
    to execute client-side JavaScript, consider using a browser-based crawler such as the `PlaywrightCrawler`.

    ### Usage

    ```python
    from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

    crawler = BeautifulSoupCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    await crawler.run(['https://crawlee.dev/'])
    ```
    """

    def __init__(
        self,
        *,
        parser: BeautifulSoupParserType = 'lxml',
        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
    ) -> None:
        """Initialize a new instance.

        Args:
            parser: The type of parser that should be used by `BeautifulSoup`.
            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.
        """

        async def final_step(
            context: ParsedHttpCrawlingContext[BeautifulSoup],
        ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
            """Convert the parsed context into a `BeautifulSoupCrawlingContext`, which adds the `soup` property."""
            soup_context = BeautifulSoupCrawlingContext.from_parsed_http_crawling_context(context)
            yield soup_context

        # Extend the static-content pipeline with the conversion step as its last middleware.
        static_pipeline = self._create_static_content_crawler_pipeline()
        kwargs['_context_pipeline'] = static_pipeline.compose(final_step)

        soup_parser = BeautifulSoupParser(parser=parser)
        super().__init__(parser=soup_parser, **kwargs)


================================================
FILE: src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py
================================================
from dataclasses import dataclass, fields

from bs4 import BeautifulSoup
from typing_extensions import Self

from crawlee._utils.docs import docs_group
from crawlee.crawlers import ParsedHttpCrawlingContext

from ._utils import html_to_text


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup]):
    """The crawling context used by the `BeautifulSoupCrawler`.

    It provides access to key objects as well as utility functions for handling crawling tasks.
    """

    @property
    def soup(self) -> BeautifulSoup:
        """Convenience alias for `parsed_content`."""
        return self.parsed_content

    @classmethod
    def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self:
        """Build a new instance by copying every dataclass field of an existing `ParsedHttpCrawlingContext`."""
        field_values = {}
        for context_field in fields(context):
            field_values[context_field.name] = getattr(context, context_field.name)
        return cls(**field_values)

    def html_to_text(self) -> str:
        """Convert the parsed HTML content to newline-separated plain text without tags."""
        plain_text = html_to_text(self.parsed_content)
        return plain_text


================================================
FILE: src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING, Literal

from bs4 import BeautifulSoup, Tag
from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import AbstractHttpParser

if TYPE_CHECKING:
    from collections.abc import Iterable, Sequence

    from crawlee.http_clients import HttpResponse


@docs_group('HTTP parsers')
class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup, Tag]):
    """Parser for parsing HTTP response using `BeautifulSoup`."""

    def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:
        """Remember which parser backend is handed to `BeautifulSoup` as its `features` argument."""
        self._parser = parser

    @override
    async def parse(self, response: HttpResponse) -> BeautifulSoup:
        body = await response.read()
        return BeautifulSoup(body, features=self._parser)

    @override
    async def parse_text(self, text: str) -> BeautifulSoup:
        soup = BeautifulSoup(text, features=self._parser)
        return soup

    @override
    def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool:
        first_match = parsed_content.select_one(selector)
        return first_match is not None

    @override
    async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]:
        return tuple(parsed_content.select(selector))

    @override
    def find_links(self, parsed_content: Tag, selector: str, attribute: str) -> Iterable[str]:
        # Elements lacking the attribute, or carrying an empty value, contribute nothing.
        return [
            raw_url.strip()
            for element in parsed_content.select(selector)
            if (raw_url := element.attrs.get(attribute))
        ]


# Parser names accepted by `BeautifulSoupParser`; the chosen one is passed to `BeautifulSoup` as `features`.
BeautifulSoupParserType = Literal['html.parser', 'lxml', 'xml', 'html5lib']


================================================
FILE: src/crawlee/crawlers/_beautifulsoup/_utils.py
================================================
from __future__ import annotations

import re
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup, NavigableString, PageElement, Tag

from crawlee._utils.html_to_text import (
    _ANY_CONSECUTIVE_WHITE_SPACES,
    _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE,
    _EMPTY_OR_ENDS_WITH_NEW_LINE,
    BLOCK_TAGS,
    SKIP_TAGS,
)

if TYPE_CHECKING:
    from collections.abc import Iterable


def html_to_text(source: str | Tag) -> str:
    """Convert markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.

    Args:
        source: Input markup string or `BeautifulSoup` object. A string is parsed with the `lxml` parser.

    Returns:
        Newline separated plain text without tags, stripped of leading and trailing whitespace.

    Raises:
        TypeError: If `source` is neither a string nor a `BeautifulSoup` instance.
    """
    # NOTE(review): the annotation allows any `Tag`, but the check below only accepts `BeautifulSoup` instances,
    # so a plain `Tag` ends up in the `TypeError` branch - confirm which of the two is intended.
    if isinstance(source, str):
        soup = BeautifulSoup(source, features='lxml')
    elif isinstance(source, BeautifulSoup):
        soup = source
    else:
        raise TypeError('Source must be either a string or a `BeautifulSoup` object.')

    # Accumulator shared with the nested helper; the helper inspects its current tail to decide on whitespace.
    text = ''

    def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:
        """Extract and process text content from a collection of HTML elements.

        Convert page elements into plain text while preserving structure. Handle whitespace compression,
        skip unwanted elements, and format block elements correctly.
        """
        nonlocal text
        for page_element in page_elements:
            # Page elements that are neither tags nor strings are ignored.
            if isinstance(page_element, (Tag, NavigableString)):
                if isinstance(page_element, NavigableString):
                    compr: str
                    if isinstance(page_element.parent, Tag) and page_element.parent.name.lower() == 'pre':
                        # Text directly inside `<pre>` keeps its whitespace untouched.
                        compr = page_element.get_text()
                    else:
                        # Compress white spaces outside of pre block
                        compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', page_element.get_text())
                    # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
                    if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text):
                        compr = compr[1:]
                    text += compr
                elif page_element.name.lower() in SKIP_TAGS:
                    # Skip comments and special elements
                    pass
                elif page_element.name.lower() == 'br':
                    text += '\n'
                elif page_element.name.lower() == 'td':
                    # Table cells are rendered as their content followed by a tab.
                    _page_element_to_text(page_element.children)
                    text += '\t'
                else:
                    # Block elements must be surrounded by newlines(unless beginning of text)
                    is_block_tag = page_element.name.lower() in BLOCK_TAGS
                    if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):
                        text += '\n'
                    _page_element_to_text(page_element.children)
                    if is_block_tag and not text.endswith('\n'):
                        text += '\n'

    _page_element_to_text(soup.children)

    return text.strip()


================================================
FILE: src/crawlee/crawlers/_beautifulsoup/py.typed
================================================


================================================
FILE: src/crawlee/crawlers/_http/__init__.py
================================================
from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext
from crawlee.http_clients import HttpCrawlingResult

from ._http_crawler import HttpCrawler

__all__ = [
    'HttpCrawler',
    'HttpCrawlingContext',
    'HttpCrawlingResult',
]


================================================
FILE: src/crawlee/crawlers/_http/_http_crawler.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import AbstractHttpCrawler, ParsedHttpCrawlingContext

from ._http_parser import NoParser

if TYPE_CHECKING:
    from typing_extensions import Unpack

    from crawlee.crawlers import BasicCrawlerOptions


@docs_group('Crawlers')
class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes, bytes]):
    """Specific version of generic `AbstractHttpCrawler`.

    It uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you
    are doing. In most cases an HTML parser is more beneficial; for such scenarios consider using
    `BeautifulSoupCrawler`, `ParselCrawler`, or writing your own subclass of `AbstractHttpCrawler`.

    ### Usage

    ```python
    from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

    crawler = HttpCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'response': (await context.http_response.read()).decode()[:100],
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    await crawler.run(['https://crawlee.dev/'])
    ```
    """

    def __init__(
        self,
        **kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[bytes]]],
    ) -> None:
        """Initialize a new instance.

        Args:
            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.
        """
        # The static-content pipeline is used as-is, no extra middleware is composed on top of it.
        static_pipeline = self._create_static_content_crawler_pipeline()
        kwargs['_context_pipeline'] = static_pipeline

        passthrough_parser = NoParser()
        super().__init__(parser=passthrough_parser, **kwargs)


================================================
FILE: src/crawlee/crawlers/_http/_http_parser.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import AbstractHttpParser
from crawlee.crawlers._types import BlockedInfo

if TYPE_CHECKING:
    from collections.abc import Iterable, Sequence

    from crawlee.http_clients import HttpResponse


@docs_group('HTTP parsers')
class NoParser(AbstractHttpParser[bytes, bytes]):
    """A no-op parser that returns raw response content without any processing.

    This is useful when you only need the raw response data and don't require HTML
    parsing, link extraction, or content selection functionality.
    """

    @override
    async def parse(self, response: HttpResponse) -> bytes:
        """Return the response body exactly as read from `response`."""
        raw_body = await response.read()
        return raw_body

    @override
    async def parse_text(self, text: str) -> bytes:
        """Not supported by this parser."""
        raise NotImplementedError

    @override
    async def select(self, parsed_content: bytes, selector: str) -> Sequence[bytes]:
        """Not supported by this parser."""
        raise NotImplementedError

    @override
    def is_blocked(self, parsed_content: bytes) -> BlockedInfo:
        """Always return a `BlockedInfo` with an empty reason; `parsed_content` is intentionally unused."""
        return BlockedInfo(reason='')

    @override
    def is_matching_selector(self, parsed_content: bytes, selector: str) -> bool:
        """Always return `False`; both arguments are intentionally unused."""
        return False

    @override
    def find_links(self, parsed_content: bytes, selector: str, attribute: str) -> Iterable[str]:
        """Always return an empty list; all arguments are intentionally unused."""
        return []


================================================
FILE: src/crawlee/crawlers/_parsel/__init__.py
================================================
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

_install_import_hook(__name__)

# The following imports are wrapped in try_import to handle optional dependencies,
# ensuring the module can still function even if these dependencies are missing.
with _try_import(__name__, 'ParselCrawler'):
    from ._parsel_crawler import ParselCrawler
with _try_import(__name__, 'ParselCrawlingContext'):
    from ._parsel_crawling_context import ParselCrawlingContext

__all__ = [
    'ParselCrawler',
    'ParselCrawlingContext',
]


================================================
FILE: src/crawlee/crawlers/_parsel/_parsel_crawler.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from parsel import Selector

from crawlee._utils.docs import docs_group
from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

from ._parsel_crawling_context import ParselCrawlingContext
from ._parsel_parser import ParselParser

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from typing_extensions import Unpack

    from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext


@docs_group('Crawlers')
class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selector]):
    """A web crawler that performs HTTP requests and parses the HTML/XML responses with Parsel.

    The `ParselCrawler` is built on top of the `AbstractHttpCrawler` and therefore inherits all of its features.
    It plugs in its own parser, `ParselParser`, which is used to parse the `HttpResponse`.
    `ParselParser` relies on the following library for parsing: https://pypi.org/project/parsel/

    HTTP client-based crawlers are a good fit for websites that do not require JavaScript execution. If you need
    to execute client-side JavaScript, consider using a browser-based crawler such as the `PlaywrightCrawler`.

    ### Usage

    ```python
    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

    crawler = ParselCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.selector.css('title').get(),
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    await crawler.run(['https://crawlee.dev/'])
    ```
    """

    def __init__(
        self,
        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
    ) -> None:
        """Initialize a new instance.

        Args:
            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.
        """

        async def final_step(
            context: ParsedHttpCrawlingContext[Selector],
        ) -> AsyncGenerator[ParselCrawlingContext, None]:
            """Convert the parsed context into a `ParselCrawlingContext`, which adds the `selector` property."""
            parsel_context = ParselCrawlingContext.from_parsed_http_crawling_context(context)
            yield parsel_context

        # Extend the static-content pipeline with the conversion step as its last middleware.
        static_pipeline = self._create_static_content_crawler_pipeline()
        kwargs['_context_pipeline'] = static_pipeline.compose(final_step)

        parsel_parser = ParselParser()
        super().__init__(parser=parsel_parser, **kwargs)


================================================
FILE: src/crawlee/crawlers/_parsel/_parsel_crawling_context.py
================================================
from dataclasses import dataclass, fields

from parsel import Selector
from typing_extensions import Self

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext

from ._utils import html_to_text


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class ParselCrawlingContext(ParsedHttpCrawlingContext[Selector]):
    """The crawling context used by the `ParselCrawler`.

    It provides access to key objects as well as utility functions for handling crawling tasks.
    """

    @property
    def selector(self) -> Selector:
        """Convenience alias for `parsed_content`."""
        return self.parsed_content

    @classmethod
    def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self:
        """Build a new instance by copying every dataclass field of an existing `ParsedHttpCrawlingContext`."""
        field_values = {}
        for context_field in fields(context):
            field_values[context_field.name] = getattr(context, context_field.name)
        return cls(**field_values)

    def html_to_text(self) -> str:
        """Convert the parsed HTML content to newline-separated plain text without tags."""
        plain_text = html_to_text(self.parsed_content)
        return plain_text


================================================
FILE: src/crawlee/crawlers/_parsel/_parsel_parser.py
================================================
from __future__ import annotations

import asyncio
from typing import TYPE_CHECKING

from parsel import Selector
from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import AbstractHttpParser

if TYPE_CHECKING:
    from collections.abc import Iterable, Sequence

    from crawlee.http_clients import HttpResponse


@docs_group('HTTP parsers')
class ParselParser(AbstractHttpParser[Selector, Selector]):
    """Parser for parsing HTTP response using Parsel."""

    @override
    async def parse(self, response: HttpResponse) -> Selector:
        raw_body = await response.read()
        # The `Selector` is constructed in a worker thread via `asyncio.to_thread`.
        parsed = await asyncio.to_thread(Selector, body=raw_body)
        return parsed

    @override
    async def parse_text(self, text: str) -> Selector:
        parsed = Selector(text=text)
        return parsed

    @override
    async def select(self, parsed_content: Selector, selector: str) -> Sequence[Selector]:
        return tuple(parsed_content.css(selector))

    @override
    def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool:
        # Content of any other type never matches; the CSS query is not even attempted for it.
        if parsed_content.type not in ('html', 'xml'):
            return False
        return parsed_content.css(selector).get() is not None

    @override
    def find_links(self, parsed_content: Selector, selector: str, attribute: str) -> Iterable[str]:
        # Elements lacking the attribute, or carrying an empty value, contribute nothing.
        return [
            raw_url.strip()
            for element in parsed_content.css(selector)
            if (raw_url := element.xpath(f'@{attribute}').get())
        ]


================================================
FILE: src/crawlee/crawlers/_parsel/_utils.py
================================================
from __future__ import annotations

import re

from parsel import Selector

from crawlee._utils.html_to_text import (
    _ANY_CONSECUTIVE_WHITE_SPACES,
    _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE,
    _EMPTY_OR_ENDS_WITH_NEW_LINE,
    BLOCK_TAGS,
    SKIP_TAGS,
)


def html_to_text(source: str | Selector) -> str:
    """Render a markup string or a Parsel `Selector` as newline-separated plain text without tags.

    Args:
        source: Markup as a string, or an already constructed `Selector`.

    Returns:
        The extracted text with leading and trailing whitespace stripped.

    Raises:
        TypeError: If `source` is neither a string nor a `Selector`.
    """
    if isinstance(source, Selector):
        selector = source
    elif isinstance(source, str):
        selector = Selector(text=source)
    else:
        raise TypeError('Source must be either a string or a `Selector` object.')

    text = ''

    def _extract_text(elements: list[Selector], *, compress: bool = True) -> None:
        """Append the text of `elements` (recursively) to the enclosing `text` buffer.

        The traversal is hand-written to match the behavior of the JavaScript version of Crawlee.

        Args:
            elements: Selectors wrapping the nodes to process, in document order.
            compress: Whether runs of whitespace in text nodes of this level are collapsed to a single space.
        """
        nonlocal text
        for node in elements:
            root = node.root
            # Nodes whose root has no `tag` attribute are treated as text nodes.
            tag = getattr(root, 'tag', None)

            if tag is None:
                chunk = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', root) if compress else root
                # Drop one leading space/newline when the buffer is empty or already ends with whitespace.
                if chunk.startswith((' ', '\n')) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text):
                    chunk = chunk[1:]
                text += chunk
                continue

            # Skipped tags and nodes whose `tag` is not a string contribute nothing.
            if tag in SKIP_TAGS or not isinstance(tag, str):
                continue

            if tag == 'br':
                text += '\n'
                continue

            if tag == 'td':
                # Table cells are separated by tabs.
                _extract_text(node.xpath('./node()'))
                text += '\t'
                continue

            is_block_tag = bool(tag) and tag in BLOCK_TAGS

            # Block-level elements start on a fresh line (unless the buffer is empty or already at one) ...
            if is_block_tag and re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text) is None:
                text += '\n'

            # Whitespace is preserved only for the direct children of `<pre>`.
            _extract_text(node.xpath('./node()'), compress=(tag != 'pre'))

            # ... and are terminated by a newline as well.
            if is_block_tag and not text.endswith('\n'):
                text += '\n'

    # Kick off the traversal from the document root element(s).
    _extract_text(selector.xpath('/*'))

    return text.strip()


================================================
FILE: src/crawlee/crawlers/_playwright/__init__.py
================================================
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

# Install the import hook from `crawlee._utils.try_import` for this package.
# NOTE(review): presumably this lets access to a name whose optional import failed raise a helpful error
# instead of a bare `ImportError` at package import time - confirm against `crawlee._utils.try_import`.
_install_import_hook(__name__)

# The following imports are wrapped in try_import to handle optional dependencies,
# ensuring the module can still function even if these dependencies are missing.
with _try_import(__name__, 'PlaywrightCrawler'):
    from ._playwright_crawler import PlaywrightCrawler
with _try_import(__name__, 'PlaywrightCrawlingContext'):
    from ._playwright_crawling_context import PlaywrightCrawlingContext
with _try_import(__name__, 'PlaywrightPreNavCrawlingContext'):
    from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
with _try_import(__name__, 'PlaywrightPostNavCrawlingContext'):
    from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext

# Public names of this package; each entry corresponds to one guarded import above.
__all__ = [
    'PlaywrightCrawler',
    'PlaywrightCrawlingContext',
    'PlaywrightPostNavCrawlingContext',
    'PlaywrightPreNavCrawlingContext',
]


================================================
FILE: src/crawlee/crawlers/_playwright/_playwright_crawler.py
================================================
from __future__ import annotations

import asyncio
import logging
import warnings
from datetime import timedelta
from functools import partial
from typing import TYPE_CHECKING, Any, Generic, Literal

import playwright.async_api
from more_itertools import partition
from pydantic import ValidationError
from typing_extensions import NotRequired, TypedDict, TypeVar

from crawlee._request import Request, RequestOptions, RequestState
from crawlee._types import BasicCrawlingContext, ConcurrencySettings
from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.docs import docs_group
from crawlee._utils.robots import RobotsTxtFile
from crawlee._utils.time import SharedTimeout
from crawlee._utils.urls import to_absolute_url_iterator
from crawlee.browsers import BrowserPool
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.errors import SessionError
from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
from crawlee.http_clients import ImpitHttpClient
from crawlee.sessions._cookies import PlaywrightCookieParam
from crawlee.statistics import StatisticsState

from ._playwright_crawling_context import PlaywrightCrawlingContext
from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext
from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
from ._types import GotoOptions
from ._utils import block_requests, infinite_scroll

TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator, Mapping
    from pathlib import Path

    from playwright.async_api import Page, Route
    from playwright.async_api import Request as PlaywrightRequest
    from typing_extensions import Unpack

    from crawlee import RequestTransformAction
    from crawlee._types import (
        EnqueueLinksKwargs,
        ExtractLinksFunction,
        HttpHeaders,
        HttpMethod,
        HttpPayload,
    )
    from crawlee.browsers._types import BrowserType


@docs_group('Crawlers')
class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):
    """A web crawler that leverages the `Playwright` browser automation library.

    The `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.
    On top of that it provides a high level web crawling interface on top of the `Playwright` library. To be more
    specific, it uses the Crawlee's `BrowserPool` to manage the Playwright's browser instances and the pages they
    open. You can create your own `BrowserPool` instance and pass it to the `PlaywrightCrawler` constructor, or let
    the crawler create a new instance with the default settings.

    This crawler is ideal for crawling websites that require JavaScript execution, as it uses real browsers
    to download web pages and extract data. For websites that do not require JavaScript, consider using one of the
    HTTP client-based crawlers, such as the `HttpCrawler`, `ParselCrawler`, or `BeautifulSoupCrawler`. They use
    raw HTTP requests, which means they are much faster.

    ### Usage

    ```python
    from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

    crawler = PlaywrightCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': await context.page.title(),
            'response': (await context.response.text())[:100],
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    await crawler.run(['https://crawlee.dev/'])
    ```
    """

    def __init__(
        self,
        *,
        browser_pool: BrowserPool | None = None,
        browser_type: BrowserType | None = None,
        user_data_dir: str | Path | None = None,
        browser_launch_options: Mapping[str, Any] | None = None,
        browser_new_context_options: Mapping[str, Any] | None = None,
        goto_options: GotoOptions | None = None,
        fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
        headless: bool | None = None,
        use_incognito_pages: bool | None = None,
        navigation_timeout: timedelta | None = None,
        **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
    ) -> None:
        """Initialize a new instance.

        Args:
            browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
                When omitted, a pool is created via `BrowserPool.with_default_plugin` from the browser-related
                arguments below.
            browser_type: The type of browser to launch:
                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
                    the system.
                This option should not be used if `browser_pool` is provided.
            user_data_dir: Path to a user data directory, which stores browser session data like cookies
                and local storage.
                This option should not be used if `browser_pool` is provided.
            browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                directly to Playwright's `browser_type.launch` method. For more details, refer to the
                [Playwright documentation](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch).
                This option should not be used if `browser_pool` is provided.
            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
                [Playwright documentation](https://playwright.dev/python/docs/api/class-browser#browser-new-context).
                This option should not be used if `browser_pool` is provided.
            goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
                not supported, use `navigation_timeout` instead.
            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
                to generate browser fingerprints together with consistent headers. The default value `'default'`
                creates a `DefaultFingerprintGenerator` whose header options are limited to the browser matching
                `browser_type` when `browser_type` is given. `None` is forwarded unchanged to the `BrowserPool`.
                This option should not be used if `browser_pool` is provided.
            headless: Whether to run the browser in headless mode.
                This option should not be used if `browser_pool` is provided.
            use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                own context that is destroyed once the page is closed or crashes.
                This option should not be used if `browser_pool` is provided.
            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
                the request handler). Defaults to one minute.
            kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. `http_client` defaults to
                a `PlaywrightHttpClient` and `concurrency_settings` to `ConcurrencySettings(desired_concurrency=1)`
                when they are not supplied.

        Raises:
            ValueError: If `browser_pool` is provided together with any of the browser-related arguments.
        """
        # Per-request navigation time budgets, keyed by `id()` of the pre-navigation context (see `_open_page`).
        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

        if browser_pool:
            # Raise an exception if browser_pool is provided together with other browser-related arguments.
            # NOTE(review): the literal 'default' is tolerated for every parameter here, although it is only
            # a meaningful sentinel for `fingerprint_generator`.
            if any(
                param not in [None, 'default']
                for param in (
                    user_data_dir,
                    use_incognito_pages,
                    headless,
                    browser_type,
                    browser_launch_options,
                    browser_new_context_options,
                    fingerprint_generator,
                )
            ):
                raise ValueError(
                    'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                    '`fingerprint_generator` arguments when `browser_pool` is provided.'
                )

        # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
        else:
            if fingerprint_generator == 'default':
                # Restrict generated headers to the requested browser type, if one was requested.
                generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = (
                    [fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else None
                )

                fingerprint_generator = DefaultFingerprintGenerator(
                    header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
                )

            browser_pool = BrowserPool.with_default_plugin(
                headless=headless,
                browser_type=browser_type,
                user_data_dir=user_data_dir,
                browser_launch_options=browser_launch_options,
                browser_new_context_options=browser_new_context_options,
                use_incognito_pages=use_incognito_pages,
                fingerprint_generator=fingerprint_generator,
            )

        self._browser_pool = browser_pool

        # Compose the context pipeline with the Playwright-specific context enhancer.
        # The steps run in the order they are composed; any `_context_pipeline` passed by the caller is replaced.
        kwargs['_context_pipeline'] = (
            ContextPipeline()
            .compose(self._open_page)
            .compose(self._navigate)
            .compose(self._execute_post_navigation_hooks)
            .compose(self._handle_status_code_response)
            .compose(self._handle_blocked_request_by_content)
            .compose(self._create_crawling_context)
        )
        # Hand the browser pool to the base crawler as an additional context manager.
        kwargs['_additional_context_managers'] = [self._browser_pool]
        kwargs.setdefault('_logger', logging.getLogger(__name__))
        self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = []
        self._post_navigation_hooks: list[Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]] = []

        # Fall back to the Playwright-based HTTP client when the caller did not supply one.
        kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

        # Set default concurrency settings for browser crawlers if not provided
        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
        self._goto_options = goto_options or GotoOptions()

        super().__init__(**kwargs)

    async def _open_page(
        self,
        context: BasicCrawlingContext,
    ) -> AsyncGenerator[PlaywrightPreNavCrawlingContext, None]:
        """Open a new browser page, run the pre-navigation hooks and yield the pre-navigation context.

        A `SharedTimeout` holding the navigation time budget is registered under the `id()` of the created
        context so that the hooks here and the navigation in `_navigate` draw from the same budget. The entry
        is removed again when this pipeline step finishes.

        If a pre-navigation hook fails (including by exceeding the shared timeout), the page is closed here.
        Otherwise the page would stay open, because the only other place that closes it is the
        `async with context.page` block in `_navigate`, which is never reached in that case.

        Args:
            context: The basic crawling context to be enhanced.

        Raises:
            ValueError: If the browser pool is not initialized.

        Yields:
            The pre-navigation crawling context with the opened page.
        """
        if self._browser_pool is None:
            raise ValueError('Browser pool is not initialized.')

        # Create a new browser page
        crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)

        pre_navigation_context = PlaywrightPreNavCrawlingContext(
            request=context.request,
            session=context.session,
            add_requests=context.add_requests,
            send_request=context.send_request,
            push_data=context.push_data,
            use_state=context.use_state,
            proxy_info=context.proxy_info,
            get_key_value_store=context.get_key_value_store,
            log=context.log,
            page=crawlee_page.page,
            block_requests=partial(block_requests, page=crawlee_page.page),
            # Copy the options so that hooks can adjust them for this request only.
            goto_options=GotoOptions(**self._goto_options),
        )

        context_id = id(pre_navigation_context)
        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)

        try:
            # Only use the page context manager here — it sets the current page in a context variable,
            # making it accessible to PlaywrightHttpClient in subsequent pipeline steps.
            async with browser_page_context(crawlee_page.page):
                try:
                    for hook in self._pre_navigation_hooks:
                        async with self._shared_navigation_timeouts[context_id]:
                            await hook(pre_navigation_context)
                except BaseException:
                    # The navigation step will not run, so nothing else would close this page.
                    try:
                        await crawlee_page.page.close()
                    except Exception:
                        # Best effort only - never mask the original hook error.
                        context.log.debug(
                            'Failed to close the page after a pre-navigation hook error.', exc_info=True
                        )
                    raise

                # Yield should be inside the browser_page_context.
                yield pre_navigation_context
        finally:
            self._shared_navigation_timeouts.pop(context_id, None)

    def _prepare_request_interceptor(
        self,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
    ) -> Callable:
        """Create a request interceptor for Playwright to support non-GET methods with custom parameters.

        The interceptor modifies requests by adding custom headers and payload before they are sent.

        Args:
            method: HTTP method to use for the request.
            headers: Custom HTTP headers to send with the request.
            payload: Request body data for POST/PUT requests.
        """

        async def route_handler(route: Route, _: PlaywrightRequest) -> None:
            await route.continue_(method=method, headers=dict(headers) if headers else None, post_data=payload)

        return route_handler

    async def _navigate(
        self,
        context: PlaywrightPreNavCrawlingContext,
    ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, Exception | None]:
        """Navigate the opened page to the request URL and yield the post-navigation context.

        Before navigating, the session cookies (if a session is present) and the request headers (if any) are
        applied to the page. For non-GET requests, a route handler is registered for the request URL so that
        the method, headers and payload of the request are overridden.

        Args:
            context: The pre-navigation crawling context holding the opened page.

        Raises:
            SessionError: If `Page.goto()` returns no response for the URL.
            asyncio.TimeoutError: If Playwright reports a timeout during navigation.

        Yields:
            The post-navigation crawling context, i.e. the pre-navigation context extended with the `response`.
        """
        # The page is used as an async context manager for the rest of this step, including the time the
        # generator is suspended at `yield` (Playwright closes the page when the block exits).
        async with context.page:
            if context.session:
                session_cookies = context.session.cookies.get_cookies_as_playwright_format()
                await self._update_cookies(context.page, session_cookies)

            if context.request.headers:
                await context.page.set_extra_http_headers(context.request.headers.model_dump())
            # Navigate to the URL and get response.
            if context.request.method != 'GET':
                # Call the notification only once
                # (with Python's default warning filters a warning is shown once per issuing location).
                warnings.warn(
                    'Using other request methods than GET or adding payloads has a high impact on performance'
                    ' in recent versions of Playwright. Use only when necessary.',
                    category=UserWarning,
                    stacklevel=2,
                )

                route_handler = self._prepare_request_interceptor(
                    method=context.request.method,
                    headers=context.request.headers,
                    payload=context.request.payload,
                )

                # Set route_handler only for current request
                await context.page.route(context.request.url, route_handler)

            try:
                # The shared timeout was registered in `_open_page` under the id of this very context object.
                async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
                    # Playwright expects the timeout in milliseconds.
                    response = await context.page.goto(
                        context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
                    )
                context.request.state = RequestState.AFTER_NAV
            except playwright.async_api.TimeoutError as exc:
                # Translate the Playwright-specific timeout into the standard asyncio one.
                raise asyncio.TimeoutError from exc

            if response is None:
                raise SessionError(f'Failed to load the URL: {context.request.url}')

            # Set the loaded URL to the actual URL after redirection.
            context.request.loaded_url = context.page.url

            yield PlaywrightPostNavCrawlingContext(
                request=context.request,
                session=context.session,
                add_requests=context.add_requests,
                send_request=context.send_request,
                push_data=context.push_data,
                use_state=context.use_state,
                proxy_info=context.proxy_info,
                get_key_value_store=context.get_key_value_store,
                log=context.log,
                page=context.page,
                block_requests=context.block_requests,
                goto_options=context.goto_options,
                response=response,
            )

    def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction:
        """Create a callback function for extracting links from context.

        Args:
            context: The current crawling context. Its `page`, `request` and `log` are used by the callback.

        Returns:
            An async function that extracts links from the page and turns them into `Request` objects.
        """

        async def extract_links(
            *,
            selector: str = 'a',
            attribute: str = 'href',
            label: str | None = None,
            user_data: dict | None = None,
            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
            | None = None,
            **kwargs: Unpack[EnqueueLinksKwargs],
        ) -> list[Request]:
            """Extract links from the current page.

            The `PlaywrightCrawler` implementation of the `ExtractLinksFunction` function.

            Args:
                selector: CSS selector of the elements to read links from.
                attribute: Name of the element attribute holding the link.
                label: Label assigned to every created request.
                user_data: User data copied (shallowly) into every created request.
                transform_request_function: Optional callback that may return modified request options,
                    `'skip'` to drop the link, or `'unchanged'` to keep the options as they are.
                kwargs: Additional filtering options; `strategy` defaults to `'same-hostname'`.

            Returns:
                The requests created from the links that passed all filters.
            """
            requests = list[Request]()

            base_user_data = user_data or {}

            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

            kwargs.setdefault('strategy', 'same-hostname')
            strategy = kwargs.get('strategy', 'same-hostname')

            # Elements without the attribute are ignored.
            elements = await context.page.query_selector_all(selector)
            links_iterator: Iterator[str] = iter(
                [url for element in elements if (url := await element.get_attribute(attribute)) is not None]
            )

            # Get base URL from <base> tag if present
            extracted_base_url = await context.page.evaluate('document.baseURI')
            base_url: str = extracted_base_url or context.request.loaded_url or context.request.url

            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

            if robots_txt_file:
                # `partition` returns (items failing the predicate, items passing it), i.e. (disallowed, allowed).
                skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
            else:
                skipped = iter([])

            for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
                request_options = RequestOptions(
                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
                )

                if transform_request_function:
                    transform_request_options = transform_request_function(request_options)
                    if transform_request_options == 'skip':
                        continue
                    if transform_request_options != 'unchanged':
                        request_options = transform_request_options

                try:
                    request = Request.from_url(**request_options)
                except ValidationError as exc:
                    context.log.debug(
                        f'Skipping URL "{url}" due to invalid format: {exc}. '
                        'This may be caused by a malformed URL or unsupported URL scheme. '
                        'Please ensure the URL is correct and retry.'
                    )
                    continue

                requests.append(request)

            # Report every URL disallowed by robots.txt; the handlers run concurrently.
            skipped_tasks = [
                asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
            ]
            await asyncio.gather(*skipped_tasks)

            return requests

        return extract_links

    async def _handle_status_code_response(
        self, context: PlaywrightPostNavCrawlingContext
    ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:
        """Validate the HTTP status code and raise appropriate exceptions if needed.

        Args:
            context: The current crawling context containing the response.

        Raises:
            SessionError: If the status code indicates the session is blocked.
            HttpStatusCodeError: If the status code represents a server error or is explicitly configured as an error.
            HttpClientStatusCodeError: If the status code represents a client error.

        Yields:
            The original crawling context if no errors are detected.
        """
        status_code = context.response.status
        if self._retry_on_blocked:
            self._raise_for_session_blocked_status_code(context.session, status_code)
        self._raise_for_error_status_code(status_code)
        yield context

    async def _handle_blocked_request_by_content(
        self,
        context: PlaywrightPostNavCrawlingContext,
    ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:
        """Detect a blocked request by looking for known selectors in the loaded page.

        The check is only performed when retrying on blocked requests is enabled.

        Args:
            context: The current crawling context.

        Raises:
            SessionError: If the page matches at least one of the `RETRY_CSS_SELECTORS`.

        Yields:
            The original crawling context if no errors are detected.
        """
        if self._retry_on_blocked:
            matched_selectors = []
            # Query the page for each known selector, one after another.
            for css_selector in RETRY_CSS_SELECTORS:
                if await context.page.query_selector(css_selector):
                    matched_selectors.append(css_selector)

            # Any match is taken as a sign that the session is blocked.
            if matched_selectors:
                raise SessionError(
                    'Assuming the session is blocked - '
                    f'HTTP response matched the following selectors: {"; ".join(matched_selectors)}'
                )

        yield context

    async def _execute_post_navigation_hooks(
        self, context: PlaywrightPostNavCrawlingContext
    ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:
        for hook in self._post_navigation_hooks:
            await hook(context)
        yield context

    async def _create_crawling_context(
        self, context: PlaywrightPostNavCrawlingContext
    ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]:
        """Build the final `PlaywrightCrawlingContext` and run the post-processing once it is resumed.

        The value sent back into this generator is treated as the error (if any) raised while the yielded
        context was in use.

        Args:
            context: The post-navigation crawling context to be extended with the link helpers and
                `infinite_scroll`.

        Yields:
            The crawling context passed on to the rest of the pipeline.
        """
        extract_links = self._create_extract_links_function(context)

        # Execution is suspended here; `error` receives whatever is sent back into the generator.
        error = yield PlaywrightCrawlingContext(
            request=context.request,
            session=context.session,
            add_requests=context.add_requests,
            send_request=context.send_request,
            push_data=context.push_data,
            use_state=context.use_state,
            proxy_info=context.proxy_info,
            get_key_value_store=context.get_key_value_store,
            log=context.log,
            page=context.page,
            goto_options=context.goto_options,
            response=context.response,
            infinite_scroll=lambda: infinite_scroll(context.page),
            extract_links=extract_links,
            enqueue_links=self._create_enqueue_links_function(context, extract_links),
            block_requests=partial(block_requests, page=context.page),
        )

        # Copy the cookies of the page's browser context back into the session.
        if context.session:
            pw_cookies = await self._get_cookies(context.page)
            context.session.cookies.set_cookies_from_playwright_format(pw_cookies)

        # Collect data in case of errors, before the page object is closed.
        if error:
            await self.statistics.error_tracker.add(error=error, context=context, early=True)

    def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None:
        """Add `hook` to the hooks awaited before each navigation.

        Hooks are stored in registration order.

        Args:
            hook: A coroutine function receiving the pre-navigation crawling context.
        """
        registered_hooks = self._pre_navigation_hooks
        registered_hooks.append(hook)

    def post_navigation_hook(self, hook: Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None:
        """Add `hook` to the hooks awaited after each navigation.

        Hooks are stored in registration order.

        Args:
            hook: A coroutine function receiving the post-navigation crawling context.
        """
        registered_hooks = self._post_navigation_hooks
        registered_hooks.append(hook)

    async def _get_cookies(self, page: Page) -> list[PlaywrightCookieParam]:
        """Read all cookies of the page's browser context as `PlaywrightCookieParam` items."""
        browser_context = page.context
        raw_cookies = await browser_context.cookies()

        converted: list[PlaywrightCookieParam] = []
        for raw_cookie in raw_cookies:
            converted.append(PlaywrightCookieParam(**raw_cookie))
        return converted

    async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:
        """Update the cookies in the page context."""
        await page.context.add_cookies([{**cookie} for cookie in cookies])

    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
        """Locate and load the robots.txt file that applies to `url`.

        When the crawler's HTTP client is a `PlaywrightHttpClient`, a fresh `ImpitHttpClient` is used for the
        lookup instead; any other configured client is used as-is.
        NOTE(review): presumably because `PlaywrightHttpClient` depends on an active browser page - confirm.

        Args:
            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
        """
        uses_browser_client = isinstance(self._http_client, PlaywrightHttpClient)
        http_client = ImpitHttpClient() if uses_browser_client else self._http_client

        robots_txt_file = await RobotsTxtFile.find(url, http_client=http_client)
        return robots_txt_file


class _PlaywrightCrawlerAdditionalOptions(TypedDict):
    """Additional arguments for the `PlaywrightCrawler` constructor.

    It is intended for typing forwarded `__init__` arguments in the subclasses.
    All arguments are `BasicCrawlerOptions` + `_PlaywrightCrawlerAdditionalOptions`.

    NOTE(review): `PlaywrightCrawler.__init__` also accepts `user_data_dir`, `goto_options`,
    `fingerprint_generator`, `use_incognito_pages` and `navigation_timeout`, which are not declared here -
    confirm whether that omission is intentional.
    """

    browser_pool: NotRequired[BrowserPool]
    """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

    browser_type: NotRequired[BrowserType]
    """The type of browser to launch:
    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
    This option should not be used if `browser_pool` is provided."""

    browser_launch_options: NotRequired[Mapping[str, Any]]
    """Keyword arguments to pass to the browser launch method. These options are provided
    directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
    documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
    This option should not be used if `browser_pool` is provided."""

    browser_new_context_options: NotRequired[Mapping[str, Any]]
    """Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's
    `browser.new_context` method. For more details, refer to the Playwright documentation:
    https://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if
    `browser_pool` is provided."""

    headless: NotRequired[bool]
    """Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided."""


class PlaywrightCrawlerOptions(
    _PlaywrightCrawlerAdditionalOptions,
    # NOTE(review): the statistics argument below is the concrete `StatisticsState`, while `Generic` declares
    # `TStatisticsState` - confirm whether `TStatisticsState` was intended here.
    BasicCrawlerOptions[TCrawlingContext, StatisticsState],
    Generic[TCrawlingContext, TStatisticsState],
):
    """Arguments for the `PlaywrightCrawler` constructor.

    Combines the generic `BasicCrawlerOptions` with the Playwright-specific `_PlaywrightCrawlerAdditionalOptions`.
    It is intended for typing forwarded `__init__` arguments in the subclasses.
    """


================================================
FILE: src/crawlee/crawlers/_playwright/_playwright_crawling_context.py
================================================
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group

from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext

if TYPE_CHECKING:
    from collections.abc import Awaitable, Callable

    from crawlee._types import EnqueueLinksFunction, ExtractLinksFunction


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class PlaywrightCrawlingContext(PlaywrightPostNavCrawlingContext):
    """The crawling context used by the `PlaywrightCrawler`.

    It provides access to key objects as well as utility functions for handling crawling tasks. On top of the
    fields inherited from `PlaywrightPostNavCrawlingContext`, it adds the link helpers (`enqueue_links`,
    `extract_links`) and the `infinite_scroll` helper.
    """

    enqueue_links: EnqueueLinksFunction
    """The Playwright `EnqueueLinksFunction` implementation."""

    extract_links: ExtractLinksFunction
    """The Playwright `ExtractLinksFunction` implementation."""

    infinite_scroll: Callable[[], Awaitable[None]]
    """A function to perform infinite scrolling on the page. This scrolls to the bottom, triggering
    the loading of additional content if present."""


================================================
FILE: src/crawlee/crawlers/_playwright/_playwright_http_client.py
================================================
from __future__ import annotations

import contextvars
from contextlib import AbstractAsyncContextManager, asynccontextmanager
from typing import TYPE_CHECKING

from typing_extensions import override

from crawlee._types import HttpHeaders
from crawlee.crawlers._playwright._types import PlaywrightHttpResponse
from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from datetime import timedelta

    from playwright.async_api import Page

    from crawlee import Request
    from crawlee._types import HttpMethod, HttpPayload
    from crawlee.proxy_configuration import ProxyInfo
    from crawlee.sessions import Session
    from crawlee.statistics import Statistics


# Holds the Playwright `Page` bound to the current context (set by `browser_page_context`), or `None` when no page
# is bound. Note that the variable's debug name is 'browser_context' even though the stored value is a `Page`.
_browser_page_context_var: contextvars.ContextVar[Page | None] = contextvars.ContextVar('browser_context', default=None)


@asynccontextmanager
async def browser_page_context(page: Page) -> AsyncGenerator[None, None]:
    """Bind `page` as the current Playwright page for the duration of the `async with` block.

    The previous value of the context variable is restored on exit, even when the body raises.
    """
    previous_binding = _browser_page_context_var.set(page)
    try:
        yield
    finally:
        _browser_page_context_var.reset(previous_binding)


class PlaywrightHttpClient(HttpClient):
    """HTTP client based on the Playwright library.

    This client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)
    and to manage sessions, proxies, and error handling.

    See the `HttpClient` class for more common information about HTTP clients.

    Note: This class is pre-designated for use in `PlaywrightCrawler` only. Only `send_request` is implemented;
    `crawl` and `stream` raise `NotImplementedError`.
    """

    def __init__(self) -> None:
        """Initialize a new instance."""
        self._active = False

    @override
    async def crawl(
        self,
        request: Request,
        *,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        statistics: Statistics | None = None,
        timeout: timedelta | None = None,
    ) -> HttpCrawlingResult:
        """Not supported by this client.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')

    @override
    async def send_request(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        timeout: timedelta | None = None,
    ) -> HttpResponse:
        """Send a request through the `APIRequestContext` of the page bound by `browser_page_context`.

        Args:
            url: The URL to send the request to.
            method: The HTTP method to use.
            headers: The headers to include in the request.
            payload: The data to be sent as the request body.
            session: Currently unused (see the TODO below).
            proxy_info: Unused; the proxy is inherited from the browser context.
            timeout: Maximum time allowed for the request. A falsy value (`None` or a zero duration) leaves
                Playwright's own default in effect.

        Raises:
            RuntimeError: If called outside of `browser_page_context`, i.e. when no page is bound.

        Returns:
            The fully buffered response wrapped in `PlaywrightHttpResponse`.
        """
        # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
        # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
        # https://github.com/apify/crawlee-python/issues/1055

        if isinstance(headers, dict) or headers is None:
            headers = HttpHeaders(headers or {})

        # The context variable stores the Playwright `Page` bound by `browser_page_context`.
        page = _browser_page_context_var.get()

        if page is None:
            raise RuntimeError('Unable to create an `APIRequestContext` outside the browser context')

        # Playwright expects the timeout in milliseconds, whereas `timedelta.total_seconds()` yields seconds.
        timeout_ms = timeout.total_seconds() * 1000 if timeout else None

        # Proxies appropriate to the browser context are used
        response = await page.request.fetch(
            url_or_request=url,
            method=method.lower(),
            headers=dict(headers) if headers else None,
            data=payload,
            timeout=timeout_ms,
        )

        return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')

    @override
    def stream(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        timeout: timedelta | None = None,
    ) -> AbstractAsyncContextManager[HttpResponse]:
        """Not supported by this client.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('The `stream` method should not be used for `PlaywrightHttpClient`')

    async def cleanup(self) -> None:
        """Do nothing; this client owns no resources of its own."""
        # The `browser_page_context` is responsible for resource cleanup
        return


================================================
FILE: src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py
================================================
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group

from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext

if TYPE_CHECKING:
    from playwright.async_api import Response


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class PlaywrightPostNavCrawlingContext(PlaywrightPreNavCrawlingContext):
    """The post navigation crawling context used by the `PlaywrightCrawler`.

    It provides access to the `Page` and `Response` objects, after the navigation to the URL is performed.
    It extends `PlaywrightPreNavCrawlingContext` with the single `response` field.
    """

    response: Response
    """The Playwright `Response` object containing the response details for the current URL."""


================================================
FILE: src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py
================================================
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

from crawlee._types import BasicCrawlingContext, PageSnapshot
from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from playwright.async_api import Page

    from ._types import BlockRequestsFunction, GotoOptions


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
    """Crawling context handed out by the `PlaywrightCrawler` before navigation takes place.

    It exposes the `Page` object prior to navigating to the request URL.
    """

    page: Page
    """The Playwright `Page` object for the current page."""

    block_requests: BlockRequestsFunction
    """Blocks network requests matching specified URL patterns."""

    goto_options: GotoOptions
    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""

    async def get_snapshot(self) -> PageSnapshot:
        """Capture the HTML and a full-page JPEG screenshot of the current page.

        Both captures are best-effort: a failure is logged and the corresponding snapshot field is left as `None`.
        """
        try:
            page_html = await self.page.content()
        except Exception:
            self.log.exception(f'Failed to get html snapshot for {self.request.url}.')
            page_html = None

        try:
            page_screenshot = await self.page.screenshot(full_page=True, type='jpeg')
        except Exception:
            self.log.exception(f'Failed to get page screenshot for {self.request.url}.')
            page_screenshot = None

        return PageSnapshot(html=page_html, screenshot=page_screenshot)


================================================
FILE: src/crawlee/crawlers/_playwright/_types.py
================================================
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Literal, Protocol, TypedDict

from playwright.async_api import APIResponse

from crawlee import HttpHeaders
from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from playwright.async_api import Response
    from typing_extensions import NotRequired, Self


@docs_group('Functions')
class BlockRequestsFunction(Protocol):
    """A function for blocking unwanted HTTP requests during page loads in PlaywrightCrawler.

    It simplifies the process of blocking specific HTTP requests during page navigation.
    The function allows blocking both default resource types (like images, fonts, stylesheets) and custom URL patterns.
    """

    async def __call__(
        self, url_patterns: list[str] | None = None, extra_url_patterns: list[str] | None = None
    ) -> None:
        """Block network requests matching the given URL patterns.

        Args:
            url_patterns: List of URL patterns to block. If None, uses default patterns.
            extra_url_patterns: Additional URL patterns to append to the main patterns list.
        """


@dataclass(frozen=True)
class PlaywrightHttpResponse:
    """Wrapper class for playwright `Response` and `APIResponse` objects to implement `HttpResponse` protocol.

    The body is read eagerly when the wrapper is built, so `read` and `read_stream` never touch Playwright again.
    """

    http_version: str
    status_code: int
    headers: HttpHeaders
    _content: bytes

    async def read(self) -> bytes:
        """Return the complete, already buffered response body."""
        return self._content

    async def read_stream(self) -> AsyncGenerator[bytes, None]:
        """Yield the buffered response body as a single chunk."""
        # Playwright does not support `streaming` responses.
        # This is a workaround to make it compatible with `HttpResponse` protocol.
        yield self._content

    @classmethod
    async def from_playwright_response(cls, response: Response | APIResponse, protocol: str) -> Self:
        """Build the wrapper from a Playwright response, buffering its body.

        Args:
            response: The Playwright `Response` or `APIResponse` to wrap.
            protocol: The HTTP protocol version to report. It cannot be obtained from `Response` and therefore
                has to be passed as an additional argument.

        Returns:
            A new instance holding the status code, headers and body of `response`.
        """
        try:
            headers = HttpHeaders(response.headers)
            status_code = response.status
            content = await response.body()
        finally:
            # If not called then the body will stay in memory until the context closes. Doing it in `finally`
            # releases the `APIResponse` even when reading the response fails.
            if isinstance(response, APIResponse):
                await response.dispose()

        return cls(http_version=protocol, status_code=status_code, headers=headers, _content=content)


class GotoOptions(TypedDict):
    """Keyword arguments for Playwright's `Page.goto()` method.

    Both keys are optional (`NotRequired`).
    """

    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
    """When to consider operation succeeded, defaults to 'load' event."""

    referer: NotRequired[str]
    """Referer header value."""


================================================
FILE: src/crawlee/crawlers/_playwright/_utils.py
================================================
from __future__ import annotations

import asyncio
from contextlib import suppress
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from playwright.async_api import Page
    from playwright.async_api import Request as PlaywrightRequest

# URL patterns blocked by `block_requests` when the caller does not supply `url_patterns`.
_DEFAULT_BLOCK_REQUEST_URL_PATTERNS = [
    '.css',
    '.webp',
    '.jpg',
    '.jpeg',
    '.png',
    '.svg',
    '.gif',
    '.woff',
    '.pdf',
    '.zip',
]


async def infinite_scroll(page: Page) -> None:
    """Scroll to the bottom of a page, handling loading of additional items.

    The page is scrolled repeatedly by its current `document.body.scrollHeight`. Concurrently, a background task
    samples (with a one-second pause between samples) how many XHR/fetch/websocket/other requests the page has
    issued; scrolling stops once that number stayed unchanged for `match_count_threshold` consecutive samples.

    Args:
        page: The Playwright page to scroll.
    """
    finished = False

    match_count = 0
    match_count_threshold = 4

    old_request_count = 0
    new_request_count = 0

    def track_request(request: PlaywrightRequest) -> None:
        # Only requests that typically load additional content are counted.
        if request.resource_type in ['xhr', 'fetch', 'websocket', 'other']:
            nonlocal new_request_count
            new_request_count += 1

    async def scroll() -> None:
        body_scroll_height = await page.evaluate('() => document.body.scrollHeight')

        # Fall back to a fixed distance when the page reports a falsy scroll height.
        delta = body_scroll_height or 10000
        await page.mouse.wheel(delta_x=0, delta_y=delta)

    async def check_finished() -> None:
        nonlocal old_request_count, new_request_count, match_count, finished

        while True:
            if old_request_count == new_request_count:
                match_count += 1

                if match_count >= match_count_threshold:
                    finished = True
                    return
            else:
                # New requests were seen since the last sample; restart the quiet-period counter.
                match_count = 0
                old_request_count = new_request_count

            await asyncio.sleep(1)

    page.on('request', track_request)
    check_task = asyncio.create_task(check_finished(), name='infinite_scroll_check_finished_task')

    try:
        while not finished:
            await scroll()
            await page.wait_for_timeout(250)
    finally:
        try:
            if not check_task.done():
                check_task.cancel()
            with suppress(asyncio.CancelledError):
                await check_task
        finally:
            # Detach the request counter so the listener does not outlive this call and pile up on the page.
            page.remove_listener('request', track_request)


async def block_requests(
    page: Page, url_patterns: list[str] | None = None, extra_url_patterns: list[str] | None = None
) -> None:
    """Prevent the page from loading resources whose URL matches the given patterns.

    In Chromium the patterns are handed to the browser via the `Network.setBlockedURLs` CDP command. In any other
    browser they are translated into Playwright route globs whose handlers abort the request.

    Args:
        page: Playwright Page object to block requests on.
        url_patterns: List of URL patterns to block. If None, uses default patterns.
        extra_url_patterns: Additional URL patterns to append to the main patterns list.
    """
    patterns = [*(url_patterns or _DEFAULT_BLOCK_REQUEST_URL_PATTERNS), *(extra_url_patterns or [])]

    browser = page.context.browser
    browser_name = browser.browser_type.name if browser else 'undefined'

    if browser_name != 'chromium':
        # Split the patterns into file extensions ('.css', '*.png') and everything else.
        extension_prefixes = ('*.', '.')
        extensions = []
        specific_files = []
        for pattern in patterns:
            if pattern.startswith(extension_prefixes):
                extensions.append(pattern.strip('*.'))
            else:
                specific_files.append(pattern)

        if extensions:
            await page.route(f'**/*.{{{",".join(extensions)}}}*', lambda route, _: route.abort())

        if specific_files:
            await page.route(f'**/{{{",".join(specific_files)}}}*', lambda route, _: route.abort())

        return

    cdp_session = await page.context.new_cdp_session(page)

    await cdp_session.send('Network.enable')
    await cdp_session.send('Network.setBlockedURLs', {'urls': patterns})


================================================
FILE: src/crawlee/crawlers/_types.py
================================================
from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class BlockedInfo:
    """Outcome of a blocking check; an empty `reason` means the crawling is not blocked."""

    reason: str

    def __bool__(self) -> bool:
        """Evaluate as truthy only when a blocking reason is present."""
        if self.reason:
            return True
        return False


================================================
FILE: src/crawlee/crawlers/py.typed
================================================


================================================
FILE: src/crawlee/errors.py
================================================
from __future__ import annotations

from typing import Generic

from typing_extensions import TypeVar

from crawlee._types import BasicCrawlingContext
from crawlee._utils.docs import docs_group

# Names exported by `from crawlee.errors import *`.
# NOTE(review): `UserHandlerTimeoutError` is defined in this module but is not listed here - confirm it is intentional.
__all__ = [
    'ContextPipelineFinalizationError',
    'ContextPipelineInitializationError',
    'ContextPipelineInterruptedError',
    'HttpClientStatusCodeError',
    'HttpStatusCodeError',
    'ProxyError',
    'RequestCollisionError',
    'RequestHandlerError',
    'ServiceConflictError',
    'SessionError',
    'UserDefinedErrorHandlerError',
]

# Type variable for the crawling context carried by `RequestHandlerError`; defaults to `BasicCrawlingContext`.
TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)


@docs_group('Errors')
class UserDefinedErrorHandlerError(Exception):
    """Wraps an exception thrown from a user-defined error handler."""


class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
    """Raised when a router fails due to a timeout raised by the user.

    This is different from a user-defined handler timing out.
    """


@docs_group('Errors')
class SessionError(Exception):
    """Signals a session-related failure; errors of `SessionError` type will trigger a session rotation.

    This error doesn't respect the `max_request_retries` option and has a separate limit of `max_session_rotations`.
    """


@docs_group('Errors')
class ServiceConflictError(Exception):
    """Signals an attempt to replace a service in the service container while it is already in use."""

    def __init__(self, service: type, new_value: object, existing_value: object) -> None:
        """Build the error message from the service type and the two conflicting values.

        Args:
            service: The service type whose reassignment was attempted.
            new_value: The value that was supposed to replace the existing one.
            existing_value: The value currently in use.
        """
        message = (
            f'Service {service.__name__} is already in use. Existing value: {existing_value}, '
            f'attempted new value: {new_value}.'
        )
        super().__init__(message)


@docs_group('Errors')
class ProxyError(SessionError):
    """Raised when a proxy is being blocked or malfunctions.

    It is a subclass of `SessionError`.
    """


@docs_group('Errors')
class HttpStatusCodeError(Exception):
    """Signals that a response came back with a status code that indicates an error."""

    def __init__(self, message: str, status_code: int) -> None:
        """Store the raw message and status code and compose the full exception text.

        Args:
            message: Description of the failure, without the status code.
            status_code: The HTTP status code of the response.
        """
        self.message = message
        self.status_code = status_code
        full_message = f'{message} (status code: {status_code}).'
        super().__init__(full_message)


@docs_group('Errors')
class HttpClientStatusCodeError(HttpStatusCodeError):
    """Raised when the response status code indicates a client error."""


@docs_group('Errors')
class RequestHandlerError(Exception, Generic[TCrawlingContext]):
    """Carries an exception raised by a request handler (router) together with its crawling context."""

    def __init__(self, wrapped_exception: Exception, crawling_context: TCrawlingContext) -> None:
        """Keep references to the original exception and the context it was raised in.

        Args:
            wrapped_exception: The exception raised by the request handler.
            crawling_context: The crawling context the handler was running with.
        """
        self.crawling_context = crawling_context
        self.wrapped_exception = wrapped_exception
        super().__init__()


@docs_group('Errors')
class ContextPipelineInitializationError(Exception):
    """Carries an exception raised while a context pipeline middleware was initializing.

    The complete context may not be available at this point, so only `BasicCrawlingContext` is provided.
    """

    def __init__(self, wrapped_exception: Exception, crawling_context: BasicCrawlingContext) -> None:
        """Keep references to the original exception and the context available at the time.

        Args:
            wrapped_exception: The exception raised in the initialization step.
            crawling_context: The crawling context available when the exception was raised.
        """
        self.crawling_context = crawling_context
        self.wrapped_exception = wrapped_exception
        super().__init__()


@docs_group('Errors')
class ContextPipelineFinalizationError(Exception):
    """Carries an exception raised while a context pipeline middleware was finalizing.

    The complete context may not be available at this point, so only `BasicCrawlingContext` is provided.
    """

    def __init__(self, wrapped_exception: Exception, crawling_context: BasicCrawlingContext) -> None:
        """Keep references to the original exception and the context available at the time.

        Args:
            wrapped_exception: The exception raised in the finalization step.
            crawling_context: The crawling context available when the exception was raised.
        """
        self.crawling_context = crawling_context
        self.wrapped_exception = wrapped_exception
        super().__init__()


@docs_group('Errors')
class ContextPipelineInterruptedError(Exception):
    """May be raised in the initialization phase of a middleware to signal that the request should not be processed."""


@docs_group('Errors')
class RequestCollisionError(Exception):
    """Raised when a request cannot be processed due to a conflict with required resources.

    It carries no data beyond the standard exception arguments.
    """


================================================
FILE: src/crawlee/events/__init__.py
================================================
from ._event_manager import EventManager
from ._local_event_manager import LocalEventManager
from ._types import (
    Event,
    EventAbortingData,
    EventCrawlerStatusData,
    EventData,
    EventExitData,
    EventListener,
    EventMigratingData,
    EventPersistStateData,
    EventSystemInfoData,
)

# Public names re-exported by the `crawlee.events` package.
__all__ = [
    'Event',
    'EventAbortingData',
    'EventCrawlerStatusData',
    'EventData',
    'EventExitData',
    'EventListener',
    'EventManager',
    'EventMigratingData',
    'EventPersistStateData',
    'EventSystemInfoData',
    'LocalEventManager',
]


================================================
FILE: src/crawlee/events/_event_manager.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/events/event_manager.ts

from __future__ import annotations

import asyncio
import inspect
from collections import defaultdict
from datetime import timedelta
from functools import wraps
from logging import getLogger
from typing import TYPE_CHECKING, Any, Literal, TypedDict, cast, overload

from pyee.asyncio import AsyncIOEventEmitter

from crawlee._utils.context import ensure_context
from crawlee._utils.docs import docs_group
from crawlee._utils.recurring_task import RecurringTask
from crawlee._utils.wait import wait_for_all_tasks_for_finish
from crawlee.events._types import (
    Event,
    EventAbortingData,
    EventCrawlerStatusData,
    EventExitData,
    EventListener,
    EventMigratingData,
    EventPersistStateData,
    EventSystemInfoData,
)

if TYPE_CHECKING:
    from collections.abc import Awaitable, Callable
    from types import TracebackType

    from typing_extensions import NotRequired

    from crawlee.events._types import EventData, WrappedListener

# Module-level logger used by `EventManager` for listener diagnostics and errors.
logger = getLogger(__name__)


class EventManagerOptions(TypedDict):
    """Arguments for the `EventManager` constructor.

    It is intended for typing forwarded `__init__` arguments in the subclasses. The keys mirror the keyword-only
    parameters of `EventManager.__init__` and are all optional (`NotRequired`).
    """

    persist_state_interval: NotRequired[timedelta]
    """Interval between emitted `PersistState` events to maintain state persistence."""

    close_timeout: NotRequired[timedelta | None]
    """Optional timeout for canceling pending event listeners if they exceed this duration."""


@docs_group('Event managers')
class EventManager:
    """Manage events and their listeners, enabling registration, emission, and execution control.

    It allows for registering event listeners, emitting events, and ensuring all listeners complete their execution.
    Built on top of `pyee.asyncio.AsyncIOEventEmitter`. It implements additional features such as waiting for all
    listeners to complete and emitting `PersistState` events at regular intervals.
    """

    def __init__(
        self,
        *,
        persist_state_interval: timedelta = timedelta(minutes=1),
        close_timeout: timedelta | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            persist_state_interval: Interval between emitted `PersistState` events to maintain state persistence.
            close_timeout: Optional timeout for canceling pending event listeners if they exceed this duration.
        """
        self._persist_state_interval = persist_state_interval
        self._close_timeout = close_timeout

        # Asynchronous event emitter for handle events and invoke the event listeners.
        self._event_emitter = AsyncIOEventEmitter()

        # Listeners are wrapped inside asyncio.Task. Store their references here so that we can wait for them to finish.
        self._listener_tasks: set[asyncio.Task] = set()

        # Store the mapping between events, listeners and their wrappers in the following way:
        #   event -> listener -> [wrapped_listener_1, wrapped_listener_2, ...]
        self._listeners_to_wrappers: dict[Event, dict[EventListener[Any], list[WrappedListener]]] = defaultdict(
            lambda: defaultdict(list),
        )

        # Recurring task for emitting persist state events.
        self._emit_persist_state_event_rec_task = RecurringTask(
            func=self._emit_persist_state_event,
            delay=self._persist_state_interval,
        )

        # Flag to indicate the context state.
        self._active = False

    @property
    def active(self) -> bool:
        """Indicate whether the context is active."""
        return self._active

    async def __aenter__(self) -> EventManager:
        """Initialize the event manager upon entering the async context.

        Raises:
            RuntimeError: If the context manager is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        self._active = True
        self._emit_persist_state_event_rec_task.start()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Close the event manager upon exiting the async context.

        This will stop listening for the events, and it will wait for all the event listeners to finish.
        The listener registrations are dropped and the manager is marked inactive even if the shutdown
        sequence itself raises.

        Raises:
            RuntimeError: If the context manager is not active.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        try:
            # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
            await self._emit_persist_state_event_rec_task.stop()
            await self._emit_persist_state_event()
            await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
        finally:
            # Always release the registrations and leave the active state, so that a failed shutdown does not
            # leave the manager permanently "active" and impossible to re-enter.
            self._event_emitter.remove_all_listeners()
            self._listener_tasks.clear()
            self._listeners_to_wrappers.clear()
            self._active = False

    @overload
    def on(self, *, event: Literal[Event.PERSIST_STATE], listener: EventListener[EventPersistStateData]) -> None: ...
    @overload
    def on(self, *, event: Literal[Event.SYSTEM_INFO], listener: EventListener[EventSystemInfoData]) -> None: ...
    @overload
    def on(self, *, event: Literal[Event.MIGRATING], listener: EventListener[EventMigratingData]) -> None: ...
    @overload
    def on(self, *, event: Literal[Event.ABORTING], listener: EventListener[EventAbortingData]) -> None: ...
    @overload
    def on(self, *, event: Literal[Event.EXIT], listener: EventListener[EventExitData]) -> None: ...
    @overload
    def on(self, *, event: Literal[Event.CRAWLER_STATUS], listener: EventListener[EventCrawlerStatusData]) -> None: ...
    @overload
    def on(self, *, event: Event, listener: EventListener[None]) -> None: ...

    def on(self, *, event: Event, listener: EventListener[Any]) -> None:
        """Register an event listener for a specific event.

        Args:
            event: The event for which to listen to.
            listener: The function (sync or async) which is to be called when the event is emitted.
        """
        signature = inspect.signature(listener)

        @wraps(cast('Callable[..., None | Awaitable[None]]', listener))
        async def listener_wrapper(event_data: EventData) -> None:
            try:
                bound_args = signature.bind(event_data)
            except TypeError:  # Parameterless listener
                bound_args = signature.bind()

            # If the listener is a coroutine function, just call it, otherwise, run it in a separate thread
            # to avoid blocking the event loop
            coro = (
                listener(*bound_args.args, **bound_args.kwargs)
                if inspect.iscoroutinefunction(listener)
                else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
            )

            # Callable objects (e.g. `functools.partial`) may lack `__name__`; fall back to their class name.
            listener_name = getattr(listener, '__name__', listener.__class__.__name__)
            listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')
            self._listener_tasks.add(listener_task)

            try:
                logger.debug('EventManager.on.listener_wrapper(): Awaiting listener task...')
                await listener_task
                logger.debug('EventManager.on.listener_wrapper(): Listener task completed.')
            except Exception:
                # We need to swallow the exception and just log it here, otherwise it could break the event emitter
                logger.exception(
                    'Exception in the event listener',
                    extra={
                        'event_name': event.value,
                        'listener_name': listener_name,
                    },
                )
            finally:
                logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
                # `discard` instead of `remove`: `__aexit__` may already have cleared the set (for example after
                # the close timeout cancelled the listeners), in which case `remove` would raise `KeyError`.
                self._listener_tasks.discard(listener_task)

        self._listeners_to_wrappers[event][listener].append(listener_wrapper)
        self._event_emitter.add_listener(event.value, listener_wrapper)

    def off(self, *, event: Event, listener: EventListener[Any] | None = None) -> None:
        """Remove a specific listener or all listeners for an event.

        Args:
            event: The Actor event for which to remove listeners.
            listener: The listener which is supposed to be removed. If not passed, all listeners of this event
                are removed.
        """
        if listener:
            for listener_wrapper in self._listeners_to_wrappers[event][listener]:
                self._event_emitter.remove_listener(event.value, listener_wrapper)
            self._listeners_to_wrappers[event][listener] = []
        else:
            self._listeners_to_wrappers[event] = defaultdict(list)
            self._event_emitter.remove_all_listeners(event.value)

    @overload
    def emit(self, *, event: Literal[Event.PERSIST_STATE], event_data: EventPersistStateData) -> None: ...
    @overload
    def emit(self, *, event: Literal[Event.SYSTEM_INFO], event_data: EventSystemInfoData) -> None: ...
    @overload
    def emit(self, *, event: Literal[Event.MIGRATING], event_data: EventMigratingData) -> None: ...
    @overload
    def emit(self, *, event: Literal[Event.ABORTING], event_data: EventAbortingData) -> None: ...
    @overload
    def emit(self, *, event: Literal[Event.EXIT], event_data: EventExitData) -> None: ...
    @overload
    def emit(self, *, event: Literal[Event.CRAWLER_STATUS], event_data: EventCrawlerStatusData) -> None: ...
    @overload
    def emit(self, *, event: Event, event_data: Any) -> None: ...

    @ensure_context
    def emit(self, *, event: Event, event_data: EventData) -> None:
        """Emit an event with the associated data to all registered listeners.

        Args:
            event: The event which will be emitted.
            event_data: The data which will be passed to the event listeners.
        """
        self._event_emitter.emit(event.value, event_data)

    @ensure_context
    async def wait_for_all_listeners_to_complete(self, *, timeout: timedelta | None = None) -> None:
        """Wait for all currently executing event listeners to complete.

        Args:
            timeout: The maximum time to wait for the event listeners to finish. If they do not complete within
                the specified timeout, they will be canceled.
        """

        async def wait_for_listeners() -> None:
            """Gathers all listener tasks and awaits their completion, logging any exceptions encountered."""
            results = await asyncio.gather(*self._listener_tasks, return_exceptions=True)
            for result in results:
                if isinstance(result, Exception):
                    logger.exception('Event listener raised an exception.', exc_info=result)

        tasks = [asyncio.create_task(wait_for_listeners(), name=f'Task-{wait_for_listeners.__name__}')]

        await wait_for_all_tasks_for_finish(tasks=tasks, logger=logger, timeout=timeout)

    async def _emit_persist_state_event(self) -> None:
        """Emit a `PersistState` event with `is_migrating` set to `False`."""
        self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))


================================================
FILE: src/crawlee/events/_local_event_manager.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/events/local_event_manager.ts

from __future__ import annotations

import asyncio
from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group
from crawlee._utils.recurring_task import RecurringTask
from crawlee._utils.system import get_cpu_info, get_memory_info
from crawlee.configuration import Configuration
from crawlee.events._event_manager import EventManager, EventManagerOptions
from crawlee.events._types import Event, EventSystemInfoData

if TYPE_CHECKING:
    from types import TracebackType

    from typing_extensions import Unpack

logger = getLogger(__name__)


@docs_group('Event managers')
class LocalEventManager(EventManager):
    """Event manager for local environments.

    It extends the `EventManager` to emit `SystemInfo` events at regular intervals. The `LocalEventManager`
    is intended to be used in local environments, where the system metrics are required for managing
    the `Snapshotter` and `AutoscaledPool`.
    """

    def __init__(
        self,
        system_info_interval: timedelta = timedelta(seconds=1),
        **event_manager_options: Unpack[EventManagerOptions],
    ) -> None:
        """Initialize a new instance.

        In most cases, you should use the `from_config` constructor to create a new instance based on
        the provided configuration.

        Args:
            system_info_interval: Interval at which `SystemInfo` events are emitted.
            event_manager_options: Additional options for the parent class.
        """
        self._system_info_interval = system_info_interval

        # Recurring task for emitting system info events. It is only created here; `__aenter__` starts it.
        self._emit_system_info_event_rec_task = RecurringTask(
            func=self._emit_system_info_event,
            delay=self._system_info_interval,
        )

        super().__init__(**event_manager_options)

    @classmethod
    def from_config(cls, config: Configuration | None = None) -> LocalEventManager:
        """Initialize a new instance based on the provided `Configuration`.

        Args:
            config: The `Configuration` instance. Uses the global (default) one if not provided.
        """
        config = config or Configuration.get_global_configuration()

        return cls(
            system_info_interval=config.system_info_interval,
            persist_state_interval=config.persist_state_interval,
        )

    async def __aenter__(self) -> LocalEventManager:
        """Initialize the local event manager upon entering the async context.

        It starts emitting system info events at regular intervals.
        """
        await super().__aenter__()
        self._emit_system_info_event_rec_task.start()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Close the local event manager upon exiting the async context.

        It stops emitting system info events and closes the event manager. The parent's cleanup runs even
        if stopping the recurring task fails.
        """
        try:
            await self._emit_system_info_event_rec_task.stop()
        finally:
            # Always close the parent event manager, otherwise a failure (or cancellation) while stopping
            # the recurring task would leave the base event manager in its entered state.
            await super().__aexit__(exc_type, exc_value, exc_traceback)

    async def _emit_system_info_event(self) -> None:
        """Emit a system info event with the current CPU and memory usage."""
        # The metric getters are run in worker threads so they do not block the event loop.
        cpu_info = await asyncio.to_thread(get_cpu_info)
        memory_info = await asyncio.to_thread(get_memory_info)

        event_data = EventSystemInfoData(cpu_info=cpu_info, memory_info=memory_info)
        self.emit(event=Event.SYSTEM_INFO, event_data=event_data)


================================================
FILE: src/crawlee/events/_types.py
================================================
from __future__ import annotations

from collections.abc import Callable, Coroutine
from enum import Enum
from typing import Annotated, Any, TypeVar

from pydantic import BaseModel, ConfigDict, Field

from crawlee._utils.docs import docs_group
from crawlee._utils.models import timedelta_secs
from crawlee._utils.system import CpuInfo, MemoryUsageInfo


@docs_group('Event data')
class Event(str, Enum):
    """Names of all possible events that can be emitted using an `EventManager`.

    Every member is also a `str`; its value is the camelCase event name.
    """

    # Core events
    PERSIST_STATE = 'persistState'
    SYSTEM_INFO = 'systemInfo'
    MIGRATING = 'migrating'
    ABORTING = 'aborting'
    EXIT = 'exit'

    # Session pool events
    SESSION_RETIRED = 'sessionRetired'

    # Browser pool events
    BROWSER_LAUNCHED = 'browserLaunched'
    BROWSER_RETIRED = 'browserRetired'
    BROWSER_CLOSED = 'browserClosed'
    PAGE_CREATED = 'pageCreated'
    PAGE_CLOSED = 'pageClosed'

    # State events
    CRAWLER_STATUS = 'crawlerStatus'


@docs_group('Event data')
class EventPersistStateData(BaseModel):
    """Data for the persist state event.

    The field can be populated by its Python name or by its camelCase alias.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # Migration flag carried by the event; serialized under the `isMigrating` alias.
    is_migrating: Annotated[bool, Field(alias='isMigrating')]


@docs_group('Event data')
class EventSystemInfoData(BaseModel):
    """Data for the system info event.

    Fields can be populated by their Python names or by their camelCase aliases.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # CPU information attached to the event (alias `cpuInfo`).
    cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
    # Memory usage information attached to the event (alias `memoryInfo`).
    memory_info: Annotated[
        MemoryUsageInfo,
        Field(alias='memoryInfo'),
    ]


@docs_group('Event data')
class EventMigratingData(BaseModel):
    """Data for the migrating event.

    The field can be populated by its Python name or by its `timeRemainingSecs` alias.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # The remaining time in seconds before the migration is forced and the process is killed.
    # Optional (defaults to `None`) because it's not present when the event handler is called manually.
    time_remaining: Annotated[timedelta_secs | None, Field(alias='timeRemainingSecs')] = None


@docs_group('Event data')
class EventAbortingData(BaseModel):
    """Data for the aborting event.

    The model currently declares no fields.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


@docs_group('Event data')
class EventExitData(BaseModel):
    """Data for the exit event.

    The model currently declares no fields.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


@docs_group('Event data')
class EventCrawlerStatusData(BaseModel):
    """Data for the crawler status event.

    Unlike the other event payloads in this module, the fields here declare no aliases.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    message: str
    """A message describing the current status of the crawler."""

    crawler_id: int
    """The ID of the crawler that emitted the event."""


# Union of every payload model defined in this module.
EventData = (
    EventPersistStateData
    | EventSystemInfoData
    | EventMigratingData
    | EventAbortingData
    | EventExitData
    | EventCrawlerStatusData
)
"""A helper type for all possible event payloads"""

# An async callable accepting arbitrary arguments and resolving to `None`.
# NOTE(review): presumably the shape user listeners are wrapped into before registration — confirm in `EventManager`.
WrappedListener = Callable[..., Coroutine[Any, Any, None]]

# Type variable for the payload type accepted by a one-argument `EventListener`.
TEvent = TypeVar('TEvent')
EventListener = (
    Callable[
        [TEvent],
        None | Coroutine[Any, Any, None],
    ]
    | Callable[
        [],
        None | Coroutine[Any, Any, None],
    ]
)
"""An event listener function - it can be both sync and async and may accept zero or one argument."""


================================================
FILE: src/crawlee/events/py.typed
================================================


================================================
FILE: src/crawlee/fingerprint_suite/__init__.py
================================================
from ._browserforge_adapter import BrowserforgeFingerprintGenerator as DefaultFingerprintGenerator
from ._fingerprint_generator import FingerprintGenerator
from ._header_generator import HeaderGenerator
from ._types import HeaderGeneratorOptions, ScreenOptions

# Public API of the fingerprint suite. `DefaultFingerprintGenerator` is the import alias of
# `BrowserforgeFingerprintGenerator` declared above.
__all__ = [
    'DefaultFingerprintGenerator',
    'FingerprintGenerator',
    'HeaderGenerator',
    'HeaderGeneratorOptions',
    'ScreenOptions',
]


================================================
FILE: src/crawlee/fingerprint_suite/_browserforge_adapter.py
================================================
from __future__ import annotations

import random
from collections.abc import Iterable
from copy import deepcopy
from functools import reduce
from operator import or_
from typing import TYPE_CHECKING, Any, Literal

import apify_fingerprint_datapoints
from browserforge.bayesian_network import extract_json
from browserforge.fingerprints import Fingerprint as bf_Fingerprint
from browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator
from browserforge.fingerprints import Screen
from browserforge.headers.generator import HeaderGenerator as bf_HeaderGenerator
from browserforge.headers.generator import ListOrString
from typing_extensions import override

from crawlee._utils.docs import docs_group

from ._consts import BROWSER_TYPE_HEADER_KEYWORD
from ._fingerprint_generator import FingerprintGenerator

if TYPE_CHECKING:
    from browserforge.headers import Browser

    from ._types import HeaderGeneratorOptions, ScreenOptions, SupportedBrowserType


class PatchedHeaderGenerator(bf_HeaderGenerator):
    """Browserforge `HeaderGenerator` that contains patches specific for our usage of the generator."""

    def _get_accept_language_header(self, locales: tuple[str, ...] | list[str] | str) -> str:
        """Generate the Accept-Language header based on the given locales.

        Patched version due to PR of upstream repo not being merged: https://github.com/daijro/browserforge/pull/24

        Args:
            locales: Locale(s).

        Returns:
            Accept-Language header string.
        """
        # Convert to tuple if needed for consistent handling.
        if isinstance(locales, str):
            locales_tuple: tuple[str, ...] = (locales,)
        elif isinstance(locales, list):
            locales_tuple = tuple(locales)
        else:
            locales_tuple = locales

        # First locale does not include quality factor, q=1 is considered as implicit. Each further locale gets
        # a weight lower by 0.1. The weight is clamped at 0.1, because with more than 10 locales it would
        # otherwise reach zero ("not acceptable") or turn negative, which is not a valid quality value.
        additional_locales = [
            f'{locale};q={max(0.1, 0.9 - index * 0.1):.1f}' for index, locale in enumerate(locales_tuple[1:])
        ]
        return ','.join((locales_tuple[0], *additional_locales))

    def generate(
        self,
        *,
        browser: Iterable[str | Browser] | None = None,
        os: ListOrString | None = None,
        device: ListOrString | None = None,
        locale: ListOrString | None = None,
        http_version: Literal[1, 2] | None = None,
        user_agent: ListOrString | None = None,
        strict: bool | None = None,
        request_dependent_headers: dict[str, str] | None = None,
    ) -> dict[str, str]:
        """Generate HTTP headers based on the specified parameters.

        For detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate`
        This patched version of the method adds additional quality checks on the output of the original method. It tries
        to generate headers several times until they match the requirements.

        Raises:
            RuntimeError: If no generated headers satisfied the checks within the allowed number of attempts.

        Returns:
            The generated headers.
        """
        # browserforge header generation can be flaky. Enforce basic QA on generated headers
        max_attempts = 10

        single_browser = self._get_single_browser_type(browser)

        if single_browser == 'chrome':
            # `BrowserForge` header generator considers `chrome` in general sense and therefore will generate also
            # other `chrome` based browser headers. This adapter desires only specific subset of `chrome` headers
            # that contain all 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform' headers.
            # Increase max attempts as from `BrowserForge` header generator perspective even `chromium`
            # headers without `sec-...` headers are valid.
            max_attempts += 50

        # Use browserforge to generate headers until it satisfies our additional requirements.
        for _attempt in range(max_attempts):
            generated_header: dict[str, str] = super().generate(
                browser=single_browser,
                os=os,
                device=device,
                locale=locale,
                http_version=http_version,
                user_agent=user_agent,
                strict=strict,
                request_dependent_headers=request_dependent_headers,
            )

            # Read the header once with a default, so that a sample without `User-Agent` is rejected and retried
            # like any other unsuitable sample instead of failing with `KeyError`.
            generated_user_agent = generated_header.get('User-Agent', '')

            if ('headless' in generated_user_agent.lower()) or (
                'headless' in generated_header.get('sec-ch-ua', '').lower()
            ):
                # It can be a valid header, but we never want to leak "headless". Get a different one.
                continue

            if any(keyword in generated_user_agent for keyword in self._get_expected_browser_keywords(single_browser)):
                if single_browser == 'chrome' and not self._contains_all_sec_headers(generated_header):
                    # Accept chromium header only with all sec headers.
                    continue

                return generated_header
        raise RuntimeError('Failed to generate header.')

    def _contains_all_sec_headers(self, headers: dict[str, str]) -> bool:
        """Tell whether all of the `sec-ch-ua`, `sec-ch-ua-mobile` and `sec-ch-ua-platform` headers are present."""
        return all(header_name in headers for header_name in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'))

    def _get_expected_browser_keywords(self, browser: str | None) -> set[str]:
        """Return the `User-Agent` keywords accepted for the given browser type (all keywords when `None`)."""
        if not browser:
            # Allow all possible keywords when there is no preference for specific browser type.
            return reduce(or_, BROWSER_TYPE_HEADER_KEYWORD.values())

        return BROWSER_TYPE_HEADER_KEYWORD[browser]

    def _get_single_browser_type(self, browser: Iterable[str | Browser] | None) -> str | None:
        """Get single browser type.

        Browserforge header generator accepts wider range of possible types.
        Narrow it to single optional string as that is how we use it.
        Handling the original multitype would be pointlessly complex.

        Raises:
            ValueError: If the browser picked from an iterable is not one of the supported browser types.
        """
        # In our case we never pass more than one browser type. In general case more browsers are just bigger pool to
        # select from, so narrowing it to any of them is still a valid action as we are going to pick just one anyway.
        if isinstance(browser, str):
            return browser
        if isinstance(browser, Iterable):
            choice = random.choice(
                [
                    single_browser if isinstance(single_browser, str) else single_browser.name
                    for single_browser in browser
                ]
            )
            if choice in {'chrome', 'firefox', 'safari', 'edge'}:
                return choice
            raise ValueError('Invalid browser type.')
        return None


class PatchedFingerprintGenerator(bf_FingerprintGenerator):
    """Browserforge `FingerprintGenerator` carrying patches that were not accepted by the upstream repo."""

    def __init__(
        self,
        *,
        screen: Screen | None = None,
        strict: bool = False,
        mock_webrtc: bool = False,
        slim: bool = False,
        **header_kwargs,  # noqa:ANN003 # Upstream repo types missing.
    ) -> None:
        """Initialize a new instance.

        Args:
            screen: Screen constraints applied to the generated fingerprint.
            strict: Whether to raise an exception when the constraints are too strict.
            mock_webrtc: Whether WebRTC should be mocked when the fingerprint is injected.
            slim: Turns off performance-heavy evasions when the fingerprint is injected.
            **header_kwargs: Options for header generation, handed over to `PatchedHeaderGenerator`.
        """
        super().__init__(screen=screen, strict=strict, mock_webrtc=mock_webrtc, slim=slim)

        # Swap in our own header generator afterwards, so that `PatchedHeaderGenerator` is used consistently.
        patched_header_generator = PatchedHeaderGenerator(**header_kwargs)
        self.header_generator = patched_header_generator


@docs_group('Other')
class BrowserforgeFingerprintGenerator(FingerprintGenerator):
    """`FingerprintGenerator` adapter for fingerprint generator from `browserforge`.

    `browserforge` is a browser header and fingerprint generator: https://github.com/daijro/browserforge
    """

    def __init__(
        self,
        *,
        header_options: HeaderGeneratorOptions | None = None,
        screen_options: ScreenOptions | None = None,
        mock_web_rtc: bool | None = None,
        slim: bool | None = None,
    ) -> None:
        """Initialize a new instance.

        All generator options are optional. If any value is not specified, then `None` is set in the options.
        Default values for options set to `None` are implementation detail of used fingerprint generator.
        Specific default values should not be relied upon. Use explicit values if it matters for your use case.

        Args:
            header_options: Collection of header related attributes that can be used by the fingerprint generator.
            screen_options: Defines the screen constraints for the fingerprint generator.
            mock_web_rtc: Whether to mock WebRTC when injecting the fingerprint.
            slim: Disables performance-heavy evasions when injecting the fingerprint.
        """
        bf_options: dict[str, Any] = {'mock_webrtc': mock_web_rtc, 'slim': slim}

        bf_header_options: dict[str, Any]
        if header_options is None:
            bf_header_options = {}
        else:
            bf_header_options = deepcopy(header_options.model_dump())
            # Rename our plural option names to the singular keyword names passed on to the generator.
            bf_header_options['browser'] = bf_header_options.pop('browsers', None)
            bf_header_options['os'] = bf_header_options.pop('operating_systems', None)
            bf_header_options['device'] = bf_header_options.pop('devices', None)
            bf_header_options['locale'] = bf_header_options.pop('locales', None)

        if screen_options is None:
            bf_options['screen'] = Screen()
        else:
            bf_options['screen'] = Screen(**screen_options.model_dump())

        # The options are stored and passed to every `generate` call; the generator itself is created bare.
        self._options = {**bf_options, **bf_header_options}
        self._generator = PatchedFingerprintGenerator()

    @override
    def generate(self) -> bf_Fingerprint:
        # browserforge fingerprint generation can be flaky
        # https://github.com/daijro/browserforge/issues/22
        # During test runs around 10 % flakiness was detected.
        # Max attempt set to 10 as (0.1)^10 is considered sufficiently low probability.
        max_attempts = 10
        last_error: ValueError | None = None
        for _attempt in range(max_attempts):
            try:
                return self._generator.generate(**self._options)
            except ValueError as error:  # noqa:PERF203
                # Remember the failure and retry; the last one is attached to the final error below.
                last_error = error
        # All attempts failed. Keep the `RuntimeError` callers already see, but chain the underlying
        # `ValueError` so the real cause is not lost.
        raise RuntimeError('Failed to generate fingerprint.') from last_error


class BrowserforgeHeaderGenerator:
    """Adapter around the patched `browserforge` header generator."""

    def __init__(self) -> None:
        # The wrapped generator is configured with a fixed pair of English locales.
        default_locales = ['en-US', 'en']
        self._generator = PatchedHeaderGenerator(locale=default_locales)

    def generate(self, browser_type: SupportedBrowserType = 'chrome') -> dict[str, str]:
        """Generate headers for the given browser type."""
        requested_browsers = [browser_type]
        return self._generator.generate(browser=requested_browsers)


def get_available_header_network() -> dict:
    """Load the header network that describes the possible header values.

    The network is obtained from `apify_fingerprint_datapoints` and decoded with browserforge's `extract_json`.
    """
    header_network_source = apify_fingerprint_datapoints.get_header_network()
    return extract_json(header_network_source)


def get_available_header_values(header_network: dict, node_name: str | set[str]) -> set[str]:
    """Collect the possible values of a header from the available header network.

    Args:
        header_network: The header network; its `nodes` entries provide `name` and `possibleValues`.
        node_name: A single node name, or a set of acceptable node names.

    Returns:
        The possible values of the first node whose name matches, or an empty set if no node matches.
    """
    wanted_names = node_name if not isinstance(node_name, str) else {node_name}
    # Only the first matching node is considered, in the order the nodes are listed in the network.
    first_match = next((node for node in header_network['nodes'] if node['name'] in wanted_names), None)
    if first_match is None:
        return set()
    return set(first_match['possibleValues'])


================================================
FILE: src/crawlee/fingerprint_suite/_consts.py
================================================
from __future__ import annotations

# NOTE(review): looks like a default `Accept-Language` header value; it is not referenced
# in this module — confirm where it is used.
COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9'

# Maps a browser type to the substrings that identify it. `PatchedHeaderGenerator.generate` accepts generated
# headers only if their `User-Agent` contains at least one keyword of the requested browser type.
BROWSER_TYPE_HEADER_KEYWORD = {
    'chrome': {'Chrome', 'CriOS'},
    'firefox': {'Firefox', 'FxiOS'},
    'edge': {'Edg', 'Edge', 'EdgA', 'EdgiOS'},
    'safari': {'Safari'},
}


================================================
FILE: src/crawlee/fingerprint_suite/_fingerprint_generator.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from browserforge.fingerprints import Fingerprint


@docs_group('Other')
class FingerprintGenerator(ABC):
    """A class for creating browser fingerprints that mimic browser fingerprints of real users.

    This is an abstract base class; concrete generators implement `generate`.
    """

    @abstractmethod
    def generate(self) -> Fingerprint:
        """Generate browser fingerprints.

        This is an experimental feature.
        The return type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and
        most likely it will change to a custom `Fingerprint` class defined in this repo later.
        """


================================================
FILE: src/crawlee/fingerprint_suite/_header_generator.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING, Literal

from crawlee._types import HttpHeaders
from crawlee._utils.docs import docs_group
from crawlee.fingerprint_suite._browserforge_adapter import BrowserforgeHeaderGenerator

if TYPE_CHECKING:
    from crawlee.fingerprint_suite._types import SupportedBrowserType


def fingerprint_browser_type_from_playwright_browser_type(
    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
) -> SupportedBrowserType:
    """Translate a Playwright browser type into the browser type used for fingerprint generation.

    Args:
        playwright_browser_type: The browser type as named by Playwright.

    Raises:
        ValueError: If the given browser type has no fingerprint counterpart.

    Returns:
        The matching fingerprint browser type.
    """
    # Both Chromium flavours map to `chrome`; WebKit is represented by `safari`.
    playwright_to_fingerprint: dict[str, SupportedBrowserType] = {
        'chromium': 'chrome',
        'chrome': 'chrome',
        'firefox': 'firefox',
        'webkit': 'safari',
    }
    fingerprint_browser_type = playwright_to_fingerprint.get(playwright_browser_type)
    if fingerprint_browser_type is None:
        raise ValueError(f'Unsupported browser type: {playwright_browser_type}')
    return fingerprint_browser_type


@docs_group('Other')
class HeaderGenerator:
    """Produce realistic, browser-like HTTP headers."""

    def __init__(self) -> None:
        self._generator = BrowserforgeHeaderGenerator()

    def _select_specific_headers(self, all_headers: dict[str, str], header_names: set[str]) -> HttpHeaders:
        """Keep only the headers whose name (compared exactly) is listed in `header_names`."""
        selected: dict[str, str] = {}
        for name, value in all_headers.items():
            if name in header_names:
                selected[name] = value
        return HttpHeaders(selected)

    def get_specific_headers(
        self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chrome'
    ) -> HttpHeaders:
        """Return the generated headers restricted to the selected `header_names`.

        When `header_names` is empty or not given, the complete set of generated headers is returned.
        """
        generated = self._generator.generate(browser_type=browser_type)

        if header_names:
            return self._select_specific_headers(generated, header_names)
        return HttpHeaders(generated)

    def get_common_headers(self) -> HttpHeaders:
        """Get common HTTP headers ("Accept", "Accept-Language").

        The "Accept-Encoding", "Connection" and other headers are left untouched. They are expected to be
        included and handled by the HTTP client or browser.
        """
        generated = self._generator.generate()
        common_names = {'Accept', 'Accept-Language'}
        return self._select_specific_headers(generated, header_names=common_names)

    def get_random_user_agent_header(self) -> HttpHeaders:
        """Get a random User-Agent header."""
        generated = self._generator.generate()
        return self._select_specific_headers(generated, header_names={'User-Agent'})

    def get_user_agent_header(
        self,
        *,
        browser_type: SupportedBrowserType = 'chrome',
    ) -> HttpHeaders:
        """Get the User-Agent header for the given browser type.

        Raises:
            ValueError: If the browser type is not supported.
        """
        supported_browser_types = {'chrome', 'firefox', 'safari', 'edge'}
        if browser_type not in supported_browser_types:
            raise ValueError(f'Unsupported browser type: {browser_type}')

        generated = self._generator.generate(browser_type=browser_type)
        return self._select_specific_headers(generated, header_names={'User-Agent'})

    def get_sec_ch_ua_headers(
        self,
        *,
        browser_type: SupportedBrowserType = 'chrome',
    ) -> HttpHeaders:
        """Get the sec-ch-ua headers for the given browser type.

        Raises:
            ValueError: If the browser type is not supported.
        """
        supported_browser_types = {'chrome', 'firefox', 'safari', 'edge'}
        if browser_type not in supported_browser_types:
            raise ValueError(f'Unsupported browser type: {browser_type}')

        generated = self._generator.generate(browser_type=browser_type)
        sec_ch_ua_names = {'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'}
        return self._select_specific_headers(generated, header_names=sec_ch_ua_names)


================================================
FILE: src/crawlee/fingerprint_suite/_types.py
================================================
from __future__ import annotations

from typing import Annotated, Literal

from pydantic import BaseModel, ConfigDict, Field

# Literal value sets accepted by the option models below.
SupportedOperatingSystems = Literal['windows', 'macos', 'linux', 'android', 'ios']
SupportedDevices = Literal['desktop', 'mobile']
# NOTE(review): the versions are string literals here — confirm the underlying generator accepts strings.
SupportedHttpVersion = Literal['1', '2']
SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']


class ScreenOptions(BaseModel):
    """Defines the screen constraints for the fingerprint generator.

    Every constraint is optional and defaults to `None`. Unknown fields are rejected, and fields can be
    populated by their Python names or by their camelCase aliases.
    """

    # The class docstring has to be the first statement of the class body, otherwise it is not stored
    # in `__doc__`.
    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

    min_width: Annotated[float | None, Field(alias='minWidth')] = None
    """Minimal screen width constraint for the fingerprint generator."""

    max_width: Annotated[float | None, Field(alias='maxWidth')] = None
    """Maximal screen width constraint for the fingerprint generator."""

    min_height: Annotated[float | None, Field(alias='minHeight')] = None
    """Minimal screen height constraint for the fingerprint generator."""

    max_height: Annotated[float | None, Field(alias='maxHeight')] = None
    """Maximal screen height constraint for the fingerprint generator."""


class HeaderGeneratorOptions(BaseModel):
    """Collection of header related attributes that can be used by the fingerprint generator.

    Every option is optional and defaults to `None`. Unknown fields are rejected, and fields can be
    populated by their Python names or by their camelCase aliases.
    """

    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

    browsers: list[SupportedBrowserType] | None = None
    """List of BrowserSpecifications to generate the headers for."""

    operating_systems: Annotated[list[SupportedOperatingSystems] | None, Field(alias='operatingSystems')] = None
    """List of operating systems to generate the headers for."""

    devices: list[SupportedDevices] | None = None
    """List of devices to generate the headers for."""

    locales: list[str] | None = None
    """List of at most 10 languages to include in the [Accept-Language]
    (https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header
    in the language format accepted by that header, for example `en`, `en-US` or `de`."""

    http_version: Annotated[SupportedHttpVersion | None, Field(alias='httpVersion')] = None
    """HTTP version to be used for header generation (the headers differ depending on the version)."""

    strict: bool | None = None
    """If true, the generator will throw an error if it cannot generate headers based on the input."""


================================================
FILE: src/crawlee/fingerprint_suite/py.typed
================================================


================================================
FILE: src/crawlee/http_clients/__init__.py
================================================
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

# These imports have only mandatory dependencies, so they are imported directly.
from ._base import HttpClient, HttpCrawlingResult, HttpResponse
from ._impit import ImpitHttpClient

# NOTE(review): presumably installs a hook for this package so that names whose optional import failed
# below produce a helpful error on access — confirm in `crawlee._utils.try_import`.
_install_import_hook(__name__)

# The following imports are wrapped in try_import to handle optional dependencies,
# ensuring the module can still function even if these dependencies are missing.
with _try_import(__name__, 'CurlImpersonateHttpClient'):
    from ._curl_impersonate import CurlImpersonateHttpClient

with _try_import(__name__, 'HttpxHttpClient'):
    from ._httpx import HttpxHttpClient


# Public API of the package; it lists the optional clients as well as the always-available ones.
__all__ = [
    'CurlImpersonateHttpClient',
    'HttpClient',
    'HttpCrawlingResult',
    'HttpResponse',
    'HttpxHttpClient',
    'ImpitHttpClient',
]


================================================
FILE: src/crawlee/http_clients/_base.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Protocol

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from collections.abc import AsyncIterator
    from contextlib import AbstractAsyncContextManager
    from datetime import timedelta
    from types import TracebackType

    from crawlee import Request
    from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
    from crawlee.proxy_configuration import ProxyInfo
    from crawlee.sessions import Session
    from crawlee.statistics import Statistics


@docs_group('Other')
class HttpResponse(Protocol):
    """Define the interface that any HTTP response object must implement.

    This is a `Protocol`, so a response class only has to provide these members; it does not need
    to inherit from this class.
    """

    @property
    def http_version(self) -> str:
        """The HTTP version used in the response."""

    @property
    def status_code(self) -> int:
        """The HTTP status code received from the server."""

    @property
    def headers(self) -> HttpHeaders:
        """The HTTP headers received in the response."""

    async def read(self) -> bytes:
        """Read the entire content of the response body.

        This method loads the complete response body into memory at once. It should be used
        for responses received from regular HTTP requests (via `send_request` or `crawl` methods).

        Returns:
            The complete response body.

        Raises:
            RuntimeError: If called on a response received from the `stream` method.
        """

    def read_stream(self) -> AsyncIterator[bytes]:
        """Iterate over the content of the response body in chunks.

        This method should be used for responses received from the `stream` method to process
        large response bodies without loading them entirely into memory. It allows for efficient
        processing of potentially large data by yielding chunks sequentially.

        Returns:
            An asynchronous iterator over the chunks of the response body.

        Raises:
            RuntimeError: If the stream has already been consumed or if the response was not obtained from the `stream`
                method.
        """


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class HttpCrawlingResult:
    """Result of an HTTP-only crawl.

    Mainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,
    `ParselCrawlingContext`, ...).

    Declared as a frozen dataclass, so its fields cannot be reassigned after construction.
    """

    http_response: HttpResponse
    """The HTTP response received from the server."""


@docs_group('HTTP clients')
class HttpClient(ABC):
    """An abstract base class for HTTP clients used in crawlers (`BasicCrawler` subclasses).

    The client is an asynchronous context manager: entering it marks the client as active and exiting it
    runs `cleanup` and marks the client as inactive again.
    """

    @abstractmethod
    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
    ) -> None:
        """Initialize a new instance.

        Args:
            persist_cookies_per_session: Whether to persist cookies per HTTP session.
        """
        self._persist_cookies_per_session = persist_cookies_per_session

        # Flag to indicate the context state.
        self._active = False

    @property
    def active(self) -> bool:
        """Indicate whether the context is active."""
        return self._active

    @abstractmethod
    async def crawl(
        self,
        request: Request,
        *,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        statistics: Statistics | None = None,
        timeout: timedelta | None = None,
    ) -> HttpCrawlingResult:
        """Perform the crawling for a given request.

        This method is called from `crawler.run()`.

        Args:
            request: The request to be crawled.
            session: The session associated with the request.
            proxy_info: The information about the proxy to be used.
            statistics: The statistics object to register status codes.
            timeout: Maximum time allowed to process the request.

        Raises:
            ProxyError: Raised if a proxy-related error occurs.

        Returns:
            The result of the crawling.
        """

    @abstractmethod
    async def send_request(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        timeout: timedelta | None = None,
    ) -> HttpResponse:
        """Send an HTTP request via the client.

        This method is called from `context.send_request()` helper.

        Args:
            url: The URL to send the request to.
            method: The HTTP method to use.
            headers: The headers to include in the request.
            payload: The data to be sent as the request body.
            session: The session associated with the request.
            proxy_info: The information about the proxy to be used.
            timeout: Maximum time allowed to process the request.

        Raises:
            ProxyError: Raised if a proxy-related error occurs.

        Returns:
            The HTTP response received from the server.
        """

    @abstractmethod
    def stream(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        timeout: timedelta | None = None,
    ) -> AbstractAsyncContextManager[HttpResponse]:
        """Stream an HTTP request via the client.

        This method should be used for downloading potentially large data where you need to process
        the response body in chunks rather than loading it entirely into memory.

        Args:
            url: The URL to send the request to.
            method: The HTTP method to use.
            headers: The headers to include in the request.
            payload: The data to be sent as the request body.
            session: The session associated with the request.
            proxy_info: The information about the proxy to be used.
            timeout: The maximum time to wait for establishing the connection.

        Raises:
            ProxyError: Raised if a proxy-related error occurs.

        Returns:
            An async context manager yielding the HTTP response with streaming capabilities.
        """

    @abstractmethod
    async def cleanup(self) -> None:
        """Clean up resources used by the client.

        This method is called when the client is no longer needed and should be overridden
        in subclasses to perform any necessary cleanup such as closing connections,
        releasing file handles, or other resource deallocation.
        """

    async def __aenter__(self) -> HttpClient:
        """Initialize the client when entering the context manager.

        Returns:
            The client itself.

        Raises:
            RuntimeError: If the context manager is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        self._active = True
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        """Deinitialize the client and clean up resources when exiting the context manager.

        Args:
            exc_type: The class of the exception raised inside the context, if any.
            exc_value: The exception instance raised inside the context, if any.
            traceback: The traceback of the exception raised inside the context, if any.

        Raises:
            RuntimeError: If the context manager is not active.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        try:
            await self.cleanup()
        finally:
            # Reset the flag even when `cleanup` fails, otherwise the client would stay marked as active
            # forever and could never be entered again.
            self._active = False


================================================
FILE: src/crawlee/http_clients/_curl_impersonate.py
================================================
from __future__ import annotations

import asyncio
from contextlib import asynccontextmanager
from http.cookiejar import Cookie
from typing import TYPE_CHECKING, Any, cast

from curl_cffi import CurlInfo
from curl_cffi.const import CurlHttpVersion
from curl_cffi.requests import AsyncSession
from curl_cffi.requests.cookies import Cookies as CurlCookies
from curl_cffi.requests.cookies import CurlMorsel
from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
from curl_cffi.requests.exceptions import RequestException as CurlRequestError
from curl_cffi.requests.exceptions import Timeout
from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
from typing_extensions import override

from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
from crawlee._utils.docs import docs_group
from crawlee.errors import ProxyError
from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from datetime import timedelta

    from curl_cffi import Curl
    from curl_cffi.requests import Request as CurlRequest
    from curl_cffi.requests import Response
    from curl_cffi.requests.session import HttpMethod as CurlHttpMethod

    from crawlee import Request
    from crawlee._types import HttpMethod
    from crawlee.proxy_configuration import ProxyInfo
    from crawlee.sessions import Session
    from crawlee.statistics import Statistics


class _EmptyCookies(CurlCookies):
    """Cookie store that never supplies cookies to curl and ignores the cookies curl reports back."""

    @override
    def get_cookies_for_curl(self, request: CurlRequest) -> list[CurlMorsel]:
        """Return an empty list regardless of the request."""
        no_morsels: list[CurlMorsel] = []
        return no_morsels

    @override
    def update_cookies_from_curl(self, morsels: list[CurlMorsel]) -> None:
        """Discard the given morsels without storing anything."""
        return


class _AsyncSession(AsyncSession):
    """`AsyncSession` subclass whose `_cookies` attribute is replaced by an `_EmptyCookies` instance."""

    @override
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialize the parent session with the given arguments and then swap in the empty cookie store."""
        super().__init__(*args, **kwargs)
        # Must run after the parent initializer so that the assignment below is the one that sticks.
        self._cookies = _EmptyCookies()


class _CurlImpersonateResponse:
    """Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol."""

    def __init__(self, response: Response) -> None:
        self._response = response

    @property
    def http_version(self) -> str:
        if self._response.http_version == CurlHttpVersion.NONE:
            return 'NONE'
        if self._response.http_version == CurlHttpVersion.V1_0:
            return 'HTTP/1.0'
        if self._response.http_version == CurlHttpVersion.V1_1:
            return 'HTTP/1.1'
        if self._response.http_version in {
            CurlHttpVersion.V2_0,
            CurlHttpVersion.V2TLS,
            CurlHttpVersion.V2_PRIOR_KNOWLEDGE,
        }:
            return 'HTTP/2'
        if self._response.http_version == CurlHttpVersion.V3:
            return 'HTTP/3'

        raise ValueError(f'Unknown HTTP version: {self._response.http_version}')

    @property
    def status_code(self) -> int:
        return self._response.status_code

    @property
    def headers(self) -> HttpHeaders:
        return HttpHeaders({key: value for key, value in self._response.headers.items() if value})

    async def read(self) -> bytes:
        if self._response.astream_task:
            raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')

        return self._response.content

    async def read_stream(self) -> AsyncGenerator[bytes, None]:
        if not self._response.astream_task:
            raise RuntimeError('Cannot read stream, Response not obtained from `stream` method.')

        if isinstance(self._response.astream_task, asyncio.Future) and self._response.astream_task.done():
            raise RuntimeError('Cannot read stream, it was already consumed.')

        async for chunk in self._response.aiter_content():
            yield chunk


@docs_group('HTTP clients')
class CurlImpersonateHttpClient(HttpClient):
    """HTTP client based on the `curl-cffi` library.

    This client uses the `curl-cffi` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)
    and to manage sessions, proxies, and error handling.

    See the `HttpClient` class for more common information about HTTP clients.

    ### Usage

    ```python
    from crawlee.crawlers import HttpCrawler  # or any other HTTP client-based crawler
    from crawlee.http_clients import CurlImpersonateHttpClient

    http_client = CurlImpersonateHttpClient()
    crawler = HttpCrawler(http_client=http_client)
    ```
    """

    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        **async_session_kwargs: Any,
    ) -> None:
        """Initialize a new instance.

        Args:
            persist_cookies_per_session: Whether to persist cookies per HTTP session.
            async_session_kwargs: Additional keyword arguments for `curl_cffi.requests.AsyncSession`.
        """
        super().__init__(
            persist_cookies_per_session=persist_cookies_per_session,
        )
        self._async_session_kwargs = async_session_kwargs

        self._client_by_proxy_url = dict[str | None, AsyncSession]()

    @override
    async def crawl(
        self,
        request: Request,
        *,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        statistics: Statistics | None = None,
        timeout: timedelta | None = None,
    ) -> HttpCrawlingResult:
        client = self._get_client(proxy_info.url if proxy_info else None)

        try:
            response = await client.request(
                url=request.url,
                method=self._convert_method(request.method),
                headers=request.headers,
                data=request.payload,
                cookies=session.cookies.jar if session else None,
                timeout=timeout.total_seconds() if timeout else None,
            )
        except Timeout as exc:
            raise asyncio.TimeoutError from exc
        except CurlRequestError as exc:
            if self._is_proxy_error(exc):
                raise ProxyError from exc
            raise

        if statistics:
            statistics.register_status_code(response.status_code)

        if self._persist_cookies_per_session and session and response.curl:
            response_cookies = self._get_cookies(response.curl)
            session.cookies.store_cookies(response_cookies)

        request.loaded_url = response.url

        return HttpCrawlingResult(
            http_response=_CurlImpersonateResponse(response),
        )

    @override
    async def send_request(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        timeout: timedelta | None = None,
    ) -> HttpResponse:
        if isinstance(headers, dict) or headers is None:
            headers = HttpHeaders(headers or {})

        proxy_url = proxy_info.url if proxy_info else None
        client = self._get_client(proxy_url)

        try:
            response = await client.request(
                url=url,
                method=self._convert_method(method),
                headers=dict(headers) if headers else None,
                data=payload,
                cookies=session.cookies.jar if session else None,
                timeout=timeout.total_seconds() if timeout else None,
            )
        except Timeout as exc:
            raise asyncio.TimeoutError from exc
        except CurlRequestError as exc:
            if self._is_proxy_error(exc):
                raise ProxyError from exc
            raise

        if self._persist_cookies_per_session and session and response.curl:
            response_cookies = self._get_cookies(response.curl)
            session.cookies.store_cookies(response_cookies)

        return _CurlImpersonateResponse(response)

    @asynccontextmanager
    @override
    async def stream(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        timeout: timedelta | None = None,
    ) -> AsyncGenerator[HttpResponse]:
        if isinstance(headers, dict) or headers is None:
            headers = HttpHeaders(headers or {})

        proxy_url = proxy_info.url if proxy_info else None
        client = self._get_client(proxy_url)

        try:
            response = await client.request(
                url=url,
                method=self._convert_method(method),
                headers=dict(headers) if headers else None,
                data=payload,
                cookies=session.cookies.jar if session else None,
                stream=True,
                timeout=timeout.total_seconds() if timeout else None,
            )
        except Timeout as exc:
            raise asyncio.TimeoutError from exc
        except CurlRequestError as exc:
            if self._is_proxy_error(exc):
                raise ProxyError from exc
            raise

        if self._persist_cookies_per_session and session and response.curl:
            response_cookies = self._get_cookies(response.curl)
            session.cookies.store_cookies(response_cookies)

        try:
            yield _CurlImpersonateResponse(response)
        finally:
            await response.aclose()

    def _get_client(self, proxy_url: str | None) -> AsyncSession:
        """Retrieve or create an asynchronous HTTP session for the given proxy URL.

        Check if an `AsyncSession` already exists for the specified proxy URL. If no session is found,
        create a new one with the provided proxy settings and additional session options.
        Store the new session for future use.
        """
        # Check if a session for the given proxy URL has already been created.
        if proxy_url not in self._client_by_proxy_url:
            # Prepare a default kwargs for the new session. A provided proxy URL and a chrome for impersonation
            # are set as default options.
            kwargs: dict[str, Any] = {
                'proxy': proxy_url,
                'impersonate': CURL_DEFAULT_CHROME,
            }

            # Update the default kwargs with any additional user-provided kwargs.
            kwargs.update(self._async_session_kwargs)

            # Create and store the new session with the specified kwargs.
            self._client_by_proxy_url[proxy_url] = _AsyncSession(**kwargs)

        return self._client_by_proxy_url[proxy_url]

    def _convert_method(self, method: HttpMethod) -> CurlHttpMethod:
        """Convert from Crawlee HTTP method to curl-cffi HTTP method.

        Args:
            method: Crawlee HTTP method.

        Returns:
            Corresponding curl-cffi HTTP method.

        Raises:
            ValueError: If the provided HTTP method is not supported.
        """
        method_upper = method.upper()  # curl-cffi requires uppercase methods

        match method_upper:
            case 'GET':
                return 'GET'
            case 'POST':
                return 'POST'
            case 'PUT':
                return 'PUT'
            case 'DELETE':
                return 'DELETE'
            case 'OPTIONS':
                return 'OPTIONS'
            case 'HEAD':
                return 'HEAD'
            case 'TRACE':
                return 'TRACE'
            case 'PATCH':
                return 'PATCH'
            case _:
                raise ValueError(f'HTTP method {method} is not supported in {self.__class__.__name__}.')

    @staticmethod
    def _is_proxy_error(error: CurlRequestError) -> bool:
        """Determine whether the given error is related to a proxy issue.

        Check if the error message contains known proxy-related error keywords or if it is an instance
        of `CurlProxyError`.
        """
        if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS):
            return True

        if isinstance(error, CurlProxyError):  # noqa: SIM103
            return True

        return False

    @staticmethod
    def _get_cookies(curl: Curl) -> list[Cookie]:
        cookies = list[Cookie]()

        # Implementation of getinfo always returns list[bytes] for CurlInfo.COOKIELIST.
        cookie_list = cast('list[bytes]', curl.getinfo(CurlInfo.COOKIELIST))

        for curl_cookie in cookie_list:
            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)
            cookie = curl_morsel.to_cookiejar_cookie()
            cookies.append(cookie)

        return cookies

    async def cleanup(self) -> None:
        for client in self._client_by_proxy_url.values():
            await client.close()
        self._client_by_proxy_url.clear()


================================================
FILE: src/crawlee/http_clients/_httpx.py
================================================
from __future__ import annotations

import asyncio
from contextlib import asynccontextmanager
from logging import getLogger
from typing import TYPE_CHECKING, Any, cast

import httpx
from typing_extensions import override

from crawlee._types import HttpHeaders
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
from crawlee._utils.docs import docs_group
from crawlee.errors import ProxyError
from crawlee.fingerprint_suite import HeaderGenerator
from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, AsyncIterator
    from datetime import timedelta
    from ssl import SSLContext

    from crawlee import Request
    from crawlee._types import HttpMethod, HttpPayload
    from crawlee.proxy_configuration import ProxyInfo
    from crawlee.sessions import Session
    from crawlee.statistics import Statistics

logger = getLogger(__name__)


class _HttpxResponse:
    """Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol."""

    def __init__(self, response: httpx.Response) -> None:
        """Wrap the given `httpx.Response`."""
        self._response = response

    @property
    def http_version(self) -> str:
        """HTTP version reported by the wrapped response."""
        return self._response.http_version

    @property
    def status_code(self) -> int:
        """Status code of the wrapped response."""
        return self._response.status_code

    @property
    def headers(self) -> HttpHeaders:
        """Headers of the wrapped response converted to `HttpHeaders`."""
        raw_headers = dict(self._response.headers)
        return HttpHeaders(raw_headers)

    async def read(self) -> bytes:
        """Return the whole response body.

        Raises:
            RuntimeError: If the wrapped response is not closed yet, i.e. it must be read with `read_stream`.
        """
        if self._response.is_closed:
            return await self._response.aread()

        raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')

    async def read_stream(self) -> AsyncIterator[bytes]:
        """Yield the response body chunk by chunk.

        Raises:
            RuntimeError: If the stream of the wrapped response was already consumed.
        """
        if self._response.is_stream_consumed:
            raise RuntimeError('Stream is already consumed.')

        async for piece in self._response.aiter_bytes():
            yield piece


class _HttpxTransport(httpx.AsyncHTTPTransport):
    """HTTP transport adapter that stores response cookies in a `Session`.

    Cookies received in a response are written to the Crawlee session passed through the request
    extensions, and the `Set-Cookie` header is then removed from the response, so the cookies end up
    in the session object rather than in the `HTTPX` client itself.
    """

    @override
    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        """Send the request through the parent transport and move response cookies into the session."""
        response = await super().handle_async_request(request)
        # Attach the request before `response.cookies` is accessed below.
        response.request = request

        crawlee_session = cast('Session', request.extensions.get('crawlee_session'))
        if crawlee_session:
            received_cookies = list(response.cookies.jar)
            crawlee_session.cookies.store_cookies(received_cookies)

        has_set_cookie = 'Set-Cookie' in response.headers
        if has_set_cookie:
            del response.headers['Set-Cookie']

        return response


@docs_group('HTTP clients')
class HttpxHttpClient(HttpClient):
    """HTTP client based on the `HTTPX` library.

    This client uses the `HTTPX` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)
    and to manage sessions, proxies, and error handling.

    See the `HttpClient` class for more common information about HTTP clients.

    ### Usage

    ```python
    from crawlee.crawlers import HttpCrawler  # or any other HTTP client-based crawler
    from crawlee.http_clients import HttpxHttpClient

    http_client = HttpxHttpClient()
    crawler = HttpCrawler(http_client=http_client)
    ```
    """

    _DEFAULT_HEADER_GENERATOR = HeaderGenerator()

    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        http1: bool = True,
        http2: bool = True,
        verify: str | bool | SSLContext = True,
        header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
        **async_client_kwargs: Any,
    ) -> None:
        """Initialize a new instance.

        Args:
            persist_cookies_per_session: Whether to persist cookies per HTTP session.
            http1: Whether to enable HTTP/1.1 support.
            http2: Whether to enable HTTP/2 support.
            verify: SSL certificates used to verify the identity of requested hosts.
            header_generator: Header generator instance to use for generating common headers.
            async_client_kwargs: Additional keyword arguments for `httpx.AsyncClient`.
        """
        super().__init__(
            persist_cookies_per_session=persist_cookies_per_session,
        )
        self._http1 = http1
        self._http2 = http2

        self._async_client_kwargs = async_client_kwargs
        self._header_generator = header_generator

        self._ssl_context = httpx.create_ssl_context(verify=verify)

        # The transport is created lazily in `_get_client` and shared by all clients.
        self._transport: _HttpxTransport | None = None

        # One `httpx.AsyncClient` per proxy URL (`None` is the key for "no proxy").
        self._client_by_proxy_url = dict[str | None, httpx.AsyncClient]()

    @override
    async def crawl(
        self,
        request: Request,
        *,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        statistics: Statistics | None = None,
        timeout: timedelta | None = None,
    ) -> HttpCrawlingResult:
        client = self._get_client(proxy_info.url if proxy_info else None)
        headers = self._combine_headers(request.headers)

        http_request = client.build_request(
            url=request.url,
            method=request.method,
            headers=headers,
            content=request.payload,
            cookies=session.cookies.jar if session else None,
            extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
        )

        try:
            response = await client.send(http_request)
        except httpx.TimeoutException as exc:
            raise asyncio.TimeoutError from exc
        except httpx.TransportError as exc:
            if self._is_proxy_error(exc):
                raise ProxyError from exc
            raise

        if statistics:
            statistics.register_status_code(response.status_code)

        request.loaded_url = str(response.url)

        return HttpCrawlingResult(
            http_response=_HttpxResponse(response),
        )

    @override
    async def send_request(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        timeout: timedelta | None = None,
    ) -> HttpResponse:
        client = self._get_client(proxy_info.url if proxy_info else None)

        http_request = self._build_request(
            client=client,
            url=url,
            method=method,
            headers=headers,
            payload=payload,
            session=session,
            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
        )

        try:
            response = await client.send(http_request)
        except httpx.TimeoutException as exc:
            raise asyncio.TimeoutError from exc
        except httpx.TransportError as exc:
            if self._is_proxy_error(exc):
                raise ProxyError from exc
            raise

        return _HttpxResponse(response)

    @asynccontextmanager
    @override
    async def stream(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        timeout: timedelta | None = None,
    ) -> AsyncGenerator[HttpResponse]:
        client = self._get_client(proxy_info.url if proxy_info else None)

        http_request = self._build_request(
            client=client,
            url=url,
            method=method,
            headers=headers,
            payload=payload,
            session=session,
            # Only the connection phase is limited; reading the streamed body has no timeout.
            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
        )

        try:
            response = await client.send(http_request, stream=True)
        except httpx.TimeoutException as exc:
            raise asyncio.TimeoutError from exc
        except httpx.TransportError as exc:
            # Same translation as in `crawl` and `send_request`, so that callers of `stream` also
            # receive `ProxyError` for proxy-related failures.
            if self._is_proxy_error(exc):
                raise ProxyError from exc
            raise

        try:
            yield _HttpxResponse(response)
        finally:
            await response.aclose()

    def _build_request(
        self,
        client: httpx.AsyncClient,
        url: str,
        method: HttpMethod,
        headers: HttpHeaders | dict[str, str] | None,
        payload: HttpPayload | None,
        session: Session | None = None,
        timeout: httpx.Timeout | None = None,
    ) -> httpx.Request:
        """Build an `httpx.Request` using the provided parameters.

        Args:
            client: The client used to build the request.
            url: The URL to send the request to.
            method: The HTTP method to use.
            headers: Explicit headers; they are merged with the generated default headers.
            payload: The data to be sent as the request body.
            session: The session whose cookies are sent and, when cookie persistence is enabled, updated.
            timeout: Timeout configuration; the client default is used when `None`.

        Returns:
            The prepared request.
        """
        if isinstance(headers, dict) or headers is None:
            headers = HttpHeaders(headers or {})

        headers = self._combine_headers(headers)

        return client.build_request(
            url=url,
            method=method,
            headers=dict(headers) if headers else None,
            content=payload,
            # Send the cookies of the session, the same way `crawl` does.
            cookies=session.cookies.jar if session else None,
            extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
            timeout=timeout if timeout is not None else httpx.USE_CLIENT_DEFAULT,
        )

    def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
        """Retrieve or create an HTTP client for the given proxy URL.

        If a client for the specified proxy URL does not exist, create and store a new one.
        """
        if not self._transport:
            # Configure connection pool limits and keep-alive connections for transport
            limits = self._async_client_kwargs.get(
                'limits', httpx.Limits(max_connections=1000, max_keepalive_connections=200)
            )

            self._transport = _HttpxTransport(
                http1=self._http1,
                http2=self._http2,
                verify=self._ssl_context,
                limits=limits,
            )

        if proxy_url not in self._client_by_proxy_url:
            # Prepare a default kwargs for the new client.
            kwargs: dict[str, Any] = {
                'proxy': proxy_url,
                'http1': self._http1,
                'http2': self._http2,
                'follow_redirects': True,
            }

            # Update the default kwargs with any additional user-provided kwargs.
            kwargs.update(self._async_client_kwargs)

            # The transport and the SSL context are applied last, so user-provided kwargs cannot replace them.
            kwargs.update(
                {
                    'transport': self._transport,
                    'verify': self._ssl_context,
                }
            )

            client = httpx.AsyncClient(**kwargs)
            self._client_by_proxy_url[proxy_url] = client

        return self._client_by_proxy_url[proxy_url]

    def _combine_headers(self, explicit_headers: HttpHeaders | None) -> HttpHeaders | None:
        """Merge default headers with explicit headers for an HTTP request.

        Generate a final set of request headers by combining default headers, a random User-Agent header,
        and any explicitly provided headers. Returns `None` when the combined headers are empty.
        """
        common_headers = self._header_generator.get_common_headers() if self._header_generator else HttpHeaders()
        user_agent_header = (
            self._header_generator.get_random_user_agent_header() if self._header_generator else HttpHeaders()
        )
        explicit_headers = explicit_headers or HttpHeaders()
        headers = common_headers | user_agent_header | explicit_headers
        return headers or None

    @staticmethod
    def _is_proxy_error(error: httpx.TransportError) -> bool:
        """Determine whether the given error is related to a proxy issue.

        Check if the error is an instance of `httpx.ProxyError` or if its message contains known proxy-related
        error keywords.
        """
        if isinstance(error, httpx.ProxyError):
            return True

        if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS):  # noqa: SIM103
            return True

        return False

    @override
    async def cleanup(self) -> None:
        """Close every cached client and the shared transport, and forget them."""
        for client in self._client_by_proxy_url.values():
            await client.aclose()
        self._client_by_proxy_url.clear()
        if self._transport:
            await self._transport.aclose()
            self._transport = None


================================================
FILE: src/crawlee/http_clients/_impit.py
================================================
from __future__ import annotations

import asyncio
from contextlib import asynccontextmanager
from logging import getLogger
from typing import TYPE_CHECKING, Any, TypedDict

from cachetools import LRUCache
from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
from impit import ProxyError as ImpitProxyError
from typing_extensions import override

from crawlee._types import HttpHeaders
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
from crawlee._utils.docs import docs_group
from crawlee.errors import ProxyError
from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, AsyncIterator
    from datetime import timedelta
    from http.cookiejar import CookieJar

    from crawlee import Request
    from crawlee._types import HttpMethod, HttpPayload
    from crawlee.proxy_configuration import ProxyInfo
    from crawlee.sessions import Session
    from crawlee.statistics import Statistics

logger = getLogger(__name__)


class _ClientCacheEntry(TypedDict):
    """Type definition for client cache entries.

    Values of this shape are stored in `ImpitHttpClient._client_by_proxy_url`, keyed by proxy URL.
    """

    # The `impit` client created for the proxy URL this entry is stored under.
    client: AsyncClient
    # The cookie jar the client was constructed with; `None` when no jar was supplied.
    cookie_jar: CookieJar | None


class _ImpitResponse:
    """Adapter class for `impit.Response` to conform to the `HttpResponse` protocol."""

    def __init__(self, response: Response) -> None:
        self._response = response

    @property
    def http_version(self) -> str:
        return str(self._response.http_version)

    @property
    def status_code(self) -> int:
        return int(self._response.status_code)

    @property
    def headers(self) -> HttpHeaders:
        return HttpHeaders(dict(self._response.headers))

    async def read(self) -> bytes:
        if not self._response.is_closed:
            raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
        return self._response.content

    async def read_stream(self) -> AsyncIterator[bytes]:
        if self._response.is_stream_consumed:
            raise RuntimeError('Stream is already consumed.')
        else:
            async for chunk in self._response.aiter_bytes():
                yield chunk


@docs_group('HTTP clients')
class ImpitHttpClient(HttpClient):
    """HTTP client based on the `impit` library.

    This client uses the `impit` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)
    and to manage sessions, proxies, and error handling.

    See the `HttpClient` class for more common information about HTTP clients.

    ### Usage

    ```python
    from crawlee.crawlers import HttpCrawler  # or any other HTTP client-based crawler
    from crawlee.http_clients import ImpitHttpClient

    http_client = ImpitHttpClient()
    crawler = HttpCrawler(http_client=http_client)
    ```
    """

    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        http3: bool = False,
        verify: bool = True,
        browser: Browser | None = 'firefox',
        **async_client_kwargs: Any,
    ) -> None:
        """Initialize a new instance.

        Args:
            persist_cookies_per_session: Whether to persist cookies per HTTP session.
            http3: Whether to enable HTTP/3 support.
            verify: Whether to verify the TLS certificates of requested hosts; passed to
                `impit.AsyncClient` as `verify`.
            browser: Browser to impersonate.
            async_client_kwargs: Additional keyword arguments for `impit.AsyncClient`. They are applied
                on top of the defaults built by this class.
        """
        super().__init__(persist_cookies_per_session=persist_cookies_per_session)

        # Options forwarded to every `impit.AsyncClient` this instance creates.
        self._browser = browser
        self._verify = verify
        self._http3 = http3
        self._async_client_kwargs = async_client_kwargs

        # Clients are cached per proxy URL (`None` stands for "no proxy"); the cache holds at most 10 entries.
        self._client_by_proxy_url = LRUCache[str | None, _ClientCacheEntry](maxsize=10)

    @override
    async def crawl(
        self,
        request: Request,
        *,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        statistics: Statistics | None = None,
        timeout: timedelta | None = None,
    ) -> HttpCrawlingResult:
        """Perform the HTTP request described by `request` and wrap the response.

        Raises:
            asyncio.TimeoutError: When `impit` reports a timeout.
            ProxyError: When the transport/HTTP error is recognized by `_is_proxy_error`.
        """
        proxy_url = proxy_info.url if proxy_info else None
        cookie_jar = session.cookies.jar if session else None
        client = self._get_client(proxy_url, cookie_jar)

        request_headers = dict(request.headers) if request.headers else None
        timeout_seconds = timeout.total_seconds() if timeout else None

        try:
            response = await client.request(
                url=request.url,
                method=request.method,
                content=request.payload,
                headers=request_headers,
                timeout=timeout_seconds,
            )
        except TimeoutException as exc:
            raise asyncio.TimeoutError from exc
        except (TransportError, HTTPError) as exc:
            # Anything that is not proxy-related propagates unchanged.
            if not self._is_proxy_error(exc):
                raise
            raise ProxyError from exc

        if statistics:
            statistics.register_status_code(response.status_code)

        # Record the URL reported by the response on the request.
        request.loaded_url = str(response.url)

        crawling_result = HttpCrawlingResult(http_response=_ImpitResponse(response))
        return crawling_result

    @override
    async def send_request(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        timeout: timedelta | None = None,
    ) -> HttpResponse:
        """Send a single HTTP request and return the adapted response.

        Raises:
            asyncio.TimeoutError: When `impit` reports a timeout.
            ProxyError: When the transport/HTTP error is recognized by `_is_proxy_error`.
        """
        # Plain dicts and `None` are normalized to `HttpHeaders` first.
        if headers is None or isinstance(headers, dict):
            headers = HttpHeaders(headers or {})

        proxy_url = proxy_info.url if proxy_info else None
        cookie_jar = session.cookies.jar if session else None
        client = self._get_client(proxy_url, cookie_jar)

        request_headers = dict(headers) if headers else None
        timeout_seconds = timeout.total_seconds() if timeout else None

        try:
            response = await client.request(
                method=method,
                url=url,
                content=payload,
                headers=request_headers,
                timeout=timeout_seconds,
            )
        except TimeoutException as exc:
            raise asyncio.TimeoutError from exc
        except (TransportError, HTTPError) as exc:
            # Anything that is not proxy-related propagates unchanged.
            if not self._is_proxy_error(exc):
                raise
            raise ProxyError from exc

        return _ImpitResponse(response)

    @asynccontextmanager
    @override
    async def stream(
        self,
        url: str,
        *,
        method: HttpMethod = 'GET',
        headers: HttpHeaders | dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        timeout: timedelta | None = None,
    ) -> AsyncGenerator[HttpResponse]:
        """Open a streamed request and yield the adapted response; the response is closed on exit.

        Raises:
            asyncio.TimeoutError: When `impit` reports a timeout while sending the request.
        """
        proxy_url = proxy_info.url if proxy_info else None
        cookie_jar = session.cookies.jar if session else None
        client = self._get_client(proxy_url, cookie_jar)

        request_headers = dict(headers) if headers else None
        timeout_seconds = timeout.total_seconds() if timeout else None

        try:
            response = await client.request(
                method=method,
                url=url,
                content=payload,
                headers=request_headers,
                timeout=timeout_seconds,
                stream=True,
            )
        except TimeoutException as exc:
            raise asyncio.TimeoutError from exc

        streamed_response = _ImpitResponse(response)
        try:
            yield streamed_response
        finally:
            # TODO: https://github.com/apify/impit/issues/242
            # Closing the Response right away while its body is still being read triggers an error in the Rust
            # generator inside `impit`. A short sleep followed by a synchronous close avoids it.
            # Switch to `response.aclose` once this is resolved in impit.
            await asyncio.sleep(0.01)
            response.close()

    def _get_client(self, proxy_url: str | None, cookie_jar: CookieJar | None) -> AsyncClient:
        """Return the client to use for `proxy_url`, creating one when needed.

        The cached client is reused only when it was built with the very same `cookie_jar` object.
        Otherwise a new client is constructed and replaces the cache entry for that proxy URL.
        """
        entry = self._client_by_proxy_url.get(proxy_url)
        if entry and entry['cookie_jar'] is cookie_jar:
            # Same proxy and same cookie jar: the existing client can be reused.
            return entry['client']

        # Defaults come first so that user-provided kwargs override them.
        client_kwargs: dict[str, Any] = {
            'proxy': proxy_url,
            'http3': self._http3,
            'verify': self._verify,
            'follow_redirects': True,
            'browser': self._browser,
            **self._async_client_kwargs,
        }

        new_client = AsyncClient(**client_kwargs, cookie_jar=cookie_jar)
        self._client_by_proxy_url[proxy_url] = _ClientCacheEntry(client=new_client, cookie_jar=cookie_jar)
        return new_client

    @staticmethod
    def _is_proxy_error(error: HTTPError) -> bool:
        """Tell whether an `impit` error should be treated as a proxy failure.

        An error qualifies when it is an `impit.ProxyError`, or when its string form contains any of
        the substrings listed in `ROTATE_PROXY_ERRORS`.
        """
        if isinstance(error, ImpitProxyError):
            return True

        # Fall back to matching the known proxy-related fragments against the error message.
        message = str(error)
        return any(needle in message for needle in ROTATE_PROXY_ERRORS)

    @override
    async def cleanup(self) -> None:
        """Clean up resources used by the HTTP client.

        This empties the per-proxy client cache; the cached clients are not closed explicitly here.
        """
        self._client_by_proxy_url.clear()


================================================
FILE: src/crawlee/otel/__init__.py
================================================
from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor

__all__ = [
    'CrawlerInstrumentor',
]


================================================
FILE: src/crawlee/otel/crawler_instrumentor.py
================================================
from __future__ import annotations

import inspect
from typing import TYPE_CHECKING, Any

from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
from opentelemetry.instrumentation.utils import unwrap
from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME
from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD
from opentelemetry.semconv.attributes.url_attributes import URL_FULL
from opentelemetry.trace import get_tracer
from wrapt import wrap_function_wrapper

from crawlee._utils.docs import docs_group
from crawlee.crawlers import BasicCrawler, ContextPipeline
from crawlee.crawlers._basic._context_pipeline import _Middleware

if TYPE_CHECKING:
    from collections.abc import Callable

    from crawlee.crawlers import BasicCrawlingContext


@docs_group('Other')
class CrawlerInstrumentor(BaseInstrumentor):
    """Helper class for instrumenting crawlers with OpenTelemetry."""

    def __init__(
        self, *, instrument_classes: list[type] | None = None, request_handling_instrumentation: bool = True
    ) -> None:
        """Initialize the instrumentor.

        Only the list of `(class, method name, wrapper)` targets is prepared here; the actual wrapping is
        performed by `_instrument`.

        Args:
            instrument_classes: List of classes to be instrumented - all their public methods and coroutines will be
                wrapped by generic instrumentation wrapper that will create spans for them.
            request_handling_instrumentation: When `True`, the most relevant methods in the request handling pipeline
                will be instrumented. When `False`, no request handling instrumentation will be done.
        """
        self._tracer = get_tracer(__name__)

        def _function_span(wrapped: Any) -> Any:
            """Create a span named after `wrapped` and tagged with its qualified name."""
            return self._tracer.start_as_current_span(
                name=wrapped.__name__, attributes={CODE_FUNCTION_NAME: wrapped.__qualname__}
            )

        async def _simple_async_wrapper(wrapped: Any, _: Any, args: Any, kwargs: Any) -> Any:
            with _function_span(wrapped):
                return await wrapped(*args, **kwargs)

        def _simple_wrapper(wrapped: Any, _: Any, args: Any, kwargs: Any) -> Any:
            with _function_span(wrapped):
                return wrapped(*args, **kwargs)

        def _init_wrapper(wrapped: Any, _: Any, args: Any, kwargs: Any) -> None:
            # The result of the wrapped `__init__` is intentionally not returned.
            with _function_span(wrapped):
                wrapped(*args, **kwargs)

        self._instrumented: list[tuple[Any, str, Callable]] = []
        self._simple_wrapper = _simple_wrapper
        self._simple_async_wrapper = _simple_async_wrapper
        self._init_wrapper = _init_wrapper

        for target_class in instrument_classes or []:
            self._instrument_all_public_methods(on_class=target_class)

        if not request_handling_instrumentation:
            return

        async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
            span_name = f'{instance.generator.__name__}, {wrapped.__name__}'  # type:ignore[attr-defined]  # valid in our context
            span_attributes = {
                URL_FULL: instance.input_context.request.url,
                CODE_FUNCTION_NAME: instance.generator.__qualname__,  # type:ignore[attr-defined]  # valid in our context
            }
            with self._tracer.start_as_current_span(name=span_name, attributes=span_attributes):
                return await wrapped(*args, **kwargs)

        async def context_pipeline_wrapper(
            wrapped: Any, _: ContextPipeline[BasicCrawlingContext], args: Any, kwargs: Any
        ) -> Any:
            # The pipeline call is expected to pass the context and the final consumer positionally.
            context, final_context_consumer = args[0], args[1]

            async def wrapped_final_consumer(*consumer_args: Any, **consumer_kwargs: Any) -> Any:
                with self._tracer.start_as_current_span(
                    name='request_handler',
                    attributes={URL_FULL: context.request.url, HTTP_REQUEST_METHOD: context.request.method},
                ):
                    return await final_context_consumer(*consumer_args, **consumer_kwargs)

            # The final consumer is swapped for the span-creating version before delegating.
            with self._tracer.start_as_current_span(
                name='ContextPipeline',
                attributes={URL_FULL: context.request.url, HTTP_REQUEST_METHOD: context.request.method},
            ):
                return await wrapped(context, wrapped_final_consumer, **kwargs)

        async def _commit_request_handler_result_wrapper(
            wrapped: Callable[[Any], Any], _: BasicCrawler, args: Any, kwargs: Any
        ) -> Any:
            context = args[0]
            with self._tracer.start_as_current_span(
                name='Commit results',
                attributes={URL_FULL: context.request.url, HTTP_REQUEST_METHOD: context.request.method},
            ):
                return await wrapped(*args, **kwargs)

        # Handpicked interesting methods to instrument
        request_handling_targets: list[tuple[Any, str, Callable]] = [
            (_Middleware, 'action', middleware_wrapper),
            (_Middleware, 'cleanup', middleware_wrapper),
            (ContextPipeline, '__call__', context_pipeline_wrapper),
            (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
            (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
        ]
        self._instrumented.extend(request_handling_targets)

    def instrumentation_dependencies(self) -> list[str]:
        """Return the list of Python package requirements this instrumentor applies to."""
        instrumented_packages = ['crawlee']
        return instrumented_packages

    def _instrument_all_public_methods(self, on_class: type) -> None:
        """Register generic span wrappers for all public methods of `on_class` and for its `__init__`.

        Public coroutine functions get the async wrapper, the remaining public functions get the sync
        wrapper. Names starting with an underscore are skipped, except `__init__`, which is always added.
        """

        def _public_names(predicate: Any) -> set[str]:
            members = inspect.getmembers(on_class, predicate=predicate)
            return {name for name, _ in members if not name.startswith('_')}

        coroutine_names = _public_names(inspect.iscoroutinefunction)
        # Coroutine functions also satisfy `inspect.isfunction`, so they are subtracted here.
        function_names = _public_names(inspect.isfunction) - coroutine_names

        self._instrumented.extend((on_class, name, self._simple_async_wrapper) for name in coroutine_names)
        self._instrumented.extend((on_class, name, self._simple_wrapper) for name in function_names)
        self._instrumented.append((on_class, '__init__', self._init_wrapper))

    def _instrument(self, **_: Any) -> None:
        """Apply every registered wrapper to its `(class, method name)` target."""
        for target in self._instrumented:
            owner, attribute, wrapper = target
            wrap_function_wrapper(owner, attribute, wrapper)

    def _uninstrument(self, **_: Any) -> None:
        """Unwrap every registered `(class, method name)` target."""
        for target in self._instrumented:
            owner, attribute = target[0], target[1]
            unwrap(owner, attribute)


================================================
FILE: src/crawlee/project_template/cookiecutter.json
================================================
{
    "project_name": "crawlee-python-project",
    "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
    "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"],
    "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}",
    "http_client": ["impit", "httpx", "curl-impersonate"],
    "package_manager": ["poetry", "pip", "uv"],
    "enable_apify_integration": false,
    "install_project": true,
    "start_url": "https://crawlee.dev",
    "_jinja2_env_vars": {
        "line_statement_prefix": "# %"
    },
    "_extensions": ["jinja2.ext.do"]
}


================================================
FILE: src/crawlee/project_template/hooks/post_gen_project.py
================================================
# Cookiecutter post-generation hook for the Crawlee project template.
# Comment lines made of a hash followed by a percent sign are Jinja line statements (the prefix is
# configured in cookiecutter.json), so only the branch matching the chosen options is rendered and run.
import platform
import subprocess
from pathlib import Path

# % if cookiecutter.package_manager in ['poetry', 'uv']
# Poetry / uv branch: the generated requirements.txt is deleted.
Path('requirements.txt').unlink()

# % if cookiecutter.install_project == True
# % if cookiecutter.package_manager == 'poetry'
subprocess.check_call(['poetry', 'install'])
# % elif cookiecutter.package_manager == 'uv'
subprocess.check_call(['uv', 'sync'])
# % endif

# NOTE(review): only the exact 'playwright' crawler type triggers the browser installation below; the
# other playwright-* choices listed in cookiecutter.json skip it - confirm that this is intended.
# % if cookiecutter.crawler_type == 'playwright'
manager = "{{ cookiecutter.package_manager }}"
# Run `playwright install` through the selected package manager.
subprocess.check_call([manager, 'run', 'playwright', 'install'])
# % endif
# % endif


# % elif cookiecutter.package_manager == 'pip'
import venv  # noqa: E402

# Create a virtual environment
venv_root = Path('.venv')
venv.main([str(venv_root)])

# % if cookiecutter.install_project == True
# Executables of the virtual environment live in `Scripts` on Windows and in `bin` elsewhere.
if platform.system() == 'Windows':  # noqa: SIM108
    path = venv_root / 'Scripts'
else:
    path = venv_root / 'bin'

# Install requirements and generate requirements.txt as an impromptu lockfile
subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt'])
# requirements.txt is overwritten with the output of `pip freeze` from the virtual environment.
Path('requirements.txt').write_text(
    subprocess.check_output([str(path / 'pip'), 'freeze']).decode()
)

# NOTE(review): as in the poetry / uv branch, only the exact 'playwright' crawler type installs browsers.
# % if cookiecutter.crawler_type == 'playwright'
subprocess.check_call([str(path / 'playwright'), 'install'])
# % endif
# % endif
# % endif


================================================
FILE: src/crawlee/project_template/hooks/pre_gen_project.py
================================================
# % if cookiecutter.package_manager in ['poetry', 'uv']
# Cookiecutter pre-generation hook: verifies that the selected package manager (Poetry or uv) is
# installed, can be executed and reports a supported version. On any failure an error is written to
# stderr and the hook exits with status 1.
import subprocess
import shutil
import re
import sys

manager = "{{cookiecutter.package_manager}}"
manager_text = manager.title()
# % if cookiecutter.package_manager == 'poetry'
version_regex = r'Poetry \(version 2\..*\)'
r_version = '2.x'
# % elif cookiecutter.package_manager == 'uv'
version_regex = r'uv (0\..*)'
r_version = '0.x'
# % endif

# Check if package manager is available in PATH
if not shutil.which(manager):
    sys.stderr.write(f'\nError: You selected {manager_text} as your package manager, but it is not installed. Please install it and try again.\n')
    sys.exit(1)

# Check if the package manager is executable
try:
    version = subprocess.check_output([manager, '--version']).decode().strip()
except (OSError, subprocess.CalledProcessError):
    # `check_output` raises `CalledProcessError` (not an `OSError`) when the command exits with a
    # non-zero status; report that the same way instead of letting a traceback escape.
    sys.stderr.write(f'\nError: Your selected package manager {manager_text} was found but failed to execute.\n')
    sys.exit(1)

# Check if the version matches the required regex
if not re.match(version_regex, version):
    sys.stderr.write(f'\nError: Your selected package manager {manager_text} requires version {r_version}, but {version} is installed.\n')
    sys.exit(1)
# % endif


================================================
FILE: src/crawlee/project_template/templates/main.py
================================================
# % if cookiecutter.enable_apify_integration
from apify import Actor
# % endif
# % block import required
# % endblock
# % if cookiecutter.http_client == 'curl-impersonate'
from crawlee.http_clients import CurlImpersonateHttpClient
# % elif cookiecutter.http_client == 'httpx'
from crawlee.http_clients import HttpxHttpClient
# % elif cookiecutter.http_client == 'impit'
from crawlee.http_clients import ImpitHttpClient
# % endif

from .routes import router

# % filter truncate(0, end='')
# % block http_client_instantiation
# % if cookiecutter.http_client == 'curl-impersonate'
http_client=CurlImpersonateHttpClient(),
# % elif cookiecutter.http_client == 'httpx'
http_client=HttpxHttpClient(),
# % elif cookiecutter.http_client == 'impit'
http_client=ImpitHttpClient(),
# % endif
# % endblock
# % endfilter
# % if self.pre_main is defined

{{self.pre_main()}}

# % endif
async def main() -> None:
    """The crawler entry point."""
    # % filter truncate(0, end='')
    # % block instantiation required
    # % endblock
    # % endfilter

    # % if cookiecutter.enable_apify_integration
    async with Actor:
    # % set indent_width = 8
    # % else
    # % set indent_width = 4
    # % endif
# % filter indent(width=indent_width, first=True)
{{self.instantiation()}}

await crawler.run(
    [
        '{{ cookiecutter.start_url }}',
    ]
)
# % endfilter


================================================
FILE: src/crawlee/project_template/templates/main_beautifulsoup.py
================================================
# % extends 'main.py'

# % block import
from crawlee.crawlers import BeautifulSoupCrawler
# % endblock

# % block instantiation
crawler = BeautifulSoupCrawler(
    request_handler=router,
    max_requests_per_crawl=10,
    {{ self.http_client_instantiation() }})
# % endblock


================================================
FILE: src/crawlee/project_template/templates/main_parsel.py
================================================
# % extends 'main.py'

# % block import
from crawlee.crawlers import ParselCrawler
# % endblock

# % block instantiation
crawler = ParselCrawler(
    request_handler=router,
    max_requests_per_crawl=10,
    {{ self.http_client_instantiation() }})
# % endblock


================================================
FILE: src/crawlee/project_template/templates/main_playwright.py
================================================
# % extends 'main.py'

# % block import
from crawlee.crawlers import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
    request_handler=router,
    headless=True,
    max_requests_per_crawl=10,
    {{ self.http_client_instantiation() }})
# % endblock


================================================
FILE: src/crawlee/project_template/templates/main_playwright_camoufox.py
================================================
# % extends 'main.py'

# % block import
from camoufox import AsyncNewBrowser
from typing_extensions import override

from crawlee._utils.context import ensure_context
from crawlee.browsers import PlaywrightBrowserPlugin, PlaywrightBrowserController, BrowserPool
from crawlee.crawlers import PlaywrightCrawler
# % endblock

# % block pre_main
class CamoufoxPlugin(PlaywrightBrowserPlugin):
    """Example browser plugin that launches the Camoufox browser.

    Only `new_browser` is overridden; everything else is inherited from `PlaywrightBrowserPlugin`.
    """

    @ensure_context
    @override
    async def new_browser(self) -> PlaywrightBrowserController:
        """Launch a headless Camoufox browser and wrap it in a `PlaywrightBrowserController`.

        Raises:
            RuntimeError: If `self._playwright` is not set, i.e. the plugin has not been initialized.
        """
        if not self._playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        return PlaywrightBrowserController(
            browser=await AsyncNewBrowser(self._playwright, headless=True),
            max_open_pages_per_browser=1,  # Increase if Camoufox can handle more pages in your use case.
            header_generator=None,  # Turns off Crawlee's header generation; Camoufox has its own.
        )
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
    max_requests_per_crawl=10,
    request_handler=router,
    browser_pool=BrowserPool(plugins=[CamoufoxPlugin()])
)
# % endblock


================================================
FILE: src/crawlee/project_template/templates/main_playwright_chrome.py
================================================
# % extends 'main.py'

# % block import
from crawlee.crawlers import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
    request_handler=router,
    headless=True,
    max_requests_per_crawl=10,
    browser_type="chrome",
    {{ self.http_client_instantiation() }}
)
# % endblock


================================================
FILE: src/crawlee/project_template/templates/main_playwright_firefox.py
================================================
# % extends 'main.py'

# % block import
from crawlee.crawlers import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
    request_handler=router,
    headless=True,
    max_requests_per_crawl=10,
    browser_type="firefox",
    {{ self.http_client_instantiation() }}
)
# % endblock


================================================
FILE: src/crawlee/project_template/templates/main_playwright_webkit.py
================================================
# % extends 'main.py'

# % block import
from crawlee.crawlers import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
    request_handler=router,
    headless=True,
    max_requests_per_crawl=10,
    browser_type="webkit",
    {{ self.http_client_instantiation() }}
)
# % endblock


================================================
FILE: src/crawlee/project_template/templates/routes_beautifulsoup.py
================================================
from crawlee.crawlers import BeautifulSoupCrawlingContext
from crawlee.router import Router

router = Router[BeautifulSoupCrawlingContext]()


@router.default_handler
async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
    """Store the page URL and title, then enqueue the links found on the page."""
    context.log.info(f'Processing {context.request.url} ...')

    title_tag = context.soup.find('title')
    record = {
        'url': context.request.loaded_url,
        'title': title_tag.text if title_tag else None,
    }
    await context.push_data(record)

    await context.enqueue_links()


================================================
FILE: src/crawlee/project_template/templates/routes_parsel.py
================================================
from crawlee.crawlers import ParselCrawlingContext
from crawlee.router import Router

router = Router[ParselCrawlingContext]()


@router.default_handler
async def default_handler(context: ParselCrawlingContext) -> None:
    """Store the page URL and title, then enqueue the links found on the page."""
    context.log.info(f'Processing {context.request.url} ...')

    page_title = context.selector.xpath('//title/text()').get()
    record = {
        'url': context.request.loaded_url,
        'title': page_title,
    }
    await context.push_data(record)

    await context.enqueue_links()


================================================
FILE: src/crawlee/project_template/templates/routes_playwright.py
================================================
from crawlee.crawlers import PlaywrightCrawlingContext
from crawlee.router import Router

router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
    """Store the page URL and title, then enqueue the links found on the page."""
    context.log.info(f'Processing {context.request.url} ...')

    title_element = await context.page.query_selector('title')
    record = {'url': context.request.loaded_url, 'title': None}
    if title_element:
        # The title text is only read when a title element was found.
        record['title'] = await title_element.inner_text()
    await context.push_data(record)

    await context.enqueue_links()


================================================
FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore
================================================
.venv


================================================
FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile
================================================
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# % if cookiecutter.crawler_type == 'playwright'
FROM apify/actor-python-playwright:3.13
# % elif cookiecutter.crawler_type == 'playwright-camoufox'
FROM apify/actor-python-playwright-camoufox:3.13
# % elif cookiecutter.crawler_type == 'playwright-chrome'
FROM apify/actor-python-playwright-chrome:3.13
# % elif cookiecutter.crawler_type == 'playwright-firefox'
FROM apify/actor-python-playwright-firefox:3.13
# % elif cookiecutter.crawler_type == 'playwright-webkit'
FROM apify/actor-python-playwright-webkit:3.13
# % else
FROM apify/actor-python:3.13
# % endif

RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/*

# % if cookiecutter.package_manager == 'poetry'
RUN pip install -U pip setuptools \
    && pip install 'poetry<3' \
    && poetry self add 'poetry-plugin-export'

# Second, copy just poetry.lock and pyproject.toml into the Actor image,
# since those should be the only files that affect the dependency install in the next step,
# in order to speed up the build
COPY pyproject.toml poetry.lock ./

# Install the dependencies
RUN echo "Python version:" \
 && python --version \
 && echo "Installing dependencies:" \
 # Export packages from poetry.lock
 && poetry export -f requirements.txt --without-hashes | \
 # Replace playwright version so that it matches whatever is pre-installed in the image (the `hash` checks if playwright is installed)
    sed "s/^playwright==\(.*\)/playwright==$(hash playwright 2>/dev/null && (playwright --version | cut -d ' ' -f 2) || echo '\1')/" | \
 # Install everything using pip (ignore dependency checks - the lockfile is correct, period)
    pip install -r /dev/stdin --no-dependencies \
 && echo "All installed Python packages:" \
 && pip freeze
# % elif cookiecutter.package_manager == 'uv'
RUN pip install -U pip setuptools \
    && pip install 'uv<1'

ENV UV_PROJECT_ENVIRONMENT="/usr/local"

COPY pyproject.toml uv.lock ./

RUN echo "Python version:" \
    && python --version \
    && echo "Installing dependencies:" \
    # Check if playwright is already installed
    && PLAYWRIGHT_INSTALLED=$(pip freeze | grep -q playwright && echo "true" || echo "false") \
    && if [ "$PLAYWRIGHT_INSTALLED" = "true" ]; then \
        echo "Playwright already installed, excluding from uv sync" \
        && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact --no-install-package playwright; \
    else \
        echo "Playwright not found, installing all dependencies" \
        && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact; \
    fi \
    && echo "All installed Python packages:" \
    && pip freeze
# % elif cookiecutter.package_manager == 'pip'
RUN pip install -U pip setuptools

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the dependencies
RUN echo "Python version:" \
 && python --version \
 && echo "Installing dependencies:" \
 # Install everything using pip, set playwright version so that it matches whatever is pre-installed in the image
 && cat requirements.txt | \
 # Replace playwright version so that it matches whatever is pre-installed in the image (the `hash` checks if playwright is installed)
    sed "s/^playwright==\(.*\)/playwright==$(hash playwright 2>/dev/null && (playwright --version | cut -d ' ' -f 2) || echo '\1')/" | \
 # Install everything using pip
    pip install -r /dev/stdin \
 && echo "All installed Python packages:" \
 && pip freeze
# % elif cookiecutter.package_manager == 'manual'
# TODO install dependencies
# % endif

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python -m compileall -q .

# % if cookiecutter.crawler_type == 'playwright-camoufox'
# Fetch camoufox files that are always needed when using camoufox.
RUN python -m camoufox fetch
# % endif

# Specify how to launch the source code of your Actor.
CMD ["python", "-m", "{{ cookiecutter.__package_name }}"]


================================================
FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/README.md
================================================
# {{cookiecutter.project_name}}

Project skeleton generated by Crawlee ({{ cookiecutter.crawler_type | capitalize }} template).

## Usage

{% if cookiecutter.package_manager == 'poetry' -%}
To get started, ensure you have [Poetry](https://python-poetry.org/), a package and dependency management system, installed on your machine. We recommend installing it with the following command:

```sh
pipx install poetry
```

Next, install the project dependencies:

```sh
poetry install
```

Finally, launch the crawler with:

```sh
poetry run python -m {{cookiecutter.__package_name}}
```
{% elif cookiecutter.package_manager == 'pip' -%}
To install dependencies, you can run the following command:

```sh
python -m pip install .
```

When the dependencies are installed, you may launch the crawler with:

```sh
python -m {{cookiecutter.__package_name}}
```

{% elif cookiecutter.package_manager == 'uv' -%}
To get started, ensure you have [UV](https://docs.astral.sh/uv/), a package and dependency management system, installed on your machine. We recommend installing it with the following command:

```sh
pipx install uv
```

Next, install the project dependencies:

```sh
uv sync
```

Finally, launch the crawler with:

```sh
uv run python -m {{cookiecutter.__package_name}}
```
{% elif cookiecutter.package_manager == 'pip' -%}
To install dependencies, you can run the following command:

```sh
python -m pip install .
```

When the dependencies are installed, you may launch the crawler with:

```sh
python -m {{cookiecutter.__package_name}}
```
{% elif cookiecutter.package_manager == 'manual' -%}
You selected the manual dependency installation method, so you're on your own. There is a simple `requirements.txt` file to get you started.
{% endif %}


================================================
FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml
================================================
# % if cookiecutter.crawler_type.startswith('playwright')
# % set extras = ['playwright']
# % else
# % set extras = [cookiecutter.crawler_type]
# % endif
# % if cookiecutter.http_client == 'curl-impersonate'
# % do extras.append('curl-impersonate')
# % elif cookiecutter.http_client == 'httpx'
# % do extras.append('httpx')
# % endif

[project]
name = "{{cookiecutter.project_name}}"
version = "0.0.1"
description = ""
authors = [
    {name = "Your Name",email = "you@example.com"}
]
readme = "README.md"
requires-python = ">=3.10,<4.0"
dependencies = [
    "crawlee[{{ extras|join(',') }}]",
    # % if cookiecutter.crawler_type == 'playwright-camoufox'
    "camoufox[geoip]~=0.4.5",
    # % endif
    # % if cookiecutter.enable_apify_integration
    "apify",
    # % endif
]

# % if cookiecutter.package_manager == 'poetry'
[tool.poetry]
package-mode = false

[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"
# % endif


================================================
FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt
================================================
# % if cookiecutter.crawler_type == 'playwright-camoufox'
camoufox[geoip]~=0.4.5
# % endif
# % if cookiecutter.crawler_type.startswith('playwright')
# % set extras = ['playwright']
# % else
# % set extras = [cookiecutter.crawler_type]
# % endif
# % if cookiecutter.enable_apify_integration
apify
# % endif
# % if cookiecutter.http_client == 'curl-impersonate'
# % do extras.append('curl-impersonate')
# % endif
# % if cookiecutter.http_client == 'httpx'
# % do extras.append('httpx')
# % endif
crawlee[{{ extras | join(',') }}]


================================================
FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py
================================================


================================================
FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py
================================================
import asyncio
# % if cookiecutter.http_client == 'curl-impersonate'
import platform
# % if 'playwright' in cookiecutter.crawler_type
import warnings
# % endif
# % endif
{{ '' }}
from .main import main

if __name__ == '__main__':
    # % if cookiecutter.http_client == 'curl-impersonate'
    if platform.system() == 'Windows':
        # This mitigates a warning raised by curl-cffi.
        # % if 'playwright' in cookiecutter.crawler_type
        warnings.warn(
            message=('curl-cffi suggests using WindowsSelectorEventLoopPolicy, but this conflicts with Playwright. '
                     'Ignore the curl-cffi warning.'),
            category=UserWarning,
            stacklevel=2,
        )
        # % else
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
        # % endif
    # % endif
{{ '' }}
    asyncio.run(main())


================================================
FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py
================================================
# % include 'main_%s.py' % cookiecutter.__crawler_type


================================================
FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py
================================================
# % if cookiecutter.crawler_type.startswith('playwright')
# % include 'routes_playwright.py'
# % else
# % include 'routes_%s.py' % cookiecutter.__crawler_type
# % endif


================================================
FILE: src/crawlee/proxy_configuration.py
================================================
from __future__ import annotations

import inspect
from collections import defaultdict
from dataclasses import dataclass
from typing import TYPE_CHECKING

from more_itertools import flatten
from pydantic import AnyHttpUrl, TypeAdapter
from typing_extensions import Protocol
from yarl import URL

from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from collections.abc import Awaitable, Sequence

    from crawlee import Request

__all__ = ['ProxyConfiguration', 'ProxyInfo']


@dataclass
@docs_group('Other')
class ProxyInfo:
    """Provides information about a proxy connection that is used for requests.

    Instances are built by `ProxyConfiguration.new_proxy_info` from the components of the selected proxy URL.
    Only `url`, `scheme`, `hostname` and `port` are mandatory; the remaining fields fall back to an empty
    string or `None` when the corresponding piece of information is not available.
    """

    url: str
    """The URL of the proxy."""

    scheme: str
    """The scheme of the proxy."""

    hostname: str
    """The hostname of the proxy."""

    port: int
    """The proxy port."""

    username: str = ''
    """The username for the proxy."""

    password: str = ''
    """The password for the proxy."""

    session_id: str | None = None
    """The identifier of the used proxy session, if used.
    Using the same session ID guarantees getting the same proxy URL."""

    proxy_tier: int | None = None
    """The tier of the proxy."""


@docs_group('Configuration')
class ProxyConfiguration:
    """Configures connection to a proxy server with the provided options.

    Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or
    blacklists. Setting proxy configuration in your crawlers automatically configures them to use the selected proxies
    for all connections. Details about the proxy picked for a connection (its URL and other attributes) are exposed
    through `ProxyInfo` objects returned by `new_proxy_info`.

    If you want to use your own proxies, pass them through the `proxy_urls` argument. The provided list of proxy URLs
    is then rotated by the configuration.
    """

    def __init__(
        self,
        *,
        proxy_urls: list[str | None] | None = None,
        new_url_function: _NewUrlFunction | None = None,
        tiered_proxy_urls: list[list[str | None]] | None = None,
    ) -> None:
        """Initialize a new instance.

        Exactly one of `proxy_urls`, `tiered_proxy_urls` or `new_url_function` must be specified.

        Args:
            proxy_urls: A list of URLs of proxies that will be rotated in a round-robin fashion
            tiered_proxy_urls: A list of URL tiers (where a tier is a list of proxy URLs). Crawlers will automatically
                try to use the lowest tier (smallest index) where blocking does not happen. The proxy URLs in
                the selected tier will be rotated in a round-robin fashion.
            new_url_function: A function that returns a proxy URL for a given Request. This provides full control over
                the proxy selection mechanism.

        Raises:
            ValueError: If none or more than one of the three options is provided (empty lists count as
                not provided).
        """
        self._next_custom_url_index = 0
        self._used_proxy_urls: dict[str, URL | None] = {}
        self._url_validator = TypeAdapter(AnyHttpUrl)

        # Tiers are flattened first, so a list that holds only empty tiers counts as "not provided".
        flat_tier_urls = list(flatten(tiered_proxy_urls or []))
        provided = [bool(option) for option in (proxy_urls, new_url_function, flat_tier_urls)]
        if provided.count(True) != 1:
            raise ValueError(
                'Exactly one of `proxy_urls`, `tiered_proxy_urls` and `new_url_function` '
                'must be specified (and non-empty).'
            )

        self._proxy_urls: list[URL | None] = []
        if proxy_urls:
            self._proxy_urls = [self._create_url(url) for url in proxy_urls]

        self._proxy_tier_tracker: _ProxyTierTracker | None = None
        if tiered_proxy_urls:
            self._proxy_tier_tracker = _ProxyTierTracker(
                [[self._create_url(url) for url in tier] for tier in tiered_proxy_urls]
            )

        self._new_url_function = new_url_function

    def _create_url(self, url: str | None) -> URL | None:
        """Validate the input string as an HTTP(S) URL and wrap it in `URL`.

        `None` is passed through unchanged - it means that intentionally no proxy should be used.
        """
        if url is not None:
            self._url_validator.validate_python(url)
            return URL(url)

        return None

    async def new_proxy_info(
        self, session_id: str | None, request: Request | None, proxy_tier: int | None
    ) -> ProxyInfo | None:
        """Return a new ProxyInfo object based on the configured proxy rotation strategy.

        Args:
            session_id: Session identifier. If provided, same proxy URL will be returned for
                subsequent calls with this ID. Will be auto-generated for tiered proxies if
                not provided.
            request: Request object used for proxy rotation and tier selection. Required for
                tiered proxies to track retries and adjust tier accordingly.
            proxy_tier: Specific proxy tier to use. If not provided, will be automatically
                selected based on configuration.

        Returns:
            The proxy description, or `None` when the selected entry means "no proxy".

        Raises:
            ValueError: If the selected URL has no port or no host.
        """
        tiers_in_use = self._proxy_tier_tracker is not None
        if tiers_in_use and session_id is None:
            session_id = crypto_random_object_id(6)

        picked_url, picked_tier = await self._pick_url(session_id, request, proxy_tier)

        if picked_url is None:
            return None

        if picked_url.port is None:
            raise ValueError(f'Port is None for URL: {picked_url}')

        if picked_url.host is None:
            raise ValueError(f'Host is None for URL: {picked_url}')

        # `session_id` and `proxy_tier` default to None on ProxyInfo, so they can be passed through as they are.
        return ProxyInfo(
            url=str(picked_url),
            scheme=picked_url.scheme,
            hostname=picked_url.host,
            port=picked_url.port,
            username=picked_url.user or '',
            password=picked_url.password or '',
            session_id=session_id,
            proxy_tier=picked_tier,
        )

    async def new_url(
        self, session_id: str | None = None, request: Request | None = None, proxy_tier: int | None = None
    ) -> str | None:
        """Return a proxy URL string based on the configured proxy rotation strategy.

        Args:
            session_id: Session identifier. If provided, same proxy URL will be returned for
                subsequent calls with this ID. Will be auto-generated for tiered proxies if
                not provided.
            request: Request object used for proxy rotation and tier selection. Required for
                tiered proxies to track retries and adjust tier accordingly.
            proxy_tier: Specific proxy tier to use. If not provided, will be automatically
                selected based on configuration.
        """
        info = await self.new_proxy_info(session_id, request, proxy_tier)
        if not info:
            return None
        return info.url

    async def _pick_url(
        self, session_id: str | None, request: Request | None, proxy_tier: int | None
    ) -> tuple[URL | None, int | None]:
        """Select the proxy URL and the proxy tier (if tiers are used) for the given session and request."""
        url_factory = self._new_url_function
        if url_factory:
            try:
                produced = url_factory(session_id, request)
                # The user function may be either synchronous or asynchronous.
                if inspect.isawaitable(produced):
                    produced = await produced

                if produced is None:
                    return None, None
                return URL(str(produced)), None
            except Exception as exc:
                raise ValueError('The provided "new_url_function" did not return a valid URL') from exc

        tracker = self._proxy_tier_tracker
        tier = proxy_tier

        if tracker:
            if request is not None and tier is None:
                hostname = URL(request.url).host
                if hostname is None:
                    raise ValueError('The request URL does not have a hostname')

                # A request that already carries a tier is reported as an error against that tier.
                previous_tier = request.last_proxy_tier
                if previous_tier is not None:
                    tracker.add_error(hostname, previous_tier)

                tier = tracker.predict_tier(hostname)

                request.last_proxy_tier = tier
                request.forefront = True

            candidates = tracker.all_urls if tier is None else tracker.get_tier_urls(tier)
        elif self._proxy_urls:
            candidates = self._proxy_urls
        else:
            raise RuntimeError('Invalid state')

        # A session that was seen before keeps the URL that was assigned to it.
        if session_id is not None and session_id in self._used_proxy_urls:
            return self._used_proxy_urls[session_id], tier

        # Otherwise take the next URL in round-robin order.
        rotated = candidates[self._next_custom_url_index % len(candidates)]
        self._next_custom_url_index += 1

        if session_id is not None:
            self._used_proxy_urls[session_id] = rotated

        return rotated, tier


class _ProxyTierTracker:
    """Keeps, per crawled domain, the proxy tier currently in use and an error score for every tier."""

    def __init__(self, tiered_proxy_urls: list[list[URL | None]]) -> None:
        self._tiered_proxy_urls = tiered_proxy_urls
        # One error score per tier, created lazily for each domain.
        self._histogram_by_domain = defaultdict[str, list[int]](lambda: [0] * len(tiered_proxy_urls))
        # Every domain starts on the lowest tier.
        self._current_tier_by_domain = defaultdict[str, int](int)

    @property
    def all_urls(self) -> Sequence[URL | None]:
        """All proxy URLs from all tiers as one flat list."""
        return list(flatten(self._tiered_proxy_urls))

    def get_tier_urls(self, tier_number: int) -> Sequence[URL | None]:
        """Return the proxy URLs that belong to the tier with the given index."""
        return self._tiered_proxy_urls[tier_number]

    def add_error(self, domain: str, tier: int) -> None:
        """Raise the error score of `tier` for `domain` by 10."""
        self._histogram_by_domain[domain][tier] += 10

    def predict_tier(self, domain: str) -> int:
        """Update and return the tier that should be used for `domain`.

        Each call lowers the positive scores of all tiers except the current one by 1. The current tier is
        then abandoned for a neighbouring tier when that neighbour has a lower score; when the current
        score merely equals the score of the tier below, the lower tier is chosen.
        """
        scores = self._histogram_by_domain[domain]
        active = self._current_tier_by_domain[domain]

        # Let the errors recorded for the other tiers fade out over time.
        for position in range(len(scores)):
            if position != active and scores[position] > 0:
                scores[position] -= 1

        # Missing neighbours are treated as infinitely bad, so they can never be selected.
        lower = scores[active - 1] if active > 0 else float('inf')
        upper = scores[active + 1] if active < len(scores) - 1 else float('inf')

        if scores[active] > min(lower, upper):
            if lower <= upper:
                self._current_tier_by_domain[domain] = active - 1
            else:
                self._current_tier_by_domain[domain] = active + 1
        elif scores[active] == lower:
            self._current_tier_by_domain[domain] -= 1

        return self._current_tier_by_domain[domain]


class _NewUrlFunction(Protocol):
    """Signature of a custom proxy URL provider accepted by `ProxyConfiguration`.

    The callable receives the session ID and the request (either may be `None`) and returns the proxy URL
    as a string, or `None`. It may also return an awaitable that resolves to one of those.
    """

    def __call__(
        self,
        session_id: str | None = None,
        request: Request | None = None,
    ) -> str | None | Awaitable[str | None]: ...


================================================
FILE: src/crawlee/py.typed
================================================


================================================
FILE: src/crawlee/request_loaders/__init__.py
================================================
from ._request_list import RequestList
from ._request_loader import RequestLoader
from ._request_manager import RequestManager
from ._request_manager_tandem import RequestManagerTandem
from ._sitemap_request_loader import SitemapRequestLoader

__all__ = ['RequestList', 'RequestLoader', 'RequestManager', 'RequestManagerTandem', 'SitemapRequestLoader']


================================================
FILE: src/crawlee/request_loaders/_request_list.py
================================================
from __future__ import annotations

import asyncio
import contextlib
from collections.abc import AsyncGenerator, AsyncIterable, AsyncIterator, Iterable
from logging import getLogger
from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import override

from crawlee._request import Request
from crawlee._utils.docs import docs_group
from crawlee.request_loaders._request_loader import RequestLoader

logger = getLogger(__name__)


class RequestListState(BaseModel):
    """Progress information of a `RequestList`, kept in a `RecoverableState`."""

    # Validation by both the Python field names and the camelCase aliases is enabled.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # Number of requests that were already handed out by `RequestList.fetch_next_request`.
    next_index: Annotated[int, Field(alias='nextIndex')] = 0
    # Unique key of the request expected at `next_index`; used to check a restored state against the iterator.
    next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
    # Unique keys of requests that were fetched but not yet marked as handled.
    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()


class RequestListData(BaseModel):
    """Snapshot of the requests of a `RequestList`, used when request data persistence is enabled."""

    # The complete list of requests collected from the source iterator.
    requests: Annotated[list[Request], Field()]


@docs_group('Request loaders')
class RequestList(RequestLoader):
    """Represents a (potentially very large) list of URLs to crawl.

    The requests are consumed lazily from a synchronous or asynchronous iterable. The list keeps a lookahead
    buffer of up to two requests (`self._next`), and tracks its progress in a `RecoverableState` so that it can
    optionally be persisted.
    """

    def __init__(
        self,
        requests: Iterable[str | Request] | AsyncIterable[str | Request] | None = None,
        name: str | None = None,
        persist_state_key: str | None = None,
        persist_requests_key: str | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            requests: The request objects (or their string representations) to be added to the provider.
            name: A name of the request list.
            persist_state_key: A key for persisting the progress information of the RequestList.
                If you do not pass a key but pass a `name`, a key will be derived using the name.
                Otherwise, state will not be persisted.
            persist_requests_key: A key for persisting the request data loaded from the `requests` iterator.
                If specified, the request data will be stored in the KeyValueStore to make sure that they don't change
                over time. This is useful if the `requests` iterator pulls the data dynamically.
        """
        from crawlee._utils.recoverable_state import RecoverableState  # noqa: PLC0415

        self._name = name
        self._handled_count = 0
        self._assumed_total_count = 0

        # Lookahead buffer: (request to return next, request that follows it).
        self._next: tuple[Request | None, Request | None] = (None, None)

        if persist_state_key is None and name is not None:
            persist_state_key = f'SDK_REQUEST_LIST_STATE-{name}'

        self._state = RecoverableState(
            default_state=RequestListState(),
            persistence_enabled=bool(persist_state_key),
            persist_state_key=persist_state_key or '',
            logger=logger,
        )

        self._persist_request_data = bool(persist_requests_key)

        self._requests_data = RecoverableState(
            default_state=RequestListData(requests=[]),
            # With request data persistence enabled, a snapshot of the requests will be done on initialization
            persistence_enabled='explicit_only' if self._persist_request_data else False,
            persist_state_key=persist_requests_key or '',
            logger=logger,
        )

        # Whatever the input was, the requests are consumed through an async iterator from now on.
        self._requests: AsyncIterator[str | Request]
        if isinstance(requests, AsyncIterable):
            self._requests = requests.__aiter__()  # ty: ignore[invalid-assignment]
        elif requests is None:
            self._requests = self._iterate_in_threadpool([])
        else:
            self._requests = self._iterate_in_threadpool(requests)

        # Created lazily in `_get_state` / `_ensure_next_request`.
        self._requests_lock: asyncio.Lock | None = None

    async def _get_state(self) -> RequestListState:
        """Return the progress state, performing the one-time initialization on the first call.

        The initialization sets up both recoverable states, positions the request iterator at
        `next_index` and checks that the request found there matches the stored `next_unique_key`.

        Raises:
            RuntimeError: If the request at the stored index has a different unique key than the stored one.
        """
        # If state is already initialized, we are done
        if self._state.is_initialized:
            return self._state.current_value

        # Initialize recoverable state
        await self._state.initialize()
        await self._requests_data.initialize()

        # Initialize lock if necessary
        if self._requests_lock is None:
            self._requests_lock = asyncio.Lock()

        # If the RequestList is configured to persist request data, ensure that a copy of request data is used
        if self._persist_request_data:
            async with self._requests_lock:
                if not await self._requests_data.has_persisted_state():
                    # No snapshot yet - drain the source iterator completely and store the result.
                    self._requests_data.current_value.requests = [
                        request if isinstance(request, Request) else Request.from_url(request)
                        async for request in self._requests
                    ]
                    await self._requests_data.persist_state()

                # Continue from the snapshot, skipping the requests that were already handed out.
                self._requests = self._iterate_in_threadpool(
                    self._requests_data.current_value.requests[self._state.current_value.next_index :]
                )
        # If not using persistent request data, advance the request iterator
        else:
            async with self._requests_lock:
                for _ in range(self._state.current_value.next_index):
                    # An iterator shorter than `next_index` is tolerated here.
                    with contextlib.suppress(StopAsyncIteration):
                        await self._requests.__anext__()

        # Check consistency of the stored state and the request iterator
        if (unique_key_to_check := self._state.current_value.next_unique_key) is not None:
            # The nested `_get_state` call made by `_ensure_next_request` returns early at the
            # `is_initialized` check above (assuming `initialize()` set that flag - TODO confirm).
            await self._ensure_next_request()

            next_unique_key = self._next[0].unique_key if self._next[0] is not None else None
            if next_unique_key != unique_key_to_check:
                raise RuntimeError(
                    f"""Mismatch at index {
                        self._state.current_value.next_index
                    } in persisted requests - Expected unique key `{unique_key_to_check}`, got `{next_unique_key}`"""
                )

        return self._state.current_value

    @property
    def name(self) -> str | None:
        """The name passed to the constructor, if any."""
        return self._name

    @override
    async def get_handled_count(self) -> int:
        # Counts the `mark_request_as_handled` calls made on this instance.
        return self._handled_count

    @override
    async def get_total_count(self) -> int:
        # This is the number of requests fetched so far - the source iterator is never counted upfront.
        return self._assumed_total_count

    @override
    async def is_empty(self) -> bool:
        # The list is empty once the lookahead buffer cannot be filled anymore.
        await self._ensure_next_request()
        return self._next[0] is None

    @override
    async def is_finished(self) -> bool:
        state = await self._get_state()
        return len(state.in_progress) == 0 and await self.is_empty()

    @override
    async def fetch_next_request(self) -> Request | None:
        await self._get_state()
        await self._ensure_next_request()

        if self._next[0] is None:
            return None

        state = await self._get_state()
        state.in_progress.add(self._next[0].unique_key)
        self._assumed_total_count += 1

        next_request = self._next[0]
        # Always true here because of the early return above.
        if next_request is not None:
            state.next_index += 1
            state.next_unique_key = self._next[1].unique_key if self._next[1] is not None else None

        # Shift the lookahead buffer and refill its second slot.
        self._next = (self._next[1], None)
        await self._ensure_next_request()

        return next_request

    @override
    async def mark_request_as_handled(self, request: Request) -> None:
        self._handled_count += 1
        state = await self._get_state()
        # `set.remove` raises KeyError when the request is not tracked as in progress.
        state.in_progress.remove(request.unique_key)

    async def _ensure_next_request(self) -> None:
        """Fill the empty slots of the `self._next` lookahead buffer from the request iterator."""
        await self._get_state()

        if self._requests_lock is None:
            self._requests_lock = asyncio.Lock()

        async with self._requests_lock:
            if None in self._next:
                if self._next[0] is None:
                    # Both slots are refilled when the first one is empty.
                    to_enqueue = [item async for item in self._dequeue_requests(2)]
                    self._next = (to_enqueue[0], to_enqueue[1])
                else:
                    # Only the second slot is missing.
                    to_enqueue = [item async for item in self._dequeue_requests(1)]
                    self._next = (self._next[0], to_enqueue[0])

    async def _dequeue_requests(self, count: int) -> AsyncGenerator[Request | None]:
        """Yield exactly `count` items from the request iterator, using `None` once it is exhausted."""
        for _ in range(count):
            try:
                # `_transform_request` is not defined in this class - presumably inherited from
                # `RequestLoader`; verify against the base class.
                yield self._transform_request(await self._requests.__anext__())
            except StopAsyncIteration:  # noqa: PERF203
                yield None

    async def _iterate_in_threadpool(self, iterable: Iterable[str | Request]) -> AsyncIterator[str | Request]:
        """Expose a synchronous iterable as an async iterator, fetching each item via `asyncio.to_thread`.

        Inspired by a function of the same name from encode/starlette.
        """
        iterator = iter(iterable)

        class _StopIteration(Exception):  # noqa: N818
            pass

        def _next() -> str | Request:
            # We can't raise `StopIteration` from within the threadpool iterator
            # and catch it outside that context, so we coerce them into a different
            # exception type.
            try:
                return next(iterator)
            except StopIteration:
                raise _StopIteration  # noqa: B904

        try:
            while True:
                yield await asyncio.to_thread(_next)
        except _StopIteration:
            return


================================================
FILE: src/crawlee/request_loaders/_request_loader.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

from crawlee import Request
from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from collections.abc import Sequence

    from crawlee.request_loaders import RequestManager, RequestManagerTandem
    from crawlee.storage_clients.models import ProcessedRequest


@docs_group('Request loaders')
class RequestLoader(ABC):
    """Abstract interface for classes that expose a read-only stream of crawling requests.

    A request loader hands out requests for processing and tracks which of them have been handled.

    Implementations are responsible for:
        - Providing the next request that should be processed.
        - Recording a request as handled once its processing is over.
        - Reporting state information such as the total and handled request counts.
    """

    @abstractmethod
    async def get_handled_count(self) -> int:
        """Return how many requests from this loader have been handled so far."""

    @abstractmethod
    async def get_total_count(self) -> int:
        """Return an offline approximation of the number of requests in the loader (i.e. pending + handled)."""

    @abstractmethod
    async def is_empty(self) -> bool:
        """Return True when the loader has no more requests to hand out (some may still be unfinished)."""

    @abstractmethod
    async def is_finished(self) -> bool:
        """Return True once every request has been handled."""

    @abstractmethod
    async def fetch_next_request(self) -> Request | None:
        """Return the next request to process, or `None` when no pending requests remain.

        `None` should be returned if and only if `is_finished` would return `True`. Otherwise the method should
        wait until a request becomes available.
        """

    @abstractmethod
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Record a request as handled, either after successful processing or after giving up on retries."""

    async def to_tandem(self, request_manager: RequestManager | None = None) -> RequestManagerTandem:
        """Pair this loader with a request manager so that requests can also be added and reclaimed.

        Args:
            request_manager: Request manager to combine the loader with.
                If None is given, the default request queue is used.
        """
        # Import here to avoid circular imports.
        from crawlee.request_loaders import RequestManagerTandem  # noqa: PLC0415
        from crawlee.storages import RequestQueue  # noqa: PLC0415

        manager = request_manager if request_manager is not None else await RequestQueue.open()
        return RequestManagerTandem(self, manager)

    def _transform_request(self, request: str | Request) -> Request:
        """Convert a request-like object (a URL string or a `Request`) into a `Request`."""
        if isinstance(request, Request):
            return request

        if not isinstance(request, str):
            raise ValueError(f'Invalid request type: {type(request)}')

        return Request.from_url(request)

    def _transform_requests(self, requests: Sequence[str | Request]) -> list[Request]:
        """Convert request-like objects into `Request` objects, keeping the first one seen per `unique_key`."""
        unique_requests: dict[str, Request] = {}

        for item in requests:
            candidate = self._transform_request(item)
            if candidate.unique_key not in unique_requests:
                unique_requests[candidate.unique_key] = candidate

        return list(unique_requests.values())


================================================
FILE: src/crawlee/request_loaders/_request_manager.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from datetime import timedelta
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group
from crawlee.request_loaders._request_loader import RequestLoader
from crawlee.storage_clients.models import ProcessedRequest

if TYPE_CHECKING:
    from collections.abc import Sequence

    from crawlee._request import Request


@docs_group('Request loaders')
class RequestManager(RequestLoader, ABC):
    """Base class that extends `RequestLoader` with the capability to enqueue new requests and reclaim failed ones."""

    @abstractmethod
    async def drop(self) -> None:
        """Remove persistent state either from the Apify Cloud storage or from the local database."""

    @abstractmethod
    async def add_request(
        self,
        request: str | Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Add a single request to the manager and store it in underlying resource client.

        Args:
            request: The request object (or its string representation) to be added to the manager.
            forefront: Determines whether the request should be added to the beginning (if True) or the end (if False)
                of the manager.

        Returns:
            Information about the request addition to the manager or None if the request was not added.
        """

    async def add_requests(
        self,
        requests: Sequence[str | Request],
        *,
        forefront: bool = False,
        batch_size: int = 1000,  # noqa: ARG002
        wait_time_between_batches: timedelta = timedelta(seconds=1),  # noqa: ARG002
        wait_for_all_requests_to_be_added: bool = False,  # noqa: ARG002
        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,  # noqa: ARG002
    ) -> None:
        """Add requests to the manager in batches.

        This default implementation simply awaits `add_request` for each item in order. The batching and waiting
        parameters are accepted for interface compatibility but are not used here; subclasses may override this
        method to honor them.

        Args:
            requests: Requests to enqueue.
            forefront: If True, add requests to the beginning of the queue.
            batch_size: The number of requests to add in one batch.
            wait_time_between_batches: Time to wait between adding batches.
            wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.
            wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.
        """
        # Default and dumb implementation: add the requests one by one. The per-request results are not
        # collected because this method returns nothing.
        for request in requests:
            await self.add_request(request, forefront=forefront)

    @abstractmethod
    async def reclaim_request(self, request: Request, *, forefront: bool = False) -> ProcessedRequest | None:
        """Reclaims a failed request back to the source, so that it can be returned for processing later again.

        It is possible to modify the request data by supplying an updated request as a parameter.
        """


================================================
FILE: src/crawlee/request_loaders/_request_manager_tandem.py
================================================
from __future__ import annotations

from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING

from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee.request_loaders import RequestManager

if TYPE_CHECKING:
    from collections.abc import Sequence

    from crawlee import Request
    from crawlee.request_loaders import RequestLoader
    from crawlee.storage_clients.models import ProcessedRequest


# Module-level logger named after this module.
logger = getLogger(__name__)


@docs_group('Request loaders')
class RequestManagerTandem(RequestManager):
    """Implements a tandem behaviour for a pair of `RequestLoader` and `RequestManager`.

    In this scenario, the contents of the "loader" get transferred into the "manager", allowing processing the requests
    from both sources and also enqueueing new requests (not possible with plain `RequestLoader`).
    """

    def __init__(self, request_loader: RequestLoader, request_manager: RequestManager) -> None:
        """Initialize the tandem.

        Args:
            request_loader: Read-only source whose requests get transferred into the manager.
            request_manager: Read-write manager that all processing is delegated to.
        """
        self._read_only_loader = request_loader
        self._read_write_manager = request_manager

    @override
    async def get_handled_count(self) -> int:
        return await self._read_write_manager.get_handled_count()

    @override
    async def get_total_count(self) -> int:
        return (await self._read_only_loader.get_total_count()) + (await self._read_write_manager.get_total_count())

    @override
    async def is_empty(self) -> bool:
        return (await self._read_only_loader.is_empty()) and (await self._read_write_manager.is_empty())

    @override
    async def is_finished(self) -> bool:
        return (await self._read_only_loader.is_finished()) and (await self._read_write_manager.is_finished())

    @override
    async def add_request(self, request: str | Request, *, forefront: bool = False) -> ProcessedRequest | None:
        return await self._read_write_manager.add_request(request, forefront=forefront)

    @override
    async def add_requests(
        self,
        requests: Sequence[str | Request],
        *,
        forefront: bool = False,
        batch_size: int = 1000,
        wait_time_between_batches: timedelta = timedelta(seconds=1),
        wait_for_all_requests_to_be_added: bool = False,
        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,
    ) -> None:
        return await self._read_write_manager.add_requests(
            requests,
            forefront=forefront,
            batch_size=batch_size,
            wait_time_between_batches=wait_time_between_batches,
            wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,
            wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout,
        )

    @override
    async def fetch_next_request(self) -> Request | None:
        """Move one request from the loader into the manager (if any) and fetch the next request from the manager.

        Returns:
            The next request from the manager, or `None` when the manager has nothing to offer or when
            transferring a request from the loader into the manager failed.
        """
        if await self._read_only_loader.is_finished():
            return await self._read_write_manager.fetch_next_request()

        request = await self._read_only_loader.fetch_next_request()

        if not request:
            return await self._read_write_manager.fetch_next_request()

        try:
            await self._read_write_manager.add_request(request, forefront=True)
        except Exception:
            logger.exception(
                'Adding request from the RequestLoader to the RequestManager failed, the request has been dropped',
                extra={'url': request.url, 'unique_key': request.unique_key},
            )
            # The request is dropped for good. Tell the loader so, otherwise it would keep the request
            # "in progress" forever and its `is_finished` (and therefore ours) could never become True.
            await self._read_only_loader.mark_request_as_handled(request)
            return None

        await self._read_only_loader.mark_request_as_handled(request)

        return await self._read_write_manager.fetch_next_request()

    @override
    async def reclaim_request(self, request: Request, *, forefront: bool = False) -> ProcessedRequest | None:
        # Propagate the manager's result, consistently with `add_request` and the base class contract.
        return await self._read_write_manager.reclaim_request(request, forefront=forefront)

    @override
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        # Propagate the manager's result, consistently with `add_request` and the base class contract.
        return await self._read_write_manager.mark_request_as_handled(request)

    @override
    async def drop(self) -> None:
        await self._read_write_manager.drop()


================================================
FILE: src/crawlee/request_loaders/_sitemap_request_loader.py
================================================
from __future__ import annotations

import asyncio
from collections import deque
from contextlib import suppress
from logging import getLogger
from typing import TYPE_CHECKING, Annotated, Any

from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import override

from crawlee import Request, RequestOptions
from crawlee._utils.docs import docs_group
from crawlee._utils.globs import Glob
from crawlee._utils.recoverable_state import RecoverableState
from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
from crawlee.request_loaders._request_loader import RequestLoader

if TYPE_CHECKING:
    import re
    from collections.abc import Callable, Sequence
    from types import TracebackType

    from crawlee import RequestTransformAction
    from crawlee.http_clients import HttpClient
    from crawlee.proxy_configuration import ProxyInfo
    from crawlee.storage_clients.models import ProcessedRequest


# Module-level logger named after this module.
logger = getLogger(__name__)


class SitemapRequestLoaderState(BaseModel):
    """State model for persisting sitemap request loader data.

    The loader processes one sitemap at a time. The current sitemap is stored in `in_progress_sitemap_url`.
    The `parse_sitemap` function parses the sitemap and returns elements as an async iterator. Each element retrieved
    from the iterator is processed based on its type. If the element is a `NestedSitemap`, its URL is added to
    `pending_sitemap_urls` unless it is already pending or already processed (present in `processed_sitemap_urls`).
    If the element is a `SitemapUrl`, the loader checks whether it already exists in
    `current_sitemap_processed_urls`. If it exists, the loader was restarted from a saved state and the URL is skipped.

    If the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count` is
    incremented by 1. When all elements from the current sitemap iterator have been processed, `in_progress_sitemap_url`
    is set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and `current_sitemap_processed_urls` is
    cleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping any URLs that already exist in
    `processed_sitemap_urls`. If `pending_sitemap_urls` is empty, `completed` is set to `True`.

    When `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.
    When `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and
    `handled_count` is incremented by 1.

    During initial startup or restart after persistence, state validation occurs in `_get_state`. If both
    `pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a
    fresh start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the loader is
    restarting from a persisted state. If `in_progress` contains any URLs, they are moved back to `url_queue` and
    `in_progress` is cleared.
    """

    # Allow populating the model both by the snake_case field names and by the camelCase aliases declared below.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    url_queue: Annotated[deque[str], Field(alias='urlQueue')]
    """Queue of URLs extracted from sitemaps and ready for processing."""

    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()
    """Set of request URLs currently being processed."""

    pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')]
    """Queue of sitemap URLs that need to be fetched and processed."""

    in_progress_sitemap_url: Annotated[str | None, Field(alias='inProgressSitemapUrl')] = None
    """The sitemap URL currently being processed."""

    current_sitemap_processed_urls: Annotated[set[str], Field(alias='currentSitemapProcessedUrls')] = set()
    """URLs from the current sitemap that have been added to the queue."""

    processed_sitemap_urls: Annotated[set[str], Field(alias='processedSitemapUrls')] = set()
    """Set of processed sitemap URLs."""

    # NOTE: the persisted alias is `sitemapCompleted`, not `completed`.
    completed: Annotated[bool, Field(alias='sitemapCompleted')] = False
    """Whether all sitemaps have been fully processed."""

    total_count: Annotated[int, Field(alias='totalCount')] = 0
    """Total number of URLs found and added to the queue from all processed sitemaps."""

    handled_count: Annotated[int, Field(alias='handledCount')] = 0
    """Number of URLs that have been successfully handled."""


@docs_group('Request loaders')
class SitemapRequestLoader(RequestLoader):
    """A request loader that reads URLs from sitemap(s).

    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
    and the `enqueue_links` functionality.

    The loader fetches and parses sitemaps in the background, allowing crawling to start
    before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

    The loader supports state persistence, allowing it to resume from where it left off
    after interruption when a `persist_state_key` is provided during initialization.
    """

    def __init__(
        self,
        sitemap_urls: list[str],
        http_client: HttpClient,
        *,
        proxy_info: ProxyInfo | None = None,
        include: list[re.Pattern[Any] | Glob] | None = None,
        exclude: list[re.Pattern[Any] | Glob] | None = None,
        max_buffer_size: int = 200,
        persist_state_key: str | None = None,
        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
    ) -> None:
        """Initialize the sitemap request loader.

        The background loading task is created right away, so the loader must be constructed while an event loop
        is running.

        Args:
            sitemap_urls: URLs of the sitemaps to load requests from.
            http_client: The instance of `HttpClient` to use for fetching sitemaps.
            proxy_info: Optional proxy to use for fetching sitemaps.
            include: List of glob or regex patterns to include URLs.
            exclude: List of glob or regex patterns to exclude URLs.
            max_buffer_size: Maximum number of URLs to buffer in memory.
            persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                When provided, allows resuming from where it left off after interruption.
                If None, no state persistence occurs.
            transform_request_function: An optional function to transform requests
                generated by the loader. It receives `RequestOptions` with `url` and should return either
                modified `RequestOptions` or a `RequestTransformAction`.
        """
        self._http_client = http_client
        self._sitemap_urls = sitemap_urls
        self._include = include
        self._exclude = exclude
        self._proxy_info = proxy_info
        self._max_buffer_size = max_buffer_size
        self._transform_request_function = transform_request_function

        # Synchronization for queue operations
        self._queue_has_capacity = asyncio.Event()
        self._queue_has_capacity.set()
        self._queue_lock = asyncio.Lock()

        # Initialize recoverable state
        self._state = RecoverableState(
            default_state=SitemapRequestLoaderState(
                url_queue=deque(),
                pending_sitemap_urls=deque(),
            ),
            persistence_enabled=bool(persist_state_key),
            persist_state_key=persist_state_key or '',
            logger=logger,
        )

        # Start background loading
        self._loading_task = asyncio.create_task(self._load_sitemaps())

    async def _get_state(self) -> SitemapRequestLoaderState:
        """Initialize and return the current state."""
        async with self._queue_lock:
            if self._state.is_initialized:
                return self._state.current_value

            await self._state.initialize()

            # Initialize pending sitemaps on first run
            has_sitemap_for_processing = (
                self._state.current_value.pending_sitemap_urls or self._state.current_value.in_progress_sitemap_url
            )
            if not has_sitemap_for_processing and not self._state.current_value.completed:
                self._state.current_value.pending_sitemap_urls.extend(self._sitemap_urls)

            # Requests that were in flight when the state was persisted go back to the front of the queue
            if self._state.current_value.in_progress:
                self._state.current_value.url_queue.extendleft(self._state.current_value.in_progress)
                self._state.current_value.in_progress.clear()

            if (
                self._state.current_value.url_queue
                and len(self._state.current_value.url_queue) >= self._max_buffer_size
            ):
                # Notify that the queue is full
                self._queue_has_capacity.clear()

            return self._state.current_value

    def _check_url_patterns(
        self,
        target_url: str,
        include: Sequence[re.Pattern[Any] | Glob] | None,
        exclude: Sequence[re.Pattern[Any] | Glob] | None,
    ) -> bool:
        """Check if a URL matches configured include/exclude patterns."""
        # If the URL matches any `exclude` pattern, reject it
        for pattern in exclude or ():
            if isinstance(pattern, Glob):
                pattern = pattern.regexp  # noqa: PLW2901

            if pattern.match(target_url) is not None:
                return False

        # If there are no `include` patterns and the URL passed all `exclude` patterns, accept the URL
        if include is None:
            return True

        # If the URL matches any `include` pattern, accept it
        for pattern in include:
            if isinstance(pattern, Glob):
                pattern = pattern.regexp  # noqa: PLW2901

            if pattern.match(target_url) is not None:
                return True

        # The URL does not match any `include` pattern - reject it
        return False

    async def _load_sitemaps(self) -> None:
        """Load URLs from sitemaps in the background."""
        try:
            # Get actual state
            while (state := await self._get_state()) and (state.pending_sitemap_urls or state.in_progress_sitemap_url):
                # Get sitemap URL for parsing
                sitemap_url = state.in_progress_sitemap_url
                if not sitemap_url:
                    sitemap_url = state.pending_sitemap_urls.popleft()
                    # Skip processed urls
                    if sitemap_url in state.processed_sitemap_urls:
                        continue
                    state.in_progress_sitemap_url = sitemap_url

                parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)

                async for item in parse_sitemap(
                    [SitemapSource(type='url', url=sitemap_url)],
                    self._http_client,
                    proxy_info=self._proxy_info,
                    options=parse_options,
                ):
                    if isinstance(item, NestedSitemap):
                        # Add nested sitemap to queue
                        if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
                            state.pending_sitemap_urls.append(item.loc)
                        continue

                    if isinstance(item, SitemapUrl):
                        url = item.loc

                        state = await self._get_state()

                        # Skip if already processed
                        if url in state.current_sitemap_processed_urls:
                            continue

                        # Check if URL should be included
                        if not self._check_url_patterns(url, self._include, self._exclude):
                            continue

                        # Check if we have capacity in the queue
                        await self._queue_has_capacity.wait()

                        state = await self._get_state()
                        async with self._queue_lock:
                            state.url_queue.append(url)
                            state.current_sitemap_processed_urls.add(url)
                            state.total_count += 1
                            if len(state.url_queue) >= self._max_buffer_size:
                                # Notify that the queue is full
                                self._queue_has_capacity.clear()

                # Clear current sitemap after processing
                state = await self._get_state()
                current_sitemap_url = state.in_progress_sitemap_url
                state.in_progress_sitemap_url = None
                if current_sitemap_url:
                    state.processed_sitemap_urls.add(current_sitemap_url)
                state.current_sitemap_processed_urls.clear()

            # Mark as completed after processing all sitemap urls
            state.completed = True

        except Exception:
            logger.exception('Error loading sitemaps')
            raise

    @override
    async def get_total_count(self) -> int:
        """Return the total number of URLs found so far."""
        state = await self._get_state()
        return state.total_count

    @override
    async def get_handled_count(self) -> int:
        """Return the number of URLs that have been handled."""
        state = await self._get_state()
        return state.handled_count

    @override
    async def is_empty(self) -> bool:
        """Check if there are no more URLs to process."""
        state = await self._get_state()
        return not state.url_queue

    @override
    async def is_finished(self) -> bool:
        """Check if all URLs have been processed."""
        state = await self._get_state()
        return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done()

    @override
    async def fetch_next_request(self) -> Request | None:
        """Fetch the next request to process.

        Waits (polling every 0.1 s) while the queue is empty but the loader is not finished yet. URLs for which
        `transform_request_function` returns `'skip'` are discarded and not counted in the total.

        Returns:
            The next request, or `None` once the loader is finished.
        """
        while not (await self.is_finished()):
            state = await self._get_state()
            if not state.url_queue:
                await asyncio.sleep(0.1)
                continue

            async with self._queue_lock:
                if not state.url_queue:
                    # Another consumer emptied the queue while we were waiting for the lock.
                    continue

                url = state.url_queue.popleft()

                # Popping frees a buffer slot no matter what happens to the URL afterwards. Signal it right away,
                # otherwise a buffer drained purely by skipped URLs would leave the loading task blocked forever.
                if len(state.url_queue) < self._max_buffer_size:
                    self._queue_has_capacity.set()

                request_option = RequestOptions(url=url)
                if self._transform_request_function:
                    transform_request_option = self._transform_request_function(request_option)
                    if transform_request_option == 'skip':
                        state.total_count -= 1
                        continue
                    if transform_request_option != 'unchanged':
                        request_option = transform_request_option
                request = Request.from_url(**request_option)
                state.in_progress.add(request.url)

            return request

        return None

    @override
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Mark a request as successfully handled.

        Requests whose URL is not currently in progress are ignored. Always returns `None`.
        """
        state = await self._get_state()
        if request.url in state.in_progress:
            state.in_progress.remove(request.url)
            state.handled_count += 1
        return None

    async def abort_loading(self) -> None:
        """Abort the sitemap loading process."""
        if self._loading_task and not self._loading_task.done():
            self._loading_task.cancel()
            with suppress(asyncio.CancelledError):
                await self._loading_task

    async def start(self) -> None:
        """Start the sitemap loading process.

        Does nothing when a loading task is still running.
        """
        if self._loading_task and not self._loading_task.done():
            return
        self._loading_task = asyncio.create_task(self._load_sitemaps())

    async def close(self) -> None:
        """Close the request loader."""
        await self.abort_loading()
        await self._state.teardown()

    async def __aenter__(self) -> SitemapRequestLoader:
        """Enter the context manager."""
        await self.start()
        return self

    async def __aexit__(
        self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None
    ) -> None:
        """Exit the context manager."""
        await self.close()


================================================
FILE: src/crawlee/router.py
================================================
from __future__ import annotations

import asyncio
from collections.abc import Awaitable, Callable
from typing import Generic, TypeVar

from crawlee._request import RequestState
from crawlee._types import BasicCrawlingContext
from crawlee._utils.docs import docs_group

__all__ = ['Router']

from crawlee.errors import UserHandlerTimeoutError

# Type variable for the crawling context a router works with; bounded by `BasicCrawlingContext`.
TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
# Request handler signature: a callable taking the crawling context and returning an awaitable of `None`.
RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]


@docs_group('Other')
class Router(Generic[TCrawlingContext]):
    """A request dispatching system that routes requests to registered handlers based on their labels.

    The `Router` allows you to define and register request handlers for specific labels. When a request is received,
    the router invokes the corresponding `request_handler` based on the request's `label`. If no matching handler
    is found, the default handler is used.

    ### Usage

    ```python
    from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
    from crawlee.router import Router

    router = Router[HttpCrawlingContext]()


    # Handler for requests without a matching label handler
    @router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Request without label {context.request.url} ...')


    # Handler for category requests
    @router.handler(label='category')
    async def category_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Category request {context.request.url} ...')


    # Handler for product requests
    @router.handler(label='product')
    async def product_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Product {context.request.url} ...')


    async def main() -> None:
        crawler = HttpCrawler(request_handler=router)
        await crawler.run()
    ```
    """

    def __init__(self) -> None:
        self._default_handler: RequestHandler[TCrawlingContext] | None = None
        self._handlers_by_label = dict[str, RequestHandler[TCrawlingContext]]()

    def default_handler(self, handler: RequestHandler[TCrawlingContext]) -> RequestHandler[TCrawlingContext]:
        """Register a default request handler.

        The default request handler is invoked for requests that have either no label or a label for which we have
        no matching handler.

        Raises:
            RuntimeError: If a default handler is already configured.
        """
        if self._default_handler is not None:
            raise RuntimeError('A default handler is already configured')

        self._default_handler = handler

        return handler

    def handler(
        self,
        label: str,
    ) -> Callable[[RequestHandler[TCrawlingContext]], Callable[[TCrawlingContext], Awaitable]]:
        """Register a request handler based on a label.

        This decorator registers a request handler for a specific label. The handler will be invoked only for requests
        that have the exact same label.

        Raises:
            RuntimeError: If a handler for the label is already registered, either when the decorator is created
                or when it is applied.
        """
        if label in self._handlers_by_label:
            raise RuntimeError(f'A handler for label `{label}` is already registered')

        def wrapper(handler: Callable[[TCrawlingContext], Awaitable]) -> Callable[[TCrawlingContext], Awaitable]:
            # Check again at registration time: the label may have been taken between creating the decorator
            # and applying it (or the same decorator object may be applied twice).
            if label in self._handlers_by_label:
                raise RuntimeError(f'A handler for label `{label}` is already registered')

            self._handlers_by_label[label] = handler
            return handler

        return wrapper

    async def __call__(self, context: TCrawlingContext) -> None:
        """Invoke a request handler that matches the request label (or the default).

        Raises:
            RuntimeError: If no handler matches the label and no default handler is configured.
            UserHandlerTimeoutError: If the invoked handler raises `asyncio.TimeoutError`.
        """
        context.request.state = RequestState.REQUEST_HANDLER
        if context.request.label is None or context.request.label not in self._handlers_by_label:
            if self._default_handler is None:
                raise RuntimeError(
                    f'No handler matches label `{context.request.label}` and no default handler is configured'
                )

            user_defined_handler = self._default_handler
        else:
            user_defined_handler = self._handlers_by_label[context.request.label]

        try:
            return await user_defined_handler(context)
        except asyncio.TimeoutError as e:
            # Timeout in handler, but not timeout of handler.
            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e


================================================
FILE: src/crawlee/sessions/__init__.py
================================================
from ._cookies import CookieParam, SessionCookies
from ._session import Session
from ._session_pool import SessionPool

__all__ = ['CookieParam', 'Session', 'SessionCookies', 'SessionPool']


================================================
FILE: src/crawlee/sessions/_cookies.py
================================================
from __future__ import annotations

from copy import deepcopy
from http.cookiejar import Cookie, CookieJar
from typing import TYPE_CHECKING, Any, Literal

from typing_extensions import NotRequired, Required, TypedDict

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from collections.abc import Iterator
    from typing import TypeGuard


@docs_group('Session management')
class CookieParam(TypedDict, total=False):
    """Dictionary representation of cookies for `SessionCookies.set` method.

    Only `name` and `value` are required; every other key may be omitted.
    """

    name: Required[str]
    """Cookie name."""

    value: Required[str]
    """Cookie value."""

    domain: NotRequired[str]
    """Domain for which the cookie is set."""

    path: NotRequired[str]
    """Path on the specified domain for which the cookie is set."""

    secure: NotRequired[bool]
    """Set the `Secure` flag for the cookie."""

    http_only: NotRequired[bool]
    """Set the `HttpOnly` flag for the cookie."""

    # `None` is a legal value: `SessionCookies.set` accepts `expires=None` and the Playwright conversion
    # produces `None` for cookies without an expiration, so the annotation has to allow it.
    expires: NotRequired[int | None]
    """Expiration timestamp for the cookie, None (or an omitted key) for a session cookie."""

    same_site: NotRequired[Literal['Lax', 'None', 'Strict']]
    """Set the `SameSite` attribute for the cookie."""


class PlaywrightCookieParam(TypedDict, total=False):
    """Cookie parameters in Playwright format with camelCase naming.

    Counterpart of `CookieParam`. `SessionCookies._to_playwright` and `SessionCookies._from_playwright` convert
    between the two by renaming `http_only`/`same_site` to `httpOnly`/`sameSite` and back. Every key is optional.
    """

    name: NotRequired[str]
    value: NotRequired[str]
    domain: NotRequired[str]
    path: NotRequired[str]
    secure: NotRequired[bool]
    # camelCase counterpart of `CookieParam.http_only`.
    httpOnly: NotRequired[bool]
    # A float here, an int in `CookieParam`; `SessionCookies._from_playwright` maps the value -1 to `None`.
    expires: NotRequired[float]
    # camelCase counterpart of `CookieParam.same_site`.
    sameSite: NotRequired[Literal['Lax', 'None', 'Strict']]
    # No counterpart in `CookieParam`; `SessionCookies.set` ignores keyword arguments it does not know.
    partitionKey: NotRequired[str | None]


@docs_group('Session management')
class SessionCookies:
    """Cookie storage for a session, convertible to and from browser-compatible dictionaries.

    The cookies are kept in an `http.cookiejar.CookieJar`. Besides storing cookies, the class converts them to
    `CookieParam` dictionaries and to Playwright-style (camelCase) dictionaries, and back.
    """

    def __init__(self, cookies: SessionCookies | CookieJar | dict[str, str] | list[CookieParam] | None = None) -> None:
        """Initialize a new instance.

        Args:
            cookies: Initial content. A `CookieJar` is used directly as the underlying jar (it is not copied).
                A list of `CookieParam` dicts, a `name -> value` dict or another `SessionCookies` are loaded
                into a newly created jar. With `None` (or any other value) the jar starts empty.
        """
        # An existing jar is adopted as-is, so the caller and this object share it.
        self._jar = cookies if isinstance(cookies, CookieJar) else CookieJar()

        if isinstance(cookies, SessionCookies):
            for source_cookie in cookies.jar:
                self._jar.set_cookie(source_cookie)

        elif isinstance(cookies, dict):
            for cookie_name, cookie_value in cookies.items():
                self.set(cookie_name, cookie_value)

        elif isinstance(cookies, list):
            for cookie_params in cookies:
                self.set(**cookie_params)

    @property
    def jar(self) -> CookieJar:
        """The underlying `CookieJar`."""
        return self._jar

    def set(
        self,
        name: str,
        value: str,
        *,
        domain: str = '',
        path: str = '/',
        expires: int | None = None,
        http_only: bool = False,
        secure: bool = False,
        same_site: Literal['Lax', 'None', 'Strict'] | None = None,
        **_kwargs: Any,  # Unknown parameters will be ignored.
    ) -> None:
        """Build a cookie from the given attributes and put it into the jar.

        Args:
            name: Cookie name.
            value: Cookie value.
            domain: Cookie domain.
            path: Cookie path.
            expires: Cookie expiration timestamp.
            http_only: Whether cookie is HTTP-only.
            secure: Whether cookie requires secure context.
            same_site: SameSite cookie attribute value.
        """
        # `HttpOnly` has no dedicated `Cookie` field, so it travels as a non-standard attribute.
        nonstandard_attrs: dict[str, str] = {'HttpOnly': ''} if http_only else {}

        new_cookie = Cookie(
            version=0,
            name=name,
            value=value,
            port=None,
            port_specified=False,
            domain=domain,
            domain_specified=bool(domain),
            domain_initial_dot=domain.startswith('.'),
            path=path,
            path_specified=bool(path),
            secure=secure,
            expires=expires,
            discard=True,
            comment=None,
            comment_url=None,
            rest=nonstandard_attrs,
            rfc2109=False,
        )

        # `SameSite` is likewise stored as a non-standard attribute, and only when a value was given.
        if same_site:
            new_cookie.set_nonstandard_attr('SameSite', same_site)

        self.jar.set_cookie(new_cookie)

    def _convert_cookie_to_dict(self, cookie: Cookie) -> CookieParam:
        """Translate an `http.cookiejar.Cookie` into a `CookieParam` dictionary.

        `expires` is included only when the cookie has a truthy expiration, `same_site` only when the stored
        attribute is one of the accepted values.

        Args:
            cookie: Cookie object to convert.
        """
        as_dict = CookieParam(
            name=cookie.name,
            value=cookie.value or '',
            domain=cookie.domain,
            path=cookie.path,
            secure=cookie.secure,
            http_only=cookie.has_nonstandard_attr('HttpOnly'),
        )

        if cookie.expires:
            as_dict['expires'] = cookie.expires

        same_site = cookie.get_nonstandard_attr('SameSite')
        if same_site and self._is_valid_same_site(same_site):
            as_dict['same_site'] = same_site

        return as_dict

    def _to_playwright(self, cookie_dict: CookieParam) -> PlaywrightCookieParam:
        """Convert a `CookieParam` dictionary to the Playwright (camelCase) format."""
        converted: dict = dict(cookie_dict)

        # Rename snake_case keys to their camelCase counterparts.
        for snake_key, camel_key in (('http_only', 'httpOnly'), ('same_site', 'sameSite')):
            if snake_key in converted:
                converted[camel_key] = converted.pop(snake_key)

        if 'expires' in converted:
            converted['expires'] = float(converted['expires'])

        return PlaywrightCookieParam(**converted)

    def _from_playwright(self, cookie_dict: PlaywrightCookieParam) -> CookieParam:
        """Convert a Playwright (camelCase) cookie dictionary to a `CookieParam` dictionary."""
        converted: dict = dict(cookie_dict)

        # Rename camelCase keys to their snake_case counterparts.
        for camel_key, snake_key in (('httpOnly', 'http_only'), ('sameSite', 'same_site')):
            if camel_key in converted:
                converted[snake_key] = converted.pop(camel_key)

        if 'expires' in converted:
            expires_at = int(converted['expires'])
            # The value -1 is translated to "no expiration".
            converted['expires'] = None if expires_at == -1 else expires_at

        cookie_name = converted.pop('name', '')
        cookie_value = converted.pop('value', '')
        return CookieParam(name=cookie_name, value=cookie_value, **converted)

    def get_cookies_as_dicts(self) -> list[CookieParam]:
        """Return all stored cookies as a list of `CookieParam` dicts."""
        return list(map(self._convert_cookie_to_dict, self.jar))

    def store_cookie(self, cookie: Cookie) -> None:
        """Put a ready-made `Cookie` object into the session cookie jar.

        Args:
            cookie: The Cookie object to store in the jar.
        """
        self.jar.set_cookie(cookie)

    def store_cookies(self, cookies: list[Cookie]) -> None:
        """Put several `Cookie` objects into the session cookie jar, then drop expired cookies.

        Args:
            cookies: A list of cookie objects to store in the jar.
        """
        for item in cookies:
            self.store_cookie(item)

        self._jar.clear_expired_cookies()

    def set_cookies(self, cookie_dicts: list[CookieParam]) -> None:
        """Create and store cookies from `CookieParam` dictionaries, then drop expired cookies.

        Args:
            cookie_dicts: List of dictionaries where each dict represents cookie parameters.
        """
        for params in cookie_dicts:
            self.set(**params)

        self._jar.clear_expired_cookies()

    def get_cookies_as_playwright_format(self) -> list[PlaywrightCookieParam]:
        """Return all stored cookies in Playwright format."""
        return list(map(self._to_playwright, self.get_cookies_as_dicts()))

    def set_cookies_from_playwright_format(self, pw_cookies: list[PlaywrightCookieParam]) -> None:
        """Store cookies given in Playwright format, then drop expired cookies."""
        for playwright_cookie in pw_cookies:
            self.set(**self._from_playwright(playwright_cookie))

        self._jar.clear_expired_cookies()

    def __deepcopy__(self, memo: dict[int, Any] | None) -> SessionCookies:
        # This is necessary because `CookieJar` use `RLock`, which prevents `deepcopy`.
        # The copy is rebuilt from the dictionary form of the cookies instead.
        snapshot = self.get_cookies_as_dicts()
        return self.__class__(deepcopy(snapshot, memo))

    def __len__(self) -> int:
        return len(self._jar)

    def __setitem__(self, name: str, value: str) -> None:
        self.set(name, value)

    def __getitem__(self, name: str) -> str | None:
        # The first cookie with a matching name wins; domain and path are not considered.
        for candidate in self._jar:
            if candidate.name == name:
                return candidate.value

        raise KeyError(f"Cookie '{name}' not found")

    def __iter__(self) -> Iterator[CookieParam]:
        return (self._convert_cookie_to_dict(item) for item in self._jar)

    def __repr__(self) -> str:
        rendered = ', '.join(
            f'<Cookie {item.name}={item.value} for {item.domain}{item.path}>' for item in self._jar
        )
        return f'<SessionCookies[{rendered}]>'

    def __bool__(self) -> bool:
        # True as soon as the jar yields at least one cookie.
        return any(True for _ in self._jar)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, SessionCookies):
            return NotImplemented

        if len(self) != len(other):
            return False

        return self._identity_keys(self._jar) == self._identity_keys(other.jar)

    def __hash__(self) -> int:
        """Return hash based on the cookies key attributes."""
        return hash(self._identity_keys(self._jar))

    @staticmethod
    def _identity_keys(jar: CookieJar) -> frozenset[tuple[str, str | None, str, str]]:
        """Collect the `(name, value, domain, path)` tuples that identify the cookies of `jar`."""
        return frozenset((item.name, item.value, item.domain, item.path) for item in jar)

    def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal['Lax', 'None', 'Strict']]:
        """Tell whether `value` is one of the accepted `SameSite` values."""
        return value in {'Lax', 'None', 'Strict'}


================================================
FILE: src/crawlee/sessions/_models.py
================================================
from __future__ import annotations

from datetime import datetime, timedelta
from typing import Annotated, Any

from pydantic import (
    BaseModel,
    BeforeValidator,
    ConfigDict,
    Field,
    GetPydanticSchema,
    PlainSerializer,
    computed_field,
)

from ._cookies import CookieParam
from ._session import Session


class SessionModel(BaseModel):
    """Model for a Session object.

    Serializable snapshot of a `Session`: the fields correspond one-to-one to the keyword arguments of
    `Session.__init__`, and `Session.get_state` builds an instance from the session's current values.
    Every field carries a camelCase alias; the config enables validation both by field name and by alias.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    id: Annotated[str, Field(alias='id')]
    max_age: Annotated[timedelta, Field(alias='maxAge')]
    user_data: Annotated[dict, Field(alias='userData')]
    max_error_score: Annotated[float, Field(alias='maxErrorScore')]
    error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')]
    created_at: Annotated[datetime, Field(alias='createdAt')]
    usage_count: Annotated[int, Field(alias='usageCount')]
    max_usage_count: Annotated[int, Field(alias='maxUsageCount')]
    error_score: Annotated[float, Field(alias='errorScore')]
    # Cookies in their dictionary form, as produced by `SessionCookies.get_cookies_as_dicts`.
    cookies: Annotated[list[CookieParam], Field(alias='cookies')]
    blocked_status_codes: Annotated[list[int], Field(alias='blockedStatusCodes')]


def _serialize_sessions(sessions: dict[str, Session]) -> list:
    """Dump the sessions of a pool as a list of alias-keyed session states."""
    return [session.get_state().model_dump(by_alias=True) for session in sessions.values()]


def _validate_sessions(value: Any) -> dict[str, Session]:
    """Build the `session id -> Session` mapping from raw input.

    Accepts the serialized form (an iterable of session states, as produced by `_serialize_sessions`) as well as
    a mapping whose values are such states. Items that already are `Session` objects are kept as they are instead
    of being pushed through `SessionModel` validation, so a `{id: Session}` mapping is accepted too.
    """
    # Iterating a mapping directly would yield its keys; the sessions are in the values.
    items = value.values() if isinstance(value, dict) else value

    sessions: dict[str, Session] = {}
    for item in items:
        session = (
            item
            if isinstance(item, Session)
            else Session.from_model(SessionModel.model_validate(item, by_alias=True))
        )
        sessions[session.id] = session

    return sessions


class SessionPoolModel(BaseModel):
    """Model for a SessionPool object.

    `sessions` is held as a mapping of session id to `Session`, but it is serialized as a list of session states
    and rebuilt from such a list on validation.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    max_pool_size: Annotated[int, Field(alias='maxPoolSize')]

    sessions: Annotated[
        dict[
            str,
            Annotated[
                Session, GetPydanticSchema(lambda _, handler: handler(Any))
            ],  # handler(Any) is fine - we validate manually in the BeforeValidator
        ],
        Field(alias='sessions'),
        PlainSerializer(_serialize_sessions, return_type=list),
        BeforeValidator(_validate_sessions),
    ]

    @computed_field(alias='sessionCount')
    @property
    def session_count(self) -> int:
        """Get the total number of sessions currently maintained in the pool."""
        return len(self.sessions)

    @computed_field(alias='usableSessionCount')
    @property
    def usable_session_count(self) -> int:
        """Get the number of sessions that are currently usable."""
        return sum(1 for session in self.sessions.values() if session.is_usable)

    @computed_field(alias='retiredSessionCount')
    @property
    def retired_session_count(self) -> int:
        """Get the number of sessions that are no longer usable."""
        return self.session_count - self.usable_session_count


================================================
FILE: src/crawlee/sessions/_session.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.9.0/packages/core/src/session_pool/session.ts

from __future__ import annotations

from datetime import datetime, timedelta, timezone
from logging import getLogger
from typing import TYPE_CHECKING, ClassVar, Literal, overload

from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.docs import docs_group
from crawlee.sessions._cookies import CookieParam, SessionCookies

if TYPE_CHECKING:
    from http.cookiejar import CookieJar

    from crawlee.sessions._models import SessionModel

logger = getLogger(__name__)


@docs_group('Session management')
class Session:
    """Represent a single user session, managing cookies, error states, and usage limits.

    A `Session` simulates a specific user with attributes like cookies, IP (via proxy), and potentially
    a unique browser fingerprint. It maintains its internal state, which can include custom user data
    (e.g., authorization tokens or headers) and tracks its usability through metrics such as error score,
    usage count, and expiration.
    """

    _DEFAULT_BLOCKED_STATUS_CODES: ClassVar = [401, 403, 429]
    """Default status codes that indicate a session is blocked."""

    def __init__(
        self,
        *,
        id: str | None = None,
        max_age: timedelta = timedelta(minutes=50),
        user_data: dict | None = None,
        max_error_score: float = 3.0,
        error_score_decrement: float = 0.5,
        created_at: datetime | None = None,
        usage_count: int = 0,
        max_usage_count: int = 50,
        error_score: float = 0.0,
        cookies: SessionCookies | CookieJar | dict[str, str] | list[CookieParam] | None = None,
        blocked_status_codes: list[int] | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            id: Unique identifier for the session, autogenerated if not provided.
            max_age: Time duration after which the session expires.
            user_data: Custom user data associated with the session.
            max_error_score: Threshold score beyond which the session is considered blocked.
            error_score_decrement: Value by which the error score is decremented on successful operations.
            created_at: Timestamp when the session was created, defaults to current UTC time if not provided.
            usage_count: Number of times the session has been used.
            max_usage_count: Maximum allowable uses of the session before it is considered expired.
            error_score: Current error score of the session.
            cookies: Cookies associated with the session.
            blocked_status_codes: HTTP status codes that indicate a session should be blocked. The defaults
                are used when the argument is `None` or empty.
        """
        self._id = id or crypto_random_object_id(length=10)
        self._max_age = max_age
        self._user_data = user_data or {}
        self._max_error_score = max_error_score
        self._error_score_decrement = error_score_decrement
        self._created_at = created_at or datetime.now(timezone.utc)
        self._usage_count = usage_count
        self._max_usage_count = max_usage_count
        self._error_score = error_score
        # `SessionCookies` already handles `None`. No `or SessionCookies()` fallback here: an empty
        # `SessionCookies` is falsy, so such a fallback would replace an empty `CookieJar` that the caller
        # passed in to be shared, while a non-empty jar would be adopted.
        self._cookies = SessionCookies(cookies)
        self._blocked_status_codes = set(blocked_status_codes or self._DEFAULT_BLOCKED_STATUS_CODES)

    @classmethod
    def from_model(cls, model: SessionModel) -> Session:
        """Initialize a new instance from a `SessionModel`."""
        cookies = SessionCookies(model.cookies)
        return cls(**model.model_dump(exclude={'cookies'}), cookies=cookies)

    def __repr__(self) -> str:
        """Get a string representation."""
        return f'<{self.__class__.__name__} {self.get_state(as_dict=False)}>'

    def __eq__(self, other: object) -> bool:
        """Compare two sessions for equality based on their full state."""
        if not isinstance(other, Session):
            return NotImplemented
        return self.get_state(as_dict=True) == other.get_state(as_dict=True)

    def __hash__(self) -> int:
        """Return hash based on the session state."""
        state = self.get_state(as_dict=True)
        hashable_items = list[tuple[str, int]]()

        # Walk the state in key order for consistent hashing; values that are not hashable themselves
        # (cookies, lists, dicts) are reduced to a hash of an equivalent hashable form.
        for key, value in sorted(state.items()):
            if key == 'cookies':
                # Cookies are hashed through `SessionCookies.__hash__`.
                item_hash = hash(self._cookies)
            elif isinstance(value, list):
                item_hash = hash(tuple(value))
            elif isinstance(value, dict):
                item_hash = hash(tuple(sorted(value.items())))
            else:
                item_hash = hash(value)

            hashable_items.append((key, item_hash))

        return hash(tuple(hashable_items))

    @property
    def id(self) -> str:
        """Get the session ID."""
        return self._id

    @property
    def user_data(self) -> dict:
        """Get the user data."""
        return self._user_data

    @property
    def cookies(self) -> SessionCookies:
        """Get the cookies."""
        return self._cookies

    @property
    def error_score(self) -> float:
        """Get the current error score."""
        return self._error_score

    @property
    def usage_count(self) -> int:
        """Get the current usage count."""
        return self._usage_count

    @property
    def expires_at(self) -> datetime:
        """Get the expiration datetime of the session."""
        return self._created_at + self._max_age

    @property
    def is_blocked(self) -> bool:
        """Indicate whether the session is blocked based on the error score."""
        return self._error_score >= self._max_error_score

    @property
    def is_expired(self) -> bool:
        """Indicate whether the session is expired based on the current time."""
        return datetime.now(timezone.utc) >= self.expires_at

    @property
    def is_max_usage_count_reached(self) -> bool:
        """Indicate whether the session has reached its maximum usage limit."""
        return self._usage_count >= self._max_usage_count

    @property
    def is_usable(self) -> bool:
        """Determine if the session is usable for next requests."""
        return not (self.is_blocked or self.is_expired or self.is_max_usage_count_reached)

    @overload
    def get_state(self, *, as_dict: Literal[True]) -> dict: ...

    @overload
    def get_state(self, *, as_dict: Literal[False] = False) -> SessionModel: ...

    def get_state(self, *, as_dict: bool = False) -> SessionModel | dict:
        """Retrieve the current state of the session either as a model or as a dictionary.

        Args:
            as_dict: If True, return the dumped model as a dictionary; otherwise return the `SessionModel`.
        """
        from ._models import SessionModel  # noqa: PLC0415

        model = SessionModel(
            id=self._id,
            max_age=self._max_age,
            user_data=self._user_data,
            max_error_score=self._max_error_score,
            error_score_decrement=self._error_score_decrement,
            created_at=self._created_at,
            usage_count=self._usage_count,
            max_usage_count=self._max_usage_count,
            error_score=self._error_score,
            cookies=self._cookies.get_cookies_as_dicts(),
            # A set has no defined order; sorting keeps the state (and thus `__eq__`/`__hash__`, which are
            # built on it) independent of the order in which the codes were inserted.
            blocked_status_codes=sorted(self._blocked_status_codes),
        )
        if as_dict:
            return model.model_dump()
        return model

    def mark_good(self) -> None:
        """Mark the session as good. Should be called after a successful session usage."""
        self._usage_count += 1

        if self._error_score > 0:
            # Never let the score drop below zero.
            self._error_score = max(0.0, self._error_score - self._error_score_decrement)

        # Retire the session if it is not usable anymore
        if not self.is_usable:
            self.retire()

    def mark_bad(self) -> None:
        """Mark the session as bad after an unsuccessful session usage."""
        self._error_score += 1
        self._usage_count += 1

        # Retire the session if it is not usable anymore
        if not self.is_usable:
            self.retire()

    def retire(self) -> None:
        """Retire the session by raising the error score by the maximum error score.

        This method should be used if the session usage was unsuccessful and you are sure that it is because of
        the session configuration and not any external matters. For example when server returns 403 status code.
        If the session does not work due to some external factors as server error such as 5XX you probably want
        to use `mark_bad` method.
        """
        self._error_score += self._max_error_score
        self._usage_count += 1
        # Note: We emit an event here because of the Puppeteer in TS implementation.

    def is_blocked_status_code(
        self,
        *,
        status_code: int,
        ignore_http_error_status_codes: set[int] | None = None,
    ) -> bool:
        """Evaluate whether a session should be retired based on the received HTTP status code.

        Args:
            status_code: The HTTP status code received from a server response.
            ignore_http_error_status_codes: Optional status codes to allow suppression of
                codes from `blocked_status_codes`.

        Returns:
            True if the session should be retired, False otherwise.
        """
        return status_code in (self._blocked_status_codes - (ignore_http_error_status_codes or set()))


================================================
FILE: src/crawlee/sessions/_session_pool.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.9.0/packages/core/src/session_pool/session_pool.ts

from __future__ import annotations

import random
from collections.abc import Callable
from logging import getLogger
from typing import TYPE_CHECKING, Literal, overload

from crawlee import service_locator
from crawlee._utils.context import ensure_context
from crawlee._utils.docs import docs_group
from crawlee._utils.recoverable_state import RecoverableState
from crawlee.sessions import Session
from crawlee.sessions._models import SessionPoolModel

if TYPE_CHECKING:
    from types import TracebackType

    from crawlee.events import EventManager

logger = getLogger(__name__)

# Signature of a custom session factory: a callable taking no arguments and returning a `Session`.
# When one is given to `SessionPool`, it is called instead of `Session(**create_session_settings)`
# each time the pool creates a new session.
CreateSessionFunctionType = Callable[[], Session]


@docs_group('Session management')
class SessionPool:
    """A pool of sessions that are managed, rotated, and persisted based on usage and age.

    It ensures effective session management by maintaining a pool of sessions and rotating them based on
    usage count, expiration time, or custom rules. It provides methods to retrieve sessions, manage their
    lifecycle, and optionally persist the state to enable recovery.
    """

    def __init__(
        self,
        *,
        max_pool_size: int = 1000,
        create_session_settings: dict | None = None,
        create_session_function: CreateSessionFunctionType | None = None,
        event_manager: EventManager | None = None,
        persistence_enabled: bool = False,
        persist_state_kvs_name: str | None = None,
        persist_state_key: str = 'CRAWLEE_SESSION_POOL_STATE',
    ) -> None:
        """Initialize a new instance.

        Args:
            max_pool_size: Maximum number of sessions to maintain in the pool. You can add more sessions to the pool
                by using the `add_session` method.
            create_session_settings: Settings for creating new session instances. If None, default settings will
                be used. Do not set it if you are providing a `create_session_function`.
            create_session_function: A callable to create new session instances. If None, a default session settings
                will be used. Do not set it if you are providing `create_session_settings`.
            event_manager: The event manager to handle events like persist state.
            persistence_enabled: Flag to enable or disable state persistence of the pool.
            persist_state_kvs_name: The name of the `KeyValueStore` used for state persistence.
            persist_state_key: The key under which the session pool's state is stored in the `KeyValueStore`.

        Raises:
            ValueError: If both `create_session_settings` and `create_session_function` are provided.
        """
        # Validate the arguments first, so that an invalid call has no side effects (such as replacing the
        # event manager in the service locator).
        if create_session_function and create_session_settings:
            raise ValueError('Both `create_session_settings` and `create_session_function` cannot be provided.')

        if event_manager:
            service_locator.set_event_manager(event_manager)

        self._state = RecoverableState(
            default_state=SessionPoolModel(
                max_pool_size=max_pool_size,
                sessions={},
            ),
            logger=logger,
            persistence_enabled=persistence_enabled,
            persist_state_kvs_name=persist_state_kvs_name,
            persist_state_key=persist_state_key or 'CRAWLEE_SESSION_POOL_STATE',
        )

        self._max_pool_size = max_pool_size
        self._session_settings = create_session_settings or {}
        self._create_session_function = create_session_function
        self._persistence_enabled = persistence_enabled

        # Flag to indicate the context state.
        self._active = False

    def __repr__(self) -> str:
        """Get a string representation."""
        return f'<{self.__class__.__name__} {self.get_state(as_dict=False)}>'

    @property
    def session_count(self) -> int:
        """Get the total number of sessions currently maintained in the pool."""
        return len(self._state.current_value.sessions)

    @property
    def usable_session_count(self) -> int:
        """Get the number of sessions that are currently usable."""
        return self._state.current_value.usable_session_count

    @property
    def retired_session_count(self) -> int:
        """Get the number of sessions that are no longer usable."""
        return self._state.current_value.retired_session_count

    @property
    def active(self) -> bool:
        """Indicate whether the context is active."""
        return self._active

    async def __aenter__(self) -> SessionPool:
        """Initialize the pool upon entering the context manager.

        The state is initialized, sessions that are no longer usable are dropped from it, and the pool is
        filled up to `max_pool_size` if it ends up with no sessions.

        Raises:
            RuntimeError: If the context manager is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        self._active = True

        try:
            state = await self._state.initialize()
        except BaseException:
            # The pool never became usable; do not leave it flagged as active, otherwise it could neither
            # be entered again nor be told apart from a working pool.
            self._active = False
            raise

        state.max_pool_size = self._max_pool_size
        self._remove_retired_sessions()

        if not state.sessions:
            await self._fill_sessions_to_max()

        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Deinitialize the pool upon exiting the context manager.

        Raises:
            RuntimeError: If the context manager is not active.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        try:
            await self._state.teardown()
        finally:
            # The context is left even if the teardown fails.
            self._active = False

    @overload
    def get_state(self, *, as_dict: Literal[True]) -> dict: ...

    @overload
    def get_state(self, *, as_dict: Literal[False] = False) -> SessionPoolModel: ...

    @ensure_context
    def get_state(self, *, as_dict: bool = False) -> SessionPoolModel | dict:
        """Retrieve the current state of the pool either as a model or as a dictionary.

        The returned value is built from a deep copy of the current state.
        """
        model = self._state.current_value.model_copy(deep=True)
        if as_dict:
            return model.model_dump()
        return model

    @ensure_context
    def add_session(self, session: Session) -> None:
        """Add an externally created session to the pool.

        This is intended only for the cases when you want to add a session that was created outside of the pool.
        Otherwise, the pool will create new sessions automatically. If a session with the same ID is already
        in the pool, a warning is logged and the pool is left unchanged.

        Args:
            session: The session to add to the pool.
        """
        state = self._state.current_value

        if session.id in state.sessions:
            logger.warning(f'Session with ID {session.id} already exists in the pool.')
            return
        state.sessions[session.id] = session

    @ensure_context
    async def get_session(self) -> Session:
        """Retrieve a random session from the pool.

        This method first ensures the session pool is at its maximum capacity. If the random session is not usable,
        retired sessions are removed and a new session is created and returned.

        Returns:
            The session object.
        """
        await self._fill_sessions_to_max()
        session = self._get_random_session()

        if session.is_usable:
            return session

        # If the random session is not usable, clean up and create a new session
        self._remove_retired_sessions()
        return await self._create_new_session()

    @ensure_context
    async def get_session_by_id(self, session_id: str) -> Session | None:
        """Retrieve a session by ID from the pool.

        This method first ensures the session pool is at its maximum capacity. It then tries to retrieve a specific
        session by ID. If the session is not found or not usable, `None` is returned.

        Args:
            session_id: The ID of the session to retrieve.

        Returns:
            The session object if found and usable, otherwise `None`.
        """
        await self._fill_sessions_to_max()
        session = self._state.current_value.sessions.get(session_id)

        if not session:
            logger.warning(f'Session with ID {session_id} not found.')
            return None

        if not session.is_usable:
            logger.warning(f'Session with ID {session_id} is not usable.')
            return None

        return session

    async def reset_store(self) -> None:
        """Reset the KVS where the pool state is persisted."""
        await self._state.reset()

    async def _create_new_session(self) -> Session:
        """Create a new session, add it to the pool and return it.

        The session comes from `create_session_function` when one was provided, otherwise it is built from
        `create_session_settings`.
        """
        if self._create_session_function:
            new_session = self._create_session_function()
        else:
            new_session = Session(**self._session_settings)
        self._state.current_value.sessions[new_session.id] = new_session
        return new_session

    async def _fill_sessions_to_max(self) -> None:
        """Fill the pool with sessions to the maximum size."""
        # An empty range (pool already at or above the maximum) creates nothing.
        for _ in range(self._max_pool_size - self.session_count):
            await self._create_new_session()

    def _get_random_session(self) -> Session:
        """Get a random session from the pool.

        Raises:
            ValueError: If the pool contains no sessions.
        """
        state = self._state.current_value
        if not state.sessions:
            raise ValueError('No sessions available in the pool.')
        return random.choice(list(state.sessions.values()))

    def _remove_retired_sessions(self) -> None:
        """Remove all sessions from the pool that are no longer usable."""
        state = self._state.current_value
        state.sessions = {session.id: session for session in state.sessions.values() if session.is_usable}


================================================
FILE: src/crawlee/sessions/py.typed
================================================


================================================
FILE: src/crawlee/statistics/__init__.py
================================================
from ._models import FinalStatistics, StatisticsState
from ._statistics import Statistics

__all__ = ['FinalStatistics', 'Statistics', 'StatisticsState']


================================================
FILE: src/crawlee/statistics/_error_snapshotter.py
================================================
from __future__ import annotations

import asyncio
import hashlib
import re
import string
from typing import TYPE_CHECKING

from crawlee.storages import KeyValueStore

if TYPE_CHECKING:
    from crawlee._types import BasicCrawlingContext


class ErrorSnapshotter:
    """Store page snapshots (HTML and/or screenshot) taken at the moment a request handler failed."""

    MAX_ERROR_CHARACTERS = 30
    MAX_HASH_LENGTH = 30
    MAX_FILENAME_LENGTH = 250
    BASE_MESSAGE = 'An error occurred'
    SNAPSHOT_PREFIX = 'ERROR_SNAPSHOT'
    ALLOWED_CHARACTERS = string.ascii_letters + string.digits + '!-_.'

    def __init__(self, *, snapshot_kvs_name: str | None = None) -> None:
        # Name handed to `KeyValueStore.open` whenever a snapshot is captured.
        self._kvs_name = snapshot_kvs_name

    async def capture_snapshot(
        self,
        error_message: str,
        file_and_line: str,
        context: BasicCrawlingContext,
    ) -> None:
        """Take a snapshot from the context and write it straight into a key value store.

        The store is opened directly instead of going through `context.get_key_value_store`. That accessor
        returns `KeyValueStoreChangeRecords`, which are committed only when the `RequestHandler` returns without
        an exception, whereas this class is active precisely when the `RequestHandler` fails with one.

        Args:
            error_message: Used in filename of the snapshot.
            file_and_line: Used in filename of the snapshot.
            context: Context that is used to get the snapshot.
        """
        snapshot = await context.get_snapshot()
        if not snapshot:
            return

        kvs = await KeyValueStore.open(name=self._kvs_name)
        base_name = self._get_snapshot_base_name(error_message, file_and_line)

        # Each available artifact is saved in its own task; all of them are awaited together below.
        pending_saves = list[asyncio.Task]()
        if snapshot.html:
            html_save = self._save_html(kvs, snapshot.html, base_name=base_name)
            pending_saves.append(asyncio.create_task(html_save))
        if snapshot.screenshot:
            screenshot_save = self._save_screenshot(kvs, snapshot.screenshot, base_name=base_name)
            pending_saves.append(asyncio.create_task(screenshot_save))

        await asyncio.gather(*pending_saves)

    async def _save_html(self, kvs: KeyValueStore, html: str, base_name: str) -> None:
        """Write the HTML under `<base_name>.html` with the `text/html` content type."""
        await kvs.set_value(f'{base_name}.html', html, content_type='text/html')

    async def _save_screenshot(self, kvs: KeyValueStore, screenshot: bytes, base_name: str) -> None:
        """Write the screenshot under `<base_name>.jpg` with the `image/jpeg` content type."""
        await kvs.set_value(f'{base_name}.jpg', screenshot, content_type='image/jpeg')

    def _sanitize_filename(self, filename: str) -> str:
        """Cut the name to `MAX_FILENAME_LENGTH` and then drop every character outside `ALLOWED_CHARACTERS`."""
        allowed = self.ALLOWED_CHARACTERS
        truncated = filename[: self.MAX_FILENAME_LENGTH]
        return ''.join(character for character in truncated if character in allowed)

    def _get_snapshot_base_name(self, error_message: str, file_and_line: str) -> str:
        """Compose `<prefix>_<hash of file_and_line>_<start of message>` and sanitize the result."""
        # Collisions related attacks are of no concern here.
        location_digest = hashlib.sha1(file_and_line.encode('utf-8')).hexdigest()  # noqa:S324
        short_digest = location_digest[: self.MAX_HASH_LENGTH]
        # An empty message falls back to the generic base message.
        message_head = (error_message or self.BASE_MESSAGE)[: self.MAX_ERROR_CHARACTERS]
        return self._sanitize_filename(f'{self.SNAPSHOT_PREFIX}_{short_digest}_{message_head}')


================================================
FILE: src/crawlee/statistics/_error_tracker.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/utils/src/internals/error_tracker.ts

from __future__ import annotations

import traceback
from collections import Counter, defaultdict
from itertools import zip_longest
from logging import getLogger
from typing import TYPE_CHECKING

from crawlee.statistics._error_snapshotter import ErrorSnapshotter

if TYPE_CHECKING:
    from crawlee._types import BasicCrawlingContext

# Key type used at every level of the error grouping; e.g. the error-name key is `None` when
# `ErrorTracker.show_error_name` is turned off.
GroupName = str | None
# Nested grouping used by `ErrorTracker`: file-and-line -> error class name -> Counter keyed by message.
ErrorFilenameGroups = dict[GroupName, dict[GroupName, Counter[GroupName]]]


# Module logger; used to report failures while collecting error snapshots.
logger = getLogger(__name__)


class ErrorTracker:
    """Track errors and aggregates their counts by similarity.

    Errors are grouped on three levels: file-and-line of the innermost traceback frame, exception class name,
    and error message. The first two levels require an exact match; messages are merged when they are similar
    enough, with the differing words replaced by `***`.
    """

    def __init__(
        self,
        *,
        snapshot_kvs_name: str | None = None,
        show_error_name: bool = True,
        show_file_and_line_number: bool = True,
        show_error_message: bool = True,
        show_full_message: bool = False,
        save_error_snapshots: bool = False,
    ) -> None:
        """Initialize a new instance.

        Args:
            snapshot_kvs_name: Name passed to the `ErrorSnapshotter` (only used with `save_error_snapshots`).
            show_error_name: Whether the exception class name is used as a grouping level.
            show_file_and_line_number: Whether the file and line of the innermost frame is a grouping level.
            show_error_message: Whether the error message is used as a grouping level.
            show_full_message: Use the whole message instead of just its first line.
            save_error_snapshots: Whether an `ErrorSnapshotter` is created to store snapshots of new errors.

        Raises:
            ValueError: If `show_full_message` is set while `show_error_message` is not.
        """
        self.error_snapshotter = ErrorSnapshotter(snapshot_kvs_name=snapshot_kvs_name) if save_error_snapshots else None
        self.show_error_name = show_error_name
        self.show_file_and_line_number = show_file_and_line_number
        self.show_error_message = show_error_message
        if show_full_message and not show_error_message:
            raise ValueError('`show_error_message` must be `True` if `show_full_message` is set to `True`')
        self.show_full_message = show_full_message
        self._errors: ErrorFilenameGroups = defaultdict(lambda: defaultdict(Counter))
        # `id()` values of errors that were already counted through an `early=True` call.
        self._early_reported_errors = set[int]()

    async def add(
        self,
        error: Exception,
        *,
        context: BasicCrawlingContext | None = None,
        early: bool = False,
    ) -> None:
        """Add an error in the statistics.

        Args:
            error: Error to be added to statistics.
            context: Context used to collect error snapshot.
            early: Flag indicating that the error is added earlier than usual to have access to resources that will be
             closed before normal error collection. This prevents double reporting during normal error collection.
        """
        if id(error) in self._early_reported_errors:
            # Error had to be collected earlier before relevant resources are closed.
            self._early_reported_errors.remove(id(error))
            return

        if early:
            self._early_reported_errors.add(id(error))

        error_group_name = error.__class__.__name__ if self.show_error_name else None
        error_group_message = self._get_error_message(error)
        new_error_group_message = ''  # In case of wildcard similarity match
        error_group_file_and_line = self._get_file_and_line(error)

        # First two levels are grouped only in case of exact match.
        specific_groups = self._errors[error_group_file_and_line][error_group_name]

        # Lowest level group is matched by similarity.
        if error_group_message in specific_groups:
            # Exact match.
            specific_groups.update([error_group_message])
        else:
            for existing_error_group_message in specific_groups:
                # Add to first group with similar text. Modify text with wildcard characters if necessary.
                if new_error_group_message := self._create_generic_message(
                    existing_error_group_message, error_group_message
                ):
                    # Replace old name. The loop is left right after, so mutating the mapping here is safe.
                    specific_groups[new_error_group_message] = specific_groups.pop(existing_error_group_message)
                    # Increment.
                    specific_groups.update([new_error_group_message])
                    break
            else:
                # No similar message found. Create new group.
                self._errors[error_group_file_and_line][error_group_name].update([error_group_message])

        if (
            self._errors[error_group_file_and_line][error_group_name][new_error_group_message or error_group_message]
            == 1
            and context is not None
        ):
            # Save snapshot only on the first occurrence of the error and only if context and kvs was passed as well.
            await self._capture_error_snapshot(
                error_message=new_error_group_message or error_group_message,
                file_and_line=error_group_file_and_line,
                context=context,
            )

    async def _capture_error_snapshot(
        self, error_message: str, file_and_line: str, context: BasicCrawlingContext
    ) -> None:
        """Capture a snapshot if a snapshotter is configured; failures are logged, never raised."""
        if self.error_snapshotter:
            try:
                await self.error_snapshotter.capture_snapshot(
                    error_message=error_message, file_and_line=file_and_line, context=context
                )
            except Exception:
                logger.exception(f'Error when trying to collect error snapshot for exception: {error_message}')

    def _get_file_and_line(self, error: Exception) -> str:
        """Return `<file name>:<line number>` of the innermost traceback frame.

        An empty string is returned when this grouping level is disabled, or when the error carries no
        traceback frames (for example an exception instance that was created but never raised).
        """
        if not self.show_file_and_line_number:
            return ''

        error_traceback = traceback.extract_tb(error.__traceback__)
        if not error_traceback:
            # No frames available; indexing the summary would raise `IndexError`.
            return ''

        # Show only the most specific frame.
        innermost_frame = error_traceback[-1]
        # Keep just the file name, regardless of whether the path uses `/` or `\` separators.
        file_name = innermost_frame.filename.replace('\\', '/').rsplit('/', 1)[-1]
        return f'{file_name}:{innermost_frame.lineno}'

    def _get_error_message(self, error: Exception) -> str:
        """Return the message used for grouping, or an empty string when messages are disabled.

        The first positional argument of the error is used; without arguments the error's `__context__` is
        used, and if that is empty as well, the exception class name. Unless `show_full_message` is set, only
        the first line is returned.
        """
        if self.show_error_message:
            error_content = error.args[0] if error.args else error.__context__
            error_content = str(error_content) if error_content else error.__class__.__name__
            if self.show_full_message:
                return error_content
            return error_content.split('\n')[0]
        return ''

    @property
    def unique_error_count(self) -> int:
        """Number of distinct kinds of errors."""
        unique_error_count = 0
        for file_and_line_group in self._errors.values():
            for name_group in file_and_line_group.values():
                unique_error_count += len(name_group)
        return unique_error_count

    @property
    def total(self) -> int:
        """Total number of errors."""
        error_count = 0
        for file_and_line_group in self._errors.values():
            for name_group in file_and_line_group.values():
                error_count += sum(name_group.values())
        return error_count

    def get_most_common_errors(self, n: int = 3) -> list[tuple[str | None, int]]:
        """Return n most common errors."""
        all_errors: Counter[GroupName] = Counter()
        for file_and_line_group_name, file_and_line_group in self._errors.items():
            for name_group_name, name_group in file_and_line_group.items():
                for message_group_name, count in name_group.items():
                    error_repr = self._get_error_repr(file_and_line_group_name, name_group_name, message_group_name)
                    # Accumulate so that groups rendering to the same text do not overwrite each other.
                    all_errors[error_repr] += count
        return all_errors.most_common(n)

    def _get_error_repr(self, file_and_line: str | None, name: str | None, message: str | None) -> str:
        """Get the most specific error representation."""
        file_and_line_part = f'{file_and_line}:' if file_and_line else ''
        name_part = f'{name}:' if name else ''
        message_part = f'{message}' if message else ''
        return f'{file_and_line_part}{name_part}{message_part}'

    @staticmethod
    def _create_generic_message(message_1: str | None, message_2: str | None) -> str:
        """Create a generic error message from two messages, if they are similar enough.

        Different parts of similar messages are replaced by `***`. An empty string is returned when either
        message is `None` or when the messages differ in at least half of the words of the shorter one.
        """
        if message_1 is None or message_2 is None:
            return ''

        replacement_string = '***'
        replacement_count = 0

        generic_message_parts = []
        message_1_parts = message_1.split(' ')
        message_2_parts = message_2.split(' ')
        parts_count = min(len(message_1_parts), len(message_2_parts))

        # Words missing in the shorter message are compared against an empty string, i.e. count as different.
        for message_1_part, message_2_part in zip_longest(message_1_parts, message_2_parts, fillvalue=''):
            if message_1_part != message_2_part:
                generic_message_parts.append(replacement_string)
                replacement_count += 1
                if replacement_count >= parts_count / 2:
                    # Messages are too different.
                    return ''
            else:
                generic_message_parts.append(message_1_part)
        return ' '.join(generic_message_parts)


================================================
FILE: src/crawlee/statistics/_models.py
================================================
from __future__ import annotations

import json
import warnings
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Annotated, Any

from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field
from typing_extensions import override

from crawlee._utils.console import make_table
from crawlee._utils.docs import docs_group
from crawlee._utils.models import timedelta_ms
from crawlee._utils.time import format_duration

# Width passed to `make_table` when `FinalStatistics.to_table()` renders the statistics.
_STATISTICS_TABLE_WIDTH = 100


@dataclass(frozen=True)
@docs_group('Statistics')
class FinalStatistics:
    """Statistics about a crawler run.

    Instances are frozen; the helpers below only convert the stored values into other representations.
    """

    requests_finished: int
    requests_failed: int
    retry_histogram: list[int]
    request_avg_failed_duration: timedelta | None
    request_avg_finished_duration: timedelta | None
    requests_finished_per_minute: float
    requests_failed_per_minute: float
    request_total_duration: timedelta
    requests_total: int
    crawler_runtime: timedelta

    def to_table(self) -> str:
        """Return the statistics rendered as a table, one row per field.

        `timedelta` values are passed through `format_duration`; every other value is shown via `str()`.
        """
        rows = []
        for field_name, field_value in asdict(self).items():
            shown_value = format_duration(field_value) if isinstance(field_value, timedelta) else field_value
            rows.append((str(field_name), str(shown_value)))
        return make_table(rows, width=_STATISTICS_TABLE_WIDTH)

    def to_dict(self) -> dict[str, float | int | list[int]]:
        """Return the fields as a plain dictionary with `timedelta` values converted to seconds."""
        plain_values = {}
        for field_name, field_value in asdict(self).items():
            if isinstance(field_value, timedelta):
                plain_values[field_name] = field_value.total_seconds()
            else:
                plain_values[field_name] = field_value
        return plain_values

    @override
    def __str__(self) -> str:
        # Same mapping as `to_dict()`, serialized as JSON.
        return json.dumps(self.to_dict())


@docs_group('Statistics')
class StatisticsState(BaseModel):
    """Statistic data about a crawler run."""

    # Fields may be populated either by their Python name or by their camelCase alias.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')
    stats_id: Annotated[int | None, Field(alias='statsId')] = None

    # Request counters and rates.
    requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0
    requests_failed: Annotated[int, Field(alias='requestsFailed')] = 0
    requests_retries: Annotated[int, Field(alias='requestsRetries')] = 0
    requests_failed_per_minute: Annotated[float, Field(alias='requestsFailedPerMinute')] = 0
    requests_finished_per_minute: Annotated[float, Field(alias='requestsFinishedPerMinute')] = 0
    # Request durations; the aliases carry a `Millis` suffix (presumably `timedelta_ms` (de)serializes
    # them as milliseconds - confirm in `crawlee._utils.models`).
    request_min_duration: Annotated[timedelta_ms | None, Field(alias='requestMinDurationMillis')] = None
    request_max_duration: Annotated[timedelta_ms | None, Field(alias='requestMaxDurationMillis')] = None
    request_total_failed_duration: Annotated[timedelta_ms, Field(alias='requestTotalFailedDurationMillis')] = (
        timedelta()
    )
    request_total_finished_duration: Annotated[timedelta_ms, Field(alias='requestTotalFinishedDurationMillis')] = (
        timedelta()
    )
    # Timestamps of the crawler lifecycle. `crawler_last_started_at` is the one used for runtime computation.
    crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
    crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
    crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None

    # Workaround for Pydantic and type checkers when using Annotated with default_factory
    if TYPE_CHECKING:
        errors: dict[str, Any] = {}
        retry_errors: dict[str, Any] = {}
        requests_with_status_code: dict[str, int] = {}
    else:
        errors: Annotated[dict[str, Any], Field(default_factory=dict)]
        retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)]
        requests_with_status_code: Annotated[
            dict[str, int],
            Field(alias='requestsWithStatusCode', default_factory=dict),
        ]

    # The serializer ignores the stored value and always emits the current UTC time.
    stats_persisted_at: Annotated[
        datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc))
    ] = None
    # Kept in memory as {retry count: number of requests}. Validation turns the incoming list into such a
    # mapping (index -> value); serialization turns it back into a dense list, using 0 for missing counts.
    request_retry_histogram: Annotated[
        dict[int, int],
        Field(alias='requestRetryHistogram'),
        PlainValidator(lambda value: dict(enumerate(value)), json_schema_input_type=list[int]),
        PlainSerializer(
            lambda value: [value.get(i, 0) for i in range(max(value.keys(), default=0) + 1)],
            return_type=list[int],
        ),
    ] = {}

    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()

    def model_post_init(self, /, __context: Any) -> None:
        # Take over the runtime computed right after validation as the offset; keep the current offset when
        # that runtime is zero.
        self._runtime_offset = self.crawler_runtime or self._runtime_offset

    @property
    def crawler_runtime(self) -> timedelta:
        # With a known last start, the runtime is the offset plus the time elapsed since that start, measured
        # up to `crawler_finished_at` or, if that is not set, up to now. Otherwise it is just the offset.
        if self.crawler_last_started_at:
            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
            return self._runtime_offset + finished_at - self.crawler_last_started_at
        return self._runtime_offset

    @crawler_runtime.setter
    def crawler_runtime(self, value: timedelta) -> None:
        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and can't be set
        # manually. The passed value is discarded; only a deprecation warning is emitted.
        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
        warnings.warn(
            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
            f' Value {value} will not be used.',
            DeprecationWarning,
            stacklevel=2,
        )

    # Serialized counterpart of `crawler_runtime`, exposed under the `crawlerRuntimeMillis` alias; the
    # computation mirrors the `crawler_runtime` property.
    @computed_field(alias='crawlerRuntimeMillis')
    def crawler_runtime_for_serialization(self) -> timedelta:
        if self.crawler_last_started_at:
            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
            return self._runtime_offset + finished_at - self.crawler_last_started_at
        return self._runtime_offset

    # Sum of the finished and failed request durations.
    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)
    @property
    def request_total_duration(self) -> timedelta:
        return self.request_total_finished_duration + self.request_total_failed_duration

    # Average duration of a failed request; `None` while no request has failed (avoids division by zero).
    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
    @property
    def request_avg_failed_duration(self) -> timedelta | None:
        return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None

    # Average duration of a finished request; `None` while no request has finished (avoids division by zero).
    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
    @property
    def request_avg_finished_duration(self) -> timedelta | None:
        return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None

    # Number of requests that either failed or finished.
    @computed_field(alias='requestsTotal')
    @property
    def requests_total(self) -> int:
        return self.requests_failed + self.requests_finished


================================================
FILE: src/crawlee/statistics/_statistics.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
from __future__ import annotations

import asyncio
import math
import time
from datetime import datetime, timedelta, timezone
from logging import Logger, getLogger
from typing import TYPE_CHECKING, Generic, Literal

from typing_extensions import Self, TypeVar

from crawlee._utils.context import ensure_context
from crawlee._utils.docs import docs_group
from crawlee._utils.recoverable_state import RecoverableState
from crawlee._utils.recurring_task import RecurringTask
from crawlee.statistics import FinalStatistics, StatisticsState
from crawlee.statistics._error_tracker import ErrorTracker

if TYPE_CHECKING:
    from collections.abc import Callable, Coroutine
    from types import TracebackType

    from crawlee.storages import KeyValueStore

# State model type a `Statistics` instance is parametrized with; defaults to `StatisticsState`.
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
# State model type of the instance returned by `Statistics.replace_state_model`.
TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
# Module logger; also the fallback for periodic statistics messages when no dedicated logger is passed.
logger = getLogger(__name__)


class RequestProcessingRecord:
    """Bookkeeping for one request: how many times it was started and how long its last run took."""

    def __init__(self) -> None:
        self.duration: timedelta | None = None
        self._runs = 0
        self._last_run_at_ns: int | None = None

    def run(self) -> int:
        """Register a new run, remember when it began, and return the number of runs so far."""
        self._runs += 1
        self._last_run_at_ns = time.perf_counter_ns()
        return self._runs

    def finish(self) -> timedelta:
        """Store and return the time elapsed since the most recent `run()`.

        Raises:
            RuntimeError: If `run()` has not been called yet.
        """
        started_at_ns = self._last_run_at_ns
        if started_at_ns is None:
            raise RuntimeError('Invalid state')

        elapsed_ns = time.perf_counter_ns() - started_at_ns
        # Nanoseconds are rounded up to whole microseconds.
        self.duration = timedelta(microseconds=math.ceil(elapsed_ns / 1000))
        return self.duration

    @property
    def retry_count(self) -> int:
        """Number of runs beyond the first one (never negative)."""
        return self._runs - 1 if self._runs > 1 else 0


@docs_group('Statistics')
class Statistics(Generic[TStatisticsState]):
    """A class for collecting, tracking, and logging runtime statistics for requests.

    It is designed to record information such as request durations, retries, successes, and failures, enabling
    analysis of crawler performance. The collected statistics are persisted to a `KeyValueStore`, ensuring they
    remain available across crawler migrations, abortions, and restarts. This persistence allows for tracking
    and evaluation of crawler behavior over its lifecycle.
    """

    __next_id = 0

    def __init__(
        self,
        *,
        persistence_enabled: bool | Literal['explicit_only'] = False,
        persist_state_kvs_name: str | None = None,
        persist_state_key: str | None = None,
        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
        log_message: str = 'Statistics',
        periodic_message_logger: Logger | None = None,
        log_interval: timedelta = timedelta(minutes=1),
        state_model: type[TStatisticsState],
        statistics_log_format: Literal['table', 'inline'] = 'table',
        save_error_snapshots: bool = False,
    ) -> None:
        """Initialize a new instance.

        Args:
            persistence_enabled: Forwarded to the underlying `RecoverableState`.
            persist_state_kvs_name: Forwarded to the underlying `RecoverableState`; also used as the name of
                the key-value store for error snapshots.
            persist_state_key: Key of the persisted state; defaults to `__CRAWLER_STATISTICS_<id>`.
            persist_state_kvs_factory: Forwarded to the underlying `RecoverableState`.
            log_message: Message that accompanies the periodically logged statistics.
            periodic_message_logger: Logger for the periodic messages; the module logger is used if omitted.
            log_interval: Interval handed to the recurring task that logs the statistics.
            state_model: Model class of the state; instantiated with `stats_id` to form the default state.
            statistics_log_format: `'table'` logs a rendered table, `'inline'` passes the values as log extras.
            save_error_snapshots: Whether the main error tracker stores snapshots of newly seen errors.
        """
        self._id = Statistics.__next_id
        Statistics.__next_id += 1

        # Remembered so that `reset()` can rebuild the error tracker with the same configuration.
        self._save_error_snapshots = save_error_snapshots
        self._snapshot_kvs_name = persist_state_kvs_name

        self.error_tracker = ErrorTracker(
            save_error_snapshots=save_error_snapshots,
            snapshot_kvs_name=persist_state_kvs_name,
        )
        self.error_tracker_retry = ErrorTracker(save_error_snapshots=False)

        self._requests_in_progress = dict[str, RequestProcessingRecord]()

        self._state = RecoverableState(
            default_state=state_model(stats_id=self._id),
            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
            persistence_enabled=persistence_enabled,
            persist_state_kvs_name=persist_state_kvs_name,
            persist_state_kvs_factory=persist_state_kvs_factory,
            logger=logger,
        )

        self._log_message = log_message
        self._statistics_log_format = statistics_log_format
        self._periodic_message_logger = periodic_message_logger or logger
        self._periodic_logger = RecurringTask(self._log, log_interval)

        # Flag to indicate the context state.
        self._active = False

    def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
        """Create near copy of the `Statistics` with replaced `state_model`.

        The persistence settings, log message, logger and log format are carried over, and the periodic logging
        task object is shared with this instance.
        """
        # NOTE(review): the error-snapshot settings are not carried over to the copy - confirm this is intended.
        new_statistics: Statistics[TNewStatisticsState] = Statistics(
            persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
            persist_state_key=self._state._persist_state_key,  # noqa: SLF001
            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
            log_message=self._log_message,
            periodic_message_logger=self._periodic_message_logger,
            state_model=state_model,
            statistics_log_format=self._statistics_log_format,
        )
        new_statistics._periodic_logger = self._periodic_logger  # Accessing private member to create copy like-object.
        return new_statistics

    @staticmethod
    def with_default_state(
        *,
        persistence_enabled: bool | Literal['explicit_only'] = False,
        persist_state_kvs_name: str | None = None,
        persist_state_key: str | None = None,
        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
        log_message: str = 'Statistics',
        periodic_message_logger: Logger | None = None,
        log_interval: timedelta = timedelta(minutes=1),
        statistics_log_format: Literal['table', 'inline'] = 'table',
        save_error_snapshots: bool = False,
    ) -> Statistics[StatisticsState]:
        """Initialize a new instance with default state model `StatisticsState`.

        All arguments are passed unchanged to the constructor.
        """
        return Statistics[StatisticsState](
            persistence_enabled=persistence_enabled,
            persist_state_kvs_name=persist_state_kvs_name,
            persist_state_key=persist_state_key,
            persist_state_kvs_factory=persist_state_kvs_factory,
            log_message=log_message,
            periodic_message_logger=periodic_message_logger,
            log_interval=log_interval,
            state_model=StatisticsState,
            statistics_log_format=statistics_log_format,
            save_error_snapshots=save_error_snapshots,
        )

    @property
    def active(self) -> bool:
        """Indicate whether the context is active."""
        return self._active

    async def __aenter__(self) -> Self:
        """Subscribe to events and start collecting statistics.

        Raises:
            RuntimeError: If the context manager is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        await self._state.initialize()
        # Reset `crawler_finished_at` to indicate a new run in progress.
        self.state.crawler_finished_at = None

        # Start periodic logging and let it print initial state before activation.
        self._periodic_logger.start()
        await asyncio.sleep(0.01)
        self._active = True

        self.state.crawler_last_started_at = datetime.now(timezone.utc)
        # The very first start time is kept across runs; only the last start time is refreshed.
        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Stop collecting statistics.

        Raises:
            RuntimeError: If the context manager is not active, or if the last start time was never set.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        if not self.state.crawler_last_started_at:
            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')

        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
        await self._periodic_logger.stop()
        self.state.crawler_finished_at = datetime.now(timezone.utc)
        self._active = False
        await self._state.teardown()

    @property
    def state(self) -> TStatisticsState:
        """Current value of the underlying recoverable state."""
        return self._state.current_value

    @ensure_context
    def register_status_code(self, code: int) -> None:
        """Increment the number of times a status code has been received."""
        state = self._state.current_value
        state.requests_with_status_code.setdefault(str(code), 0)
        state.requests_with_status_code[str(code)] += 1

    @ensure_context
    def record_request_processing_start(self, request_id_or_key: str) -> None:
        """Mark a request as started."""
        # An already tracked request keeps its record, so repeated starts count as retries.
        record = self._requests_in_progress.get(request_id_or_key, RequestProcessingRecord())
        record.run()
        self._requests_in_progress[request_id_or_key] = record

    @ensure_context
    def record_request_processing_finish(self, request_id_or_key: str) -> None:
        """Mark a request as finished.

        Requests that were never marked as started are ignored.
        """
        record = self._requests_in_progress.get(request_id_or_key)
        if record is None:
            return

        state = self._state.current_value
        duration = record.finish()

        state.requests_finished += 1
        state.request_total_finished_duration += duration
        self._save_retry_count_for_request(record)
        state.request_min_duration = min(
            state.request_min_duration if state.request_min_duration is not None else timedelta.max, duration
        )
        state.request_max_duration = max(
            state.request_max_duration if state.request_max_duration is not None else timedelta(), duration
        )

        del self._requests_in_progress[request_id_or_key]

    @ensure_context
    def record_request_processing_failure(self, request_id_or_key: str) -> None:
        """Mark a request as failed.

        Requests that were never marked as started are ignored.
        """
        record = self._requests_in_progress.get(request_id_or_key)
        if record is None:
            return

        state = self._state.current_value

        state.request_total_failed_duration += record.finish()
        state.requests_failed += 1
        self._save_retry_count_for_request(record)

        del self._requests_in_progress[request_id_or_key]

    def calculate(self) -> FinalStatistics:
        """Calculate the current statistics."""
        total_minutes = self.state.crawler_runtime.total_seconds() / 60
        state = self._state.current_value
        # The dump is needed for the retry histogram, which is serialized from a mapping into a list.
        serialized_state = state.model_dump(by_alias=False)

        return FinalStatistics(
            request_avg_failed_duration=state.request_avg_failed_duration,
            request_avg_finished_duration=state.request_avg_finished_duration,
            # Rates are reported as 0 while no runtime has elapsed, to avoid dividing by zero.
            requests_finished_per_minute=round(state.requests_finished / total_minutes) if total_minutes else 0,
            requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
            request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
            requests_total=state.requests_failed + state.requests_finished,
            crawler_runtime=state.crawler_runtime,
            requests_finished=state.requests_finished,
            requests_failed=state.requests_failed,
            retry_histogram=serialized_state['request_retry_histogram'],
        )

    async def reset(self) -> None:
        """Reset the statistics to their defaults and remove any persistent state.

        The error trackers are replaced by empty ones that keep the error-snapshot configuration this instance
        was created with.
        """
        await self._state.reset()
        self.error_tracker = ErrorTracker(
            save_error_snapshots=self._save_error_snapshots,
            snapshot_kvs_name=self._snapshot_kvs_name,
        )
        self.error_tracker_retry = ErrorTracker(save_error_snapshots=False)
        self._requests_in_progress.clear()

    def _log(self) -> None:
        """Log the current statistics in the configured format."""
        stats = self.calculate()
        if self._statistics_log_format == 'table':
            self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}')
        else:
            self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())

    def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
        """Update the retry counter and the retry histogram with the retry count of a completed request."""
        retry_count = record.retry_count
        state = self._state.current_value

        if retry_count:
            state.requests_retries += 1

        state.request_retry_histogram.setdefault(retry_count, 0)
        state.request_retry_histogram[retry_count] += 1


================================================
FILE: src/crawlee/storage_clients/__init__.py
================================================
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

# These imports have only mandatory dependencies, so they are imported directly.
from ._base import StorageClient
from ._file_system import FileSystemStorageClient
from ._memory import MemoryStorageClient

_install_import_hook(__name__)

# The following imports are wrapped in try_import to handle optional dependencies,
# ensuring the module can still function even if these dependencies are missing.
with _try_import(__name__, 'SqlStorageClient'):
    from ._sql import SqlStorageClient

with _try_import(__name__, 'RedisStorageClient'):
    from ._redis import RedisStorageClient

__all__ = [
    'FileSystemStorageClient',
    'MemoryStorageClient',
    'RedisStorageClient',
    'SqlStorageClient',
    'StorageClient',
]


================================================
FILE: src/crawlee/storage_clients/_base/__init__.py
================================================
from ._dataset_client import DatasetClient
from ._key_value_store_client import KeyValueStoreClient
from ._request_queue_client import RequestQueueClient
from ._storage_client import StorageClient

__all__ = [
    'DatasetClient',
    'KeyValueStoreClient',
    'RequestQueueClient',
    'StorageClient',
]


================================================
FILE: src/crawlee/storage_clients/_base/_dataset_client.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import AsyncIterator
    from typing import Any

    from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata


class DatasetClient(ABC):
    """An abstract class for dataset storage clients.

    Dataset clients provide an interface for accessing and manipulating dataset storage. They handle
    operations like adding and getting dataset items across different storage backends.

    Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,
    `RequestQueue`), and can operate with various storage systems including memory, file system,
    databases, and cloud storage solutions.

    This abstract class defines the interface that all specific dataset clients must implement.
    """

    @abstractmethod
    async def get_metadata(self) -> DatasetMetadata:
        """Get the metadata of the dataset.

        Returns:
            The metadata object describing the dataset.
        """

    @abstractmethod
    async def drop(self) -> None:
        """Drop the whole dataset and remove all its items.

        The backend method for the `Dataset.drop` call.
        """

    @abstractmethod
    async def purge(self) -> None:
        """Purge all items from the dataset.

        The backend method for the `Dataset.purge` call.
        """

    @abstractmethod
    async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
        """Push data to the dataset.

        The backend method for the `Dataset.push_data` call.

        Args:
            data: Either a single item (a dictionary) or a list of items to store.
        """

    @abstractmethod
    async def get_data(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> DatasetItemsListPage:
        """Get data from the dataset with various filtering options.

        The backend method for the `Dataset.get_data` call.

        Not every backend has to honor every option. For example, the file system client shipped in this
        package only applies `offset`, `limit`, `desc` and `skip_empty` and logs a warning for the rest.

        Args:
            offset: The number of items to skip from the start of the (possibly reversed) result.
            limit: The maximum number of items to return. Note that the default here is a very large
                number rather than `None`, unlike in `iterate_items`.
            clean: Backend-specific cleaning switch; see the concrete client for its exact meaning.
            desc: Whether to return the items in reverse order.
            fields: Names of the fields to keep in each item (backend-specific).
            omit: Names of the fields to drop from each item (backend-specific).
            unwind: Names of the fields to unwind (backend-specific).
            skip_empty: Whether to leave out empty items.
            skip_hidden: Whether to leave out hidden fields (backend-specific).
            flatten: Names of the fields to flatten (backend-specific).
            view: Name of a predefined view of the dataset (backend-specific).

        Returns:
            A page object holding the selected items together with pagination information.
        """

    @abstractmethod
    async def iterate_items(
        self,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
    ) -> AsyncIterator[dict[str, Any]]:
        """Iterate over the dataset items with filtering options.

        The backend method for the `Dataset.iterate_items` call.

        The options mirror those of `get_data`, except that `flatten` and `view` are not available and
        `limit` defaults to `None` (no limit). As with `get_data`, a backend may support only a subset
        of them.

        Args:
            offset: The number of items to skip from the start of the (possibly reversed) result.
            limit: The maximum number of items to yield, or `None` for no limit.
            clean: Backend-specific cleaning switch; see the concrete client for its exact meaning.
            desc: Whether to yield the items in reverse order.
            fields: Names of the fields to keep in each item (backend-specific).
            omit: Names of the fields to drop from each item (backend-specific).
            unwind: Names of the fields to unwind (backend-specific).
            skip_empty: Whether to leave out empty items.
            skip_hidden: Whether to leave out hidden fields (backend-specific).

        Yields:
            The individual dataset items.
        """
        # This syntax is to make type checker properly work with abstract AsyncIterator.
        # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
        # The unreachable `yield` below turns this function into an async generator, so that its type
        # matches the overriding implementations.
        raise NotImplementedError
        if False:
            yield 0


================================================
FILE: src/crawlee/storage_clients/_base/_key_value_store_client.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata


class KeyValueStoreClient(ABC):
    """An abstract class for key-value store (KVS) storage clients.

    Key-value store clients provide an interface for accessing and manipulating KVS storage. They handle
    operations like getting, setting, and deleting KVS values across different storage backends.

    Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,
    `RequestQueue`), and can operate with various storage systems including memory, file system,
    databases, and cloud storage solutions.

    This abstract class defines the interface that all specific KVS clients must implement.
    """

    @abstractmethod
    async def get_metadata(self) -> KeyValueStoreMetadata:
        """Get the metadata of the key-value store.

        Returns:
            The metadata object describing the key-value store.
        """

    @abstractmethod
    async def drop(self) -> None:
        """Drop the whole key-value store and remove all its values.

        The backend method for the `KeyValueStore.drop` call.
        """

    @abstractmethod
    async def purge(self) -> None:
        """Purge all items from the key-value store.

        The backend method for the `KeyValueStore.purge` call.
        """

    @abstractmethod
    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
        """Retrieve the given record from the key-value store.

        The backend method for the `KeyValueStore.get_value` call.

        Args:
            key: The key of the record to retrieve.

        Returns:
            The record, or `None`. NOTE(review): `None` presumably means that no record is stored under
            the key; confirm against the concrete clients.
        """

    @abstractmethod
    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
        """Set a value in the key-value store by its key.

        The backend method for the `KeyValueStore.set_value` call.

        Args:
            key: The key under which the value is stored.
            value: The value to store.
            content_type: The content type of the value. NOTE(review): when `None`, the concrete client
                presumably picks or infers one; confirm against the implementations.
        """

    @abstractmethod
    async def delete_value(self, *, key: str) -> None:
        """Delete a value from the key-value store by its key.

        The backend method for the `KeyValueStore.delete_value` call.

        Args:
            key: The key of the value to delete.
        """

    @abstractmethod
    async def iterate_keys(
        self,
        *,
        exclusive_start_key: str | None = None,
        limit: int | None = None,
    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
        """Iterate over all the existing keys in the key-value store.

        The backend method for the `KeyValueStore.iterate_keys` call.

        Args:
            exclusive_start_key: NOTE(review): judging by the name, iteration starts after this key and
                the key itself is not included; confirm against the concrete clients.
            limit: The maximum number of keys to yield, or `None` for no limit.

        Yields:
            The metadata of the individual records.
        """
        # This syntax is to make type checker properly work with abstract AsyncIterator.
        # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
        # The unreachable `yield` below turns this function into an async generator, so that its type
        # matches the overriding implementations.
        raise NotImplementedError
        if False:
            yield 0

    @abstractmethod
    async def get_public_url(self, *, key: str) -> str:
        """Get the public URL for the given key.

        The backend method for the `KeyValueStore.get_public_url` call.

        Args:
            key: The key of the record to get the public URL for.

        Returns:
            The public URL of the record.
        """

    @abstractmethod
    async def record_exists(self, *, key: str) -> bool:
        """Check if a record with the given key exists in the key-value store.

        The backend method for the `KeyValueStore.record_exists` call.

        Args:
            key: The key to check for existence.

        Returns:
            True if a record with the given key exists, False otherwise.
        """


================================================
FILE: src/crawlee/storage_clients/_base/_request_queue_client.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Sequence

    from crawlee import Request
    from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata


class RequestQueueClient(ABC):
    """An abstract class for request queue resource clients.

    These clients are specific to the type of resource they manage and operate under a designated storage
    client, like a memory storage client.

    This abstract class defines the interface that all specific request queue clients must implement.
    """

    @abstractmethod
    async def get_metadata(self) -> RequestQueueMetadata:
        """Get the metadata of the request queue.

        Returns:
            The metadata object describing the request queue.
        """

    @abstractmethod
    async def drop(self) -> None:
        """Drop the whole request queue and remove all its values.

        The backend method for the `RequestQueue.drop` call.
        """

    @abstractmethod
    async def purge(self) -> None:
        """Purge all items from the request queue.

        The backend method for the `RequestQueue.purge` call.
        """

    @abstractmethod
    async def add_batch_of_requests(
        self,
        requests: Sequence[Request],
        *,
        forefront: bool = False,
    ) -> AddRequestsResponse:
        """Add batch of requests to the queue.

        This method adds a batch of requests to the queue. Each request is processed based on its uniqueness
        (determined by `unique_key`). Duplicates will be identified but not re-added to the queue.

        The whole sequence is handled as a single batch; this method takes no batching or waiting
        options.

        Args:
            requests: The collection of requests to add to the queue.
            forefront: Whether to put the added requests at the beginning (True) or the end (False) of the queue.
                When True, the requests will be processed sooner than previously added requests.

        Returns:
            A response object containing information about which requests were successfully
            processed and which failed (if any).
        """

    @abstractmethod
    async def get_request(self, unique_key: str) -> Request | None:
        """Retrieve a request from the queue.

        Args:
            unique_key: Unique key of the request to retrieve.

        Returns:
            The retrieved request, or None, if it did not exist.
        """

    @abstractmethod
    async def fetch_next_request(self) -> Request | None:
        """Return the next request in the queue to be processed.

        Once you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`
        to mark the request as handled in the queue. If there was some error in processing the request, call
        `RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer
        in another call to the `fetch_next_request` method.

        Note that the `None` return value does not mean the queue processing finished, it means there are currently
        no pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`
        instead.

        Returns:
            The request or `None` if there are no more pending requests.
        """

    @abstractmethod
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Mark a request as handled after successful processing.

        Handled requests will never again be returned by the `RequestQueue.fetch_next_request` method.

        Args:
            request: The request to mark as handled.

        Returns:
            Information about the queue operation. `None` if the given request was not in progress.
        """

    @abstractmethod
    async def reclaim_request(
        self,
        request: Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Reclaim a failed request back to the queue.

        The request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.

        Args:
            request: The request to return to the queue.
            forefront: Whether to add the request to the head or the end of the queue.

        Returns:
            Information about the queue operation. `None` if the given request was not in progress.
        """

    @abstractmethod
    async def is_empty(self) -> bool:
        """Check if the request queue is empty.

        Returns:
            True if the request queue is empty, False otherwise.
        """


================================================
FILE: src/crawlee/storage_clients/_base/_storage_client.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from collections.abc import Hashable

    from crawlee.configuration import Configuration

    from ._dataset_client import DatasetClient
    from ._key_value_store_client import KeyValueStoreClient
    from ._request_queue_client import RequestQueueClient


@docs_group('Storage clients')
class StorageClient(ABC):
    """Base class for storage clients.

    A `StorageClient` is the entry point to Crawlee's three storage types: datasets, key-value stores,
    and request queues. For each of them it exposes a factory method that creates the matching
    resource client, and it implements the functionality that is common to all backends.

    Concrete implementations can target various backends (file system, memory, databases, various cloud
    providers, etc.), covering use cases from development to production environments.

    Every implementation is responsible for proper initialization, for data persistence (where
    applicable), and for consistent access patterns across all the storage types it supports.
    """

    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:  # noqa: ARG002
        """Return a cache key that can differentiate between different storages of this and other clients.

        Subclasses may derive the key from the configuration or from the client itself. The default
        implementation ignores the configuration and returns the dotted path of the client class,
        i.e. its module followed by its name.
        """
        client_class = self.__class__
        return f'{client_class.__module__}.{client_class.__name__}'

    @abstractmethod
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> DatasetClient:
        """Create a dataset client.

        Args:
            id: The ID of the dataset to open.
            name: The name of the dataset.
            alias: The alias of the dataset.
            configuration: The configuration to use for the client.

        Returns:
            The dataset client.
        """

    @abstractmethod
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> KeyValueStoreClient:
        """Create a key-value store client.

        Args:
            id: The ID of the key-value store to open.
            name: The name of the key-value store.
            alias: The alias of the key-value store.
            configuration: The configuration to use for the client.

        Returns:
            The key-value store client.
        """

    @abstractmethod
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> RequestQueueClient:
        """Create a request queue client.

        Args:
            id: The ID of the request queue to open.
            name: The name of the request queue.
            alias: The alias of the request queue.
            configuration: The configuration to use for the client.

        Returns:
            The request queue client.
        """

    def get_rate_limit_errors(self) -> dict[int, int]:
        """Return statistics about rate limit errors encountered by the HTTP client in storage client.

        The default implementation has nothing to report and returns an empty dictionary.
        """
        no_errors: dict[int, int] = {}
        return no_errors

    async def _purge_if_needed(
        self,
        client: DatasetClient | KeyValueStoreClient | RequestQueueClient,
        configuration: Configuration,
    ) -> None:
        """Purge the client if needed.

        A purge happens only when the configuration asks for purging on start and the storage has no
        name. Named storages are considered global and will typically outlive the run, so they are
        left alone.

        Args:
            client: The storage client to potentially purge.
            configuration: Configuration that determines whether purging should occur.
        """
        metadata = await client.get_metadata()

        # Bail out unless purging is enabled and the storage is unnamed.
        if not configuration.purge_on_start or metadata.name is not None:
            return

        await client.purge()


================================================
FILE: src/crawlee/storage_clients/_base/py.typed
================================================


================================================
FILE: src/crawlee/storage_clients/_file_system/__init__.py
================================================
from ._dataset_client import FileSystemDatasetClient
from ._key_value_store_client import FileSystemKeyValueStoreClient
from ._request_queue_client import FileSystemRequestQueueClient
from ._storage_client import FileSystemStorageClient

__all__ = [
    'FileSystemDatasetClient',
    'FileSystemKeyValueStoreClient',
    'FileSystemRequestQueueClient',
    'FileSystemStorageClient',
]


================================================
FILE: src/crawlee/storage_clients/_file_system/_dataset_client.py
================================================
from __future__ import annotations

import asyncio
import json
import shutil
from datetime import datetime, timezone
from logging import getLogger
from pathlib import Path
from typing import TYPE_CHECKING, Any

from pydantic import ValidationError
from typing_extensions import Self, override

from crawlee._consts import METADATA_FILENAME
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.file import atomic_write, json_dumps
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import DatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from crawlee.configuration import Configuration

logger = getLogger(__name__)


class FileSystemDatasetClient(DatasetClient):
    """File system implementation of the dataset client.

    This client persists dataset items to the file system as individual JSON files within a structured
    directory hierarchy following the pattern:

    ```
    {STORAGE_DIR}/datasets/{DATASET_ID}/{ITEM_ID}.json
    ```

    Each item is stored as a separate file, which allows for durability and the ability to
    recover after process termination. Dataset operations like filtering, sorting, and pagination are
    implemented by processing the stored files according to the requested parameters.

    This implementation is ideal for long-running crawlers where data persistence is important,
    and for development environments where you want to easily inspect the collected data between runs.
    """

    _STORAGE_SUBDIR = 'datasets'
    """The name of the subdirectory where datasets are stored."""

    _STORAGE_SUBSUBDIR_DEFAULT = 'default'
    """The name of the subdirectory for the default dataset."""

    _ITEM_FILENAME_DIGITS = 9
    """Number of digits used for the dataset item file names (e.g., 000000019.json)."""

    def __init__(
        self,
        *,
        metadata: DatasetMetadata,
        path_to_dataset: Path,
        lock: asyncio.Lock,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `FileSystemDatasetClient.open` class method to create a new instance.

        Args:
            metadata: The metadata of the dataset.
            path_to_dataset: The full path to the dataset directory.
            lock: The lock guarding the file operations of this client.
        """
        self._metadata = metadata

        self._path_to_dataset = path_to_dataset
        """The full path to the dataset directory."""

        self._lock = lock
        """A lock to ensure that only one operation is performed at a time."""

    @override
    async def get_metadata(self) -> DatasetMetadata:
        """Return the in-memory metadata of the dataset."""
        return self._metadata

    @property
    def path_to_dataset(self) -> Path:
        """The full path to the dataset directory."""
        return self._path_to_dataset

    @property
    def path_to_metadata(self) -> Path:
        """The full path to the dataset metadata file."""
        return self.path_to_dataset / METADATA_FILENAME

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        configuration: Configuration,
    ) -> Self:
        """Open or create a file system dataset client.

        This method attempts to open an existing dataset from the file system. If a dataset with the specified ID
        or name exists, it loads the metadata from the stored files. If no existing dataset is found, a new one
        is created.

        Args:
            id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
            name: The name of the dataset for named (global scope) storages.
            alias: The alias of the dataset for unnamed (run scope) storages.
            configuration: The configuration object containing storage directory settings.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If a dataset with the specified ID is not found, if metadata is invalid,
                or if both name and alias are provided.
        """
        # Validate input parameters.
        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

        dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR

        # `exist_ok=True` already covers the case of an existing directory, so no separate (and racy)
        # existence check is needed.
        await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True)

        # Get a new instance by ID.
        if id:
            found = False
            for dataset_dir in dataset_base_path.iterdir():
                if not dataset_dir.is_dir():
                    continue

                path_to_metadata = dataset_dir / METADATA_FILENAME
                if not path_to_metadata.exists():
                    continue

                # Read the whole file up front, so that no handle to it is left open while the very same
                # file is rewritten by `_update_metadata` below.
                try:
                    raw_metadata = await asyncio.to_thread(path_to_metadata.read_text, encoding='utf-8')
                    metadata = DatasetMetadata(**json.loads(raw_metadata))
                except (json.JSONDecodeError, ValidationError):
                    # A directory with unreadable metadata cannot be matched by ID; keep looking.
                    continue

                if metadata.id != id:
                    continue

                client = cls(
                    metadata=metadata,
                    # `iterdir()` yields paths that already include `dataset_base_path`. Joining the two
                    # again would duplicate the prefix whenever the storage directory is relative.
                    path_to_dataset=dataset_dir,
                    lock=asyncio.Lock(),
                )
                await client._update_metadata(update_accessed_at=True)
                found = True
                break

            if not found:
                raise ValueError(f'Dataset with ID "{id}" not found')

        # Get a new instance by name or alias.
        else:
            dataset_dir = Path(name or alias or cls._STORAGE_SUBSUBDIR_DEFAULT)
            path_to_dataset = dataset_base_path / dataset_dir
            path_to_metadata = path_to_dataset / METADATA_FILENAME

            # If the dataset directory exists, reconstruct the client from the metadata file.
            if path_to_dataset.exists() and path_to_metadata.exists():
                raw_metadata = await asyncio.to_thread(path_to_metadata.read_text, encoding='utf-8')
                # A malformed file raises `json.JSONDecodeError`, which is a `ValueError` subclass.
                file_content = json.loads(raw_metadata)
                try:
                    metadata = DatasetMetadata(**file_content)
                except ValidationError as exc:
                    raise ValueError(f'Invalid metadata file for dataset "{name or alias}"') from exc

                client = cls(
                    metadata=metadata,
                    path_to_dataset=path_to_dataset,
                    lock=asyncio.Lock(),
                )

                await client._update_metadata(update_accessed_at=True)

            # Otherwise, create a new dataset client.
            else:
                now = datetime.now(timezone.utc)
                metadata = DatasetMetadata(
                    id=crypto_random_object_id(),
                    # Only named storages carry a name; alias and default storages stay unnamed.
                    name=name,
                    created_at=now,
                    accessed_at=now,
                    modified_at=now,
                    item_count=0,
                )
                client = cls(
                    metadata=metadata,
                    path_to_dataset=path_to_dataset,
                    lock=asyncio.Lock(),
                )
                await client._update_metadata()

        return client

    @override
    async def drop(self) -> None:
        """Remove the dataset directory together with everything in it, if it exists."""
        async with self._lock:
            if self.path_to_dataset.exists():
                await asyncio.to_thread(shutil.rmtree, self.path_to_dataset)

    @override
    async def purge(self) -> None:
        """Delete all item files and reset the item count, keeping the metadata file in place."""
        async with self._lock:
            for file_path in await self._get_sorted_data_files():
                await asyncio.to_thread(file_path.unlink, missing_ok=True)

            await self._update_metadata(
                update_accessed_at=True,
                update_modified_at=True,
                new_item_count=0,
            )

    @override
    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
        """Store a single item or a list of items as new files in the dataset directory.

        The file name of each item is derived from the running item count, so the items keep the order
        in which they were pushed.

        Args:
            data: A single item or a list of items to store.
        """
        async with self._lock:
            new_item_count = self._metadata.item_count
            items = data if isinstance(data, list) else [data]

            for item in items:
                new_item_count += 1
                await self._push_item(item, new_item_count)

            # Update the metadata under the same lock, so the count always matches the written files.
            await self._update_metadata(
                update_accessed_at=True,
                update_modified_at=True,
                new_item_count=new_item_count,
            )

    @override
    async def get_data(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> DatasetItemsListPage:
        """Read a page of items from the dataset directory.

        Only `offset`, `limit`, `desc` and `skip_empty` are honored. Passing any of the remaining
        options logs a warning and has no other effect.

        Args:
            offset: The number of item files to skip.
            limit: The maximum number of item files to read, or `None` for no limit.
            clean: Not supported by this client.
            desc: Whether to return the items in reverse order.
            fields: Not supported by this client.
            omit: Not supported by this client.
            unwind: Not supported by this client.
            skip_empty: Whether to leave out empty items.
            skip_hidden: Not supported by this client.
            flatten: Not supported by this client.
            view: Not supported by this client.

        Returns:
            The page with the selected items. Files that vanish or hold corrupt JSON are skipped.
        """
        # Check for unsupported arguments and log a warning if found.
        unsupported_args: dict[str, Any] = {
            'clean': clean,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_hidden': skip_hidden,
            'flatten': flatten,
            'view': view,
        }
        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}

        if unsupported:
            logger.warning(
                f'The arguments {list(unsupported.keys())} of get_data are not supported by the '
                f'{self.__class__.__name__} client.'
            )

        # If the dataset directory does not exist, log a warning and return an empty page.
        if not self.path_to_dataset.exists():
            logger.warning(f'Dataset directory not found: {self.path_to_dataset}')
            return DatasetItemsListPage(
                count=0,
                offset=offset,
                limit=limit or 0,
                total=0,
                desc=desc,
                items=[],
            )

        # Get the list of sorted data files.
        async with self._lock:
            try:
                data_files = await self._get_sorted_data_files()
            except FileNotFoundError:
                # The directory was dropped between the existence check and the listing.
                return DatasetItemsListPage(count=0, offset=offset, limit=limit or 0, total=0, desc=desc, items=[])

        total = len(data_files)

        # Reverse the order if descending order is requested.
        if desc:
            data_files.reverse()

        # Apply offset and limit slicing.
        selected_files = data_files[offset:]
        if limit is not None:
            selected_files = selected_files[:limit]

        # Read and parse each data file. The lock is not held here, so files may disappear meanwhile.
        items = list[dict[str, Any]]()
        for file_path in selected_files:
            try:
                file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')
            except FileNotFoundError:
                logger.warning(f'File disappeared during get_data(): {file_path}, skipping')
                continue

            try:
                item = json.loads(file_content)
            except json.JSONDecodeError:
                logger.exception(f'Corrupt JSON in {file_path}, skipping')
                continue

            # Skip empty items if requested.
            if skip_empty and not item:
                continue

            items.append(item)

        async with self._lock:
            await self._update_metadata(update_accessed_at=True)

        # Return a paginated list page of dataset items.
        return DatasetItemsListPage(
            count=len(items),
            offset=offset,
            # Without an explicit limit, report the number of items available from the offset on.
            limit=limit or total - offset,
            total=total,
            desc=desc,
            items=items,
        )

    @override
    async def iterate_items(
        self,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
    ) -> AsyncIterator[dict[str, Any]]:
        """Yield the items of the dataset one by one.

        Only `offset`, `limit`, `desc` and `skip_empty` are honored. Passing any of the remaining
        options logs a warning and has no other effect.

        Args:
            offset: The number of item files to skip.
            limit: The maximum number of item files to read, or `None` for no limit.
            clean: Not supported by this client.
            desc: Whether to yield the items in reverse order.
            fields: Not supported by this client.
            omit: Not supported by this client.
            unwind: Not supported by this client.
            skip_empty: Whether to leave out empty items.
            skip_hidden: Not supported by this client.

        Yields:
            The parsed items. Files that vanish or hold corrupt JSON are skipped.
        """
        # Check for unsupported arguments and log a warning if found.
        unsupported_args: dict[str, Any] = {
            'clean': clean,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_hidden': skip_hidden,
        }
        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}

        if unsupported:
            logger.warning(
                f'The arguments {list(unsupported.keys())} of iterate_items are not supported '
                f'by the {self.__class__.__name__} client.'
            )

        # If the dataset directory does not exist, log a warning and return immediately.
        if not self.path_to_dataset.exists():
            logger.warning(f'Dataset directory not found: {self.path_to_dataset}')
            return

        # Get the list of sorted data files.
        async with self._lock:
            try:
                data_files = await self._get_sorted_data_files()
            except FileNotFoundError:
                # The directory was dropped between the existence check and the listing.
                return

        # Reverse the order if descending order is requested.
        if desc:
            data_files.reverse()

        # Apply offset and limit slicing.
        selected_files = data_files[offset:]
        if limit is not None:
            selected_files = selected_files[:limit]

        # Iterate over each data file, reading and yielding its parsed content.
        for file_path in selected_files:
            try:
                file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')
            except FileNotFoundError:
                logger.warning(f'File disappeared during iterate_items(): {file_path}, skipping')
                continue

            try:
                item = json.loads(file_content)
            except json.JSONDecodeError:
                logger.exception(f'Corrupt JSON in {file_path}, skipping')
                continue

            # Skip empty items if requested.
            if skip_empty and not item:
                continue

            yield item

        # Reached only when the consumer exhausts the iterator.
        async with self._lock:
            await self._update_metadata(update_accessed_at=True)

    async def _update_metadata(
        self,
        *,
        new_item_count: int | None = None,
        update_accessed_at: bool = False,
        update_modified_at: bool = False,
    ) -> None:
        """Update the dataset metadata file with current information.

        Args:
            new_item_count: If provided, update the item count to this value.
            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.
            update_modified_at: If True, update the `modified_at` timestamp to the current time.
        """
        now = datetime.now(timezone.utc)

        if update_accessed_at:
            self._metadata.accessed_at = now
        if update_modified_at:
            self._metadata.modified_at = now
        if new_item_count is not None:
            self._metadata.item_count = new_item_count

        # Ensure the parent directory for the metadata file exists.
        await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)

        # Dump the serialized metadata to the file.
        data = await json_dumps(self._metadata.model_dump())
        await atomic_write(self.path_to_metadata, data)

    async def _push_item(self, item: dict[str, Any], item_id: int) -> None:
        """Push a single item to the dataset.

        This method writes the item as a JSON file with a zero-padded numeric filename
        that reflects its position in the dataset sequence.

        Args:
            item: The data item to add to the dataset.
            item_id: The sequential ID to use for this item's filename.
        """
        # Generate the filename for the new item using zero-padded numbering.
        filename = f'{str(item_id).zfill(self._ITEM_FILENAME_DIGITS)}.json'
        file_path = self.path_to_dataset / filename

        # Ensure the dataset directory exists.
        await asyncio.to_thread(self.path_to_dataset.mkdir, parents=True, exist_ok=True)

        # Dump the serialized item to the file.
        data = await json_dumps(item)
        await atomic_write(file_path, data)

    async def _get_sorted_data_files(self) -> list[Path]:
        """Retrieve and return a sorted list of data files in the dataset directory.

        The files are sorted numerically based on the filename (without extension),
        which corresponds to the order items were added to the dataset.

        Returns:
            A list of `Path` objects pointing to data files, sorted by numeric filename.
        """
        # Retrieve and sort all JSON files in the dataset directory numerically. Files whose name is
        # not a number sort first (key 0).
        files = await asyncio.to_thread(
            lambda: sorted(
                self.path_to_dataset.glob('*.json'),
                key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
            )
        )

        # Remove the metadata file from the list if present.
        if self.path_to_metadata in files:
            files.remove(self.path_to_metadata)

        return files


================================================
FILE: src/crawlee/storage_clients/_file_system/_key_value_store_client.py
================================================
from __future__ import annotations

import asyncio
import functools
import json
import shutil
import urllib.parse
from datetime import datetime, timezone
from logging import getLogger
from pathlib import Path
from typing import TYPE_CHECKING, Any

from pydantic import ValidationError
from typing_extensions import Self, override

from crawlee._consts import METADATA_FILENAME
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import KeyValueStoreClient
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from crawlee.configuration import Configuration


logger = getLogger(__name__)


class FileSystemKeyValueStoreClient(KeyValueStoreClient):
    """File system implementation of the key-value store client.

    This client persists data to the file system, making it suitable for scenarios where data needs to
    survive process restarts. Keys are mapped to file paths in a directory structure following the pattern:

    ```
    {STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}
    ```

    Binary data is stored as-is, while JSON and text data are stored in human-readable format.
    The implementation automatically handles serialization based on the content type and
    maintains metadata about each record. Every record consists of two files: the value file, named after
    the URL-quoted key, and a sibling `<value file name>.<METADATA_FILENAME>` file describing it.

    This implementation is ideal for long-running crawlers where persistence is important and
    for development environments where you want to easily inspect the stored data between runs.
    """

    _STORAGE_SUBDIR = 'key_value_stores'
    """The name of the subdirectory where key-value stores are stored."""

    _STORAGE_SUBSUBDIR_DEFAULT = 'default'
    """The name of the subdirectory for the default key-value store."""

    def __init__(
        self,
        *,
        metadata: KeyValueStoreMetadata,
        path_to_kvs: Path,
        lock: asyncio.Lock,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `FileSystemKeyValueStoreClient.open` class method to create a new instance.
        """
        self._metadata = metadata

        self._path_to_kvs = path_to_kvs
        """The full path to the key-value store directory."""

        self._lock = lock
        """A lock to ensure that only one operation is performed at a time."""

    @override
    async def get_metadata(self) -> KeyValueStoreMetadata:
        return self._metadata

    @property
    def path_to_kvs(self) -> Path:
        """The full path to the key-value store directory."""
        return self._path_to_kvs

    @property
    def path_to_metadata(self) -> Path:
        """The full path to the key-value store metadata file."""
        return self.path_to_kvs / METADATA_FILENAME

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        configuration: Configuration,
    ) -> Self:
        """Open or create a file system key-value store client.

        This method attempts to open an existing key-value store from the file system. If a KVS with the specified
        ID or name exists, it loads the metadata from the stored files. If no existing store is found, a new one
        is created.

        Args:
            id: The ID of the key-value store to open. If provided, searches for existing store by ID.
            name: The name of the key-value store for named (global scope) storages.
            alias: The alias of the key-value store for unnamed (run scope) storages.
            configuration: The configuration object containing storage directory settings.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If a store with the specified ID is not found, if metadata is invalid,
                or if both name and alias are provided.
        """
        # Validate input parameters.
        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

        kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR

        if not kvs_base_path.exists():
            await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True)

        # Get a new instance by ID.
        if id:
            found = False
            for kvs_dir in kvs_base_path.iterdir():
                if not kvs_dir.is_dir():
                    continue

                path_to_metadata = kvs_dir / METADATA_FILENAME
                if not path_to_metadata.exists():
                    continue

                # Read and close the metadata file *before* the client is created, because
                # `_update_metadata` rewrites this very file and replacing a file that is
                # still open can fail on some platforms. Unreadable or invalid metadata just
                # means this directory is not the store we are looking for.
                try:
                    file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                    try:
                        file_content = json.load(file)
                    finally:
                        await asyncio.to_thread(file.close)
                    metadata = KeyValueStoreMetadata(**file_content)
                except (json.JSONDecodeError, ValidationError):
                    continue

                if metadata.id != id:
                    continue

                # `iterdir()` already yields paths that include `kvs_base_path`; joining the
                # base again would duplicate the prefix whenever the storage dir is relative.
                client = cls(
                    metadata=metadata,
                    path_to_kvs=kvs_dir,
                    lock=asyncio.Lock(),
                )
                await client._update_metadata(update_accessed_at=True)
                found = True
                break

            if not found:
                raise ValueError(f'Key-value store with ID "{id}" not found.')

        # Get a new instance by name or alias.
        else:
            kvs_dir = Path(name) if name else Path(alias) if alias else Path(cls._STORAGE_SUBSUBDIR_DEFAULT)
            path_to_kvs = kvs_base_path / kvs_dir
            path_to_metadata = path_to_kvs / METADATA_FILENAME

            # If the key-value store directory exists, reconstruct the client from the metadata file.
            if path_to_kvs.exists() and path_to_metadata.exists():
                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                try:
                    file_content = json.load(file)
                finally:
                    await asyncio.to_thread(file.close)
                try:
                    metadata = KeyValueStoreMetadata(**file_content)
                except ValidationError as exc:
                    raise ValueError(f'Invalid metadata file for key-value store "{name or alias}"') from exc

                client = cls(
                    metadata=metadata,
                    path_to_kvs=path_to_kvs,
                    lock=asyncio.Lock(),
                )

                await client._update_metadata(update_accessed_at=True)

            # Otherwise, create a new key-value store client.
            else:
                now = datetime.now(timezone.utc)
                metadata = KeyValueStoreMetadata(
                    id=crypto_random_object_id(),
                    name=name,
                    created_at=now,
                    accessed_at=now,
                    modified_at=now,
                )
                client = cls(
                    metadata=metadata,
                    path_to_kvs=path_to_kvs,
                    lock=asyncio.Lock(),
                )
                await client._update_metadata()

        return client

    @override
    async def drop(self) -> None:
        async with self._lock:
            # Check for the directory while holding the lock, so that a concurrent `drop`
            # cannot remove it between the check and the recursive removal.
            if self.path_to_kvs.exists():
                await asyncio.to_thread(shutil.rmtree, self.path_to_kvs)

    @override
    async def purge(self) -> None:
        async with self._lock:
            # Remove every entry of the store directory except the store's own metadata file.
            for file_path in self.path_to_kvs.glob('*'):
                if file_path.name == METADATA_FILENAME:
                    continue
                await asyncio.to_thread(file_path.unlink, missing_ok=True)

            await self._update_metadata(
                update_accessed_at=True,
                update_modified_at=True,
            )

    @override
    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
        # Update the metadata to record access
        async with self._lock:
            await self._update_metadata(update_accessed_at=True)

        record_path = self.path_to_kvs / self._encode_key(key)

        if not record_path.exists():
            return None

        # Found a file for this key, now look for its metadata
        record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')
        if not record_metadata_filepath.exists():
            logger.warning(f'Found value file for key "{key}" but no metadata file.')
            return None

        # Read the metadata file
        async with self._lock:
            try:
                file = await asyncio.to_thread(
                    functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'),
                )
            except FileNotFoundError:
                logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                return None

            try:
                metadata_content = json.load(file)
            except json.JSONDecodeError:
                logger.warning(f'Invalid metadata file for key "{key}"')
                return None
            finally:
                await asyncio.to_thread(file.close)

        try:
            metadata = KeyValueStoreRecordMetadata(**metadata_content)
        except ValidationError:
            logger.warning(f'Invalid metadata schema for key "{key}"')
            return None

        # Read the actual value
        try:
            value_bytes = await asyncio.to_thread(record_path.read_bytes)
        except FileNotFoundError:
            logger.warning(f'Value file disappeared for key "{key}"')
            return None

        # Handle None values
        if metadata.content_type == 'application/x-none':
            value = None
        # Handle JSON values
        elif 'application/json' in metadata.content_type:
            try:
                value = json.loads(value_bytes.decode('utf-8'))
            except (json.JSONDecodeError, UnicodeDecodeError):
                logger.warning(f'Failed to decode JSON value for key "{key}"')
                return None
        # Handle text values
        elif metadata.content_type.startswith('text/'):
            try:
                value = value_bytes.decode('utf-8')
            except UnicodeDecodeError:
                logger.warning(f'Failed to decode text value for key "{key}"')
                return None
        # Handle binary values
        else:
            value = value_bytes

        # Calculate the size of the value in bytes
        size = len(value_bytes)

        return KeyValueStoreRecord(
            key=metadata.key,
            value=value,
            content_type=metadata.content_type,
            size=size,
        )

    @override
    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
        # Special handling for None values
        if value is None:
            content_type = 'application/x-none'  # Special content type to identify None values
            value_bytes = b''
        else:
            content_type = content_type or infer_mime_type(value)

            # Serialize the value to bytes.
            if 'application/json' in content_type:
                value_bytes = (await json_dumps(value)).encode('utf-8')
            elif isinstance(value, str):
                value_bytes = value.encode('utf-8')
            elif isinstance(value, (bytes, bytearray)):
                value_bytes = value
            else:
                # Fallback: attempt to convert to string and encode.
                value_bytes = str(value).encode('utf-8')

        record_path = self.path_to_kvs / self._encode_key(key)

        # Prepare the metadata
        size = len(value_bytes)
        record_metadata = KeyValueStoreRecordMetadata(key=key, content_type=content_type, size=size)
        record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')
        record_metadata_content = await json_dumps(record_metadata.model_dump())

        async with self._lock:
            # Ensure the key-value store directory exists.
            await asyncio.to_thread(self.path_to_kvs.mkdir, parents=True, exist_ok=True)

            # Write the value to the file.
            await atomic_write(record_path, value_bytes)

            # Write the record metadata to the file.
            await atomic_write(record_metadata_filepath, record_metadata_content)

            # Update the KVS metadata to record the access and modification.
            await self._update_metadata(update_accessed_at=True, update_modified_at=True)

    @override
    async def delete_value(self, *, key: str) -> None:
        record_path = self.path_to_kvs / self._encode_key(key)
        metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')
        deleted = False

        async with self._lock:
            # Delete the value file and its metadata if found
            if record_path.exists():
                await asyncio.to_thread(record_path.unlink, missing_ok=True)

                # Delete the metadata file if it exists
                if metadata_path.exists():
                    await asyncio.to_thread(metadata_path.unlink, missing_ok=True)
                else:
                    logger.warning(f'Found value file for key "{key}" but no metadata file when trying to delete it.')

                deleted = True

            # If we deleted something, update the KVS metadata
            if deleted:
                await self._update_metadata(update_accessed_at=True, update_modified_at=True)

    @override
    async def iterate_keys(
        self,
        *,
        exclusive_start_key: str | None = None,
        limit: int | None = None,
    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
        # Check if the KVS directory exists
        if not self.path_to_kvs.exists():
            return

        # List and sort all files *inside* a brief lock, then release it immediately:
        async with self._lock:
            files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))

        count = 0

        for file_path in files:
            # Skip the main metadata file
            if file_path.name == METADATA_FILENAME:
                continue

            # Only process metadata files for records
            if not file_path.name.endswith(f'.{METADATA_FILENAME}'):
                continue

            # Extract the base key name from the metadata filename
            key_name = self._decode_key(file_path.name[: -len(f'.{METADATA_FILENAME}')])

            # Apply exclusive_start_key filter if provided
            if exclusive_start_key is not None and key_name <= exclusive_start_key:
                continue

            # Try to read and parse the metadata file
            try:
                metadata_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')
            except FileNotFoundError:
                logger.warning(f'Metadata file disappeared for key "{key_name}", skipping it.')
                continue

            try:
                metadata_dict = json.loads(metadata_content)
            except json.JSONDecodeError:
                logger.warning(f'Failed to decode metadata file for key "{key_name}", skipping it.')
                continue

            try:
                record_metadata = KeyValueStoreRecordMetadata(**metadata_dict)
            except ValidationError:
                logger.warning(f'Invalid metadata schema for key "{key_name}", skipping it.')
                # Actually skip the record: without this, the code would fall through and either
                # fail on an unbound `record_metadata` or yield the previous record a second time.
                continue

            yield record_metadata

            count += 1
            if limit and count >= limit:
                break

        # Update accessed_at timestamp
        async with self._lock:
            await self._update_metadata(update_accessed_at=True)

    @override
    async def get_public_url(self, *, key: str) -> str:
        """Return a file:// URL for the given key.

        Args:
            key: The key to get the public URL for.

        Returns:
            A file:// URL pointing to the file on the local filesystem.
        """
        record_path = self.path_to_kvs / self._encode_key(key)
        absolute_path = record_path.absolute()
        return absolute_path.as_uri()

    @override
    async def record_exists(self, *, key: str) -> bool:
        """Check if a record with the given key exists in the key-value store.

        Args:
            key: The key to check for existence.

        Returns:
            True if a record with the given key exists, False otherwise.
        """
        # Update the metadata to record access
        async with self._lock:
            await self._update_metadata(update_accessed_at=True)

        record_path = self.path_to_kvs / self._encode_key(key)
        record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')

        # Both the value file and metadata file must exist for a record to be considered existing
        return record_path.exists() and record_metadata_filepath.exists()

    async def _update_metadata(
        self,
        *,
        update_accessed_at: bool = False,
        update_modified_at: bool = False,
    ) -> None:
        """Update the KVS metadata file with current information.

        Args:
            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.
            update_modified_at: If True, update the `modified_at` timestamp to the current time.
        """
        now = datetime.now(timezone.utc)

        if update_accessed_at:
            self._metadata.accessed_at = now
        if update_modified_at:
            self._metadata.modified_at = now

        # Ensure the parent directory for the metadata file exists.
        await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)

        # Dump the serialized metadata to the file.
        data = await json_dumps(self._metadata.model_dump())
        await atomic_write(self.path_to_metadata, data)

    def _encode_key(self, key: str) -> str:
        """Encode a key to make it safe for use in a file path."""
        return urllib.parse.quote(key, safe='')

    def _decode_key(self, encoded_key: str) -> str:
        """Decode a key that was encoded to make it safe for use in a file path."""
        return urllib.parse.unquote(encoded_key)


================================================
FILE: src/crawlee/storage_clients/_file_system/_request_queue_client.py
================================================
from __future__ import annotations

import asyncio
import functools
import json
import shutil
from collections import deque
from datetime import datetime, timezone
from hashlib import sha256
from logging import getLogger
from pathlib import Path
from typing import TYPE_CHECKING

from pydantic import BaseModel, ValidationError
from typing_extensions import Self, override

from crawlee import Request
from crawlee._consts import METADATA_FILENAME
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.file import atomic_write, json_dumps
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee._utils.recoverable_state import RecoverableState
from crawlee.storage_clients._base import RequestQueueClient
from crawlee.storage_clients.models import (
    AddRequestsResponse,
    ProcessedRequest,
    RequestQueueMetadata,
    UnprocessedRequest,
)

if TYPE_CHECKING:
    from collections.abc import Sequence

    from crawlee.configuration import Configuration
    from crawlee.storages import KeyValueStore

logger = getLogger(__name__)


class RequestQueueState(BaseModel):
    """State model for the `FileSystemRequestQueueClient`.

    An instance of this model is used as the default value of the `RecoverableState`
    that the client builds in `_create_recoverable_state`, where it is persisted under
    the key `__RQ_STATE_{id}`. It records ordering counters and which request unique
    keys are queued, in progress, or handled.
    """

    # Plain counter; starts at zero for a fresh queue.
    sequence_counter: int = 0
    """Counter for regular request ordering."""

    # Read and then incremented when a new request is added with `forefront=True`.
    forefront_sequence_counter: int = 0
    """Counter for forefront request ordering."""

    # Keyed by `Request.unique_key`.
    forefront_requests: dict[str, int] = {}
    """Mapping of forefront request unique keys to their sequence numbers."""

    # Keyed by `Request.unique_key`.
    regular_requests: dict[str, int] = {}
    """Mapping of regular request unique keys to their sequence numbers."""

    # Requests listed here are reported as already present when they are added again.
    in_progress_requests: set[str] = set()
    """Set of request unique keys currently being processed."""

    # Requests listed here are reported as already handled when they are added again.
    handled_requests: set[str] = set()
    """Set of request unique keys that have been handled."""


class FileSystemRequestQueueClient(RequestQueueClient):
    """A file system implementation of the request queue client.

    This client persists requests to the file system as individual JSON files, making it suitable for scenarios
    where data needs to survive process restarts. Each request is stored as a separate file in a directory
    structure following the pattern:

    ```
    {STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json
    ```

    The implementation uses `RecoverableState` to maintain ordering information, in-progress status, and
    request handling status. This allows for proper state recovery across process restarts without
    embedding metadata in individual request files. File system storage provides durability at the cost of
    slower I/O operations compared to memory only-based storage.

    This implementation is ideal for long-running crawlers where persistence is important and for situations
    where you need to resume crawling after process termination.
    """

    _STORAGE_SUBDIR = 'request_queues'
    """The name of the subdirectory where request queues are stored."""

    _STORAGE_SUBSUBDIR_DEFAULT = 'default'
    """The name of the subdirectory for the default request queue."""

    _MAX_REQUESTS_IN_CACHE = 100_000
    """Maximum number of requests to keep in cache for faster access."""

    def __init__(
        self,
        *,
        metadata: RequestQueueMetadata,
        path_to_rq: Path,
        lock: asyncio.Lock,
        recoverable_state: RecoverableState[RequestQueueState],
    ) -> None:
        """Wire the client to its already-prepared collaborators.

        Instances are normally obtained through the `FileSystemRequestQueueClient.open`
        class method rather than by calling this constructor directly.
        """
        # Collaborators supplied by the caller.
        self._metadata = metadata

        self._path_to_rq = path_to_rq
        """Directory on disk that holds this request queue."""

        self._lock = lock
        """Serializes operations so that only one runs at a time."""

        self._state = recoverable_state
        """Recoverable state tracking request ordering plus in-progress and handled requests."""

        # In-memory bookkeeping; everything starts out empty / unknown.
        self._request_cache: deque[Request] = deque()
        """Cached requests: forefront requests at the beginning, regular requests at the end."""

        self._request_cache_needs_refresh = True
        """Whether the cache has to be rebuilt from the file system before it is used."""

        self._is_empty_cache: bool | None = None
        """Cached `is_empty` result: `None` means unknown, `True`/`False` is the cached state."""

    @override
    async def get_metadata(self) -> RequestQueueMetadata:
        """Return the metadata object currently held by this client."""
        current_metadata = self._metadata
        return current_metadata

    @property
    def path_to_rq(self) -> Path:
        """Directory on disk that holds this request queue."""
        queue_dir = self._path_to_rq
        return queue_dir

    @property
    def path_to_metadata(self) -> Path:
        """Location of the metadata file inside the request queue directory."""
        queue_dir = self.path_to_rq
        return queue_dir / METADATA_FILENAME

    @classmethod
    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
        """Build the `RecoverableState` that backs a request queue with the given ID.

        The state starts from a default `RequestQueueState`, has persistence enabled and is
        stored under a key derived from the queue ID. The key-value store used for persistence
        is opened lazily through a factory that uses `FileSystemStorageClient` and the given
        configuration.

        Args:
            id: The ID of the request queue; it becomes part of the persistence key.
            configuration: The configuration passed on when the key-value store is opened.

        Returns:
            The recoverable state object. It is returned as constructed; this method does not
            call `initialize` on it.
        """
        state_key = f'__RQ_STATE_{id}'

        async def open_state_store() -> KeyValueStore:
            # Imported here rather than at module level to avoid a circular import.
            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import
            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import

            store = await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
            return store

        return RecoverableState[RequestQueueState](
            default_state=RequestQueueState(),
            persist_state_key=state_key,
            persist_state_kvs_factory=open_state_store,
            persistence_enabled=True,
            logger=logger,
        )

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        configuration: Configuration,
    ) -> Self:
        """Open or create a file system request queue client.

        This method attempts to open an existing request queue from the file system. If a queue with the specified
        ID or name exists, it loads the metadata and state from the stored files. If no existing queue is found,
        a new one is created.

        Args:
            id: The ID of the request queue to open. If provided, searches for existing queue by ID.
            name: The name of the request queue for named (global scope) storages.
            alias: The alias of the request queue for unnamed (run scope) storages.
            configuration: The configuration object containing storage directory settings.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If a queue with the specified ID is not found, if metadata is invalid,
                or if both name and alias are provided.
        """
        # Validate input parameters.
        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

        rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR

        if not rq_base_path.exists():
            await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True)

        # Open an existing RQ by its ID, raise an error if not found.
        if id:
            found = False
            for rq_dir in rq_base_path.iterdir():
                if not rq_dir.is_dir():
                    continue

                path_to_metadata = rq_dir / METADATA_FILENAME
                if not path_to_metadata.exists():
                    continue

                # Read and close the metadata file *before* the client is created and initialised,
                # so that no handle on it is open while the queue's files are being touched.
                # Unreadable or invalid metadata just means this directory is not the queue we
                # are looking for.
                try:
                    file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                    try:
                        file_content = json.load(file)
                    finally:
                        await asyncio.to_thread(file.close)
                    metadata = RequestQueueMetadata(**file_content)
                except (json.JSONDecodeError, ValidationError):
                    continue

                if metadata.id != id:
                    continue

                # `iterdir()` already yields paths that include `rq_base_path`; joining the base
                # again would duplicate the prefix whenever the storage dir is relative.
                client = cls(
                    metadata=metadata,
                    path_to_rq=rq_dir,
                    lock=asyncio.Lock(),
                    recoverable_state=await cls._create_recoverable_state(id=id, configuration=configuration),
                )
                await client._state.initialize()
                await client._discover_existing_requests()
                await client._update_metadata(update_accessed_at=True)
                found = True
                break

            if not found:
                raise ValueError(f'Request queue with ID "{id}" not found')

        # Open an existing RQ by its name or alias, or create a new one if not found.
        else:
            rq_dir = Path(name) if name else Path(alias) if alias else Path(cls._STORAGE_SUBSUBDIR_DEFAULT)
            path_to_rq = rq_base_path / rq_dir
            path_to_metadata = path_to_rq / METADATA_FILENAME

            # If the RQ directory exists, reconstruct the client from the metadata file.
            if path_to_rq.exists() and path_to_metadata.exists():
                file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8')
                try:
                    file_content = json.load(file)
                finally:
                    await asyncio.to_thread(file.close)
                try:
                    metadata = RequestQueueMetadata(**file_content)
                except ValidationError as exc:
                    raise ValueError(f'Invalid metadata file for request queue "{name or alias}"') from exc

                client = cls(
                    metadata=metadata,
                    path_to_rq=path_to_rq,
                    lock=asyncio.Lock(),
                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
                )

                await client._state.initialize()
                await client._discover_existing_requests()
                await client._update_metadata(update_accessed_at=True)

            # Otherwise, create a new request queue client.
            else:
                now = datetime.now(timezone.utc)
                metadata = RequestQueueMetadata(
                    id=crypto_random_object_id(),
                    name=name,
                    created_at=now,
                    accessed_at=now,
                    modified_at=now,
                    had_multiple_clients=False,
                    handled_request_count=0,
                    pending_request_count=0,
                    total_request_count=0,
                )
                client = cls(
                    metadata=metadata,
                    path_to_rq=path_to_rq,
                    lock=asyncio.Lock(),
                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
                )
                await client._state.initialize()
                await client._update_metadata()

        return client

    @override
    async def drop(self) -> None:
        """Delete the queue directory and clear all state held for this queue.

        Under the client lock this removes the request queue directory (if present), resets
        and tears down the recoverable state, empties the request cache and marks it for
        refresh, and forgets the cached `is_empty` result.
        """
        async with self._lock:
            queue_dir = self.path_to_rq

            # Nothing to delete on disk when the directory is already gone.
            if queue_dir.exists():
                await asyncio.to_thread(shutil.rmtree, queue_dir)

            # Wipe the recoverable state and shut it down.
            recoverable_state = self._state
            await recoverable_state.reset()
            await recoverable_state.teardown()

            # Drop the cached requests and force a reload on next use.
            self._request_cache.clear()
            self._request_cache_needs_refresh = True

            # The cached emptiness result is no longer trustworthy.
            self._is_empty_cache = None

    @override
    async def purge(self) -> None:
        """Remove all stored requests while keeping the queue itself.

        Under the client lock this deletes every file reported by `_get_request_files`,
        resets the recoverable state, empties the request cache and marks it for refresh,
        writes metadata with all request counters set to zero, and forgets the cached
        `is_empty` result.
        """
        async with self._lock:
            # Delete the request files one by one; a file that is already gone is not an error.
            for request_file in await self._get_request_files(self.path_to_rq):
                await asyncio.to_thread(request_file.unlink, missing_ok=True)

            # Wipe the recoverable state.
            await self._state.reset()

            # Drop the cached requests and force a reload on next use.
            self._request_cache.clear()
            self._request_cache_needs_refresh = True

            # Persist the emptied counters together with fresh timestamps.
            zeroed_counters = {
                'new_pending_request_count': 0,
                'new_handled_request_count': 0,
                'new_total_request_count': 0,
            }
            await self._update_metadata(
                update_modified_at=True,
                update_accessed_at=True,
                **zeroed_counters,
            )

            # The cached emptiness result is no longer trustworthy.
            self._is_empty_cache = None

    @override
    async def add_batch_of_requests(
        self,
        requests: Sequence[Request],
        *,
        forefront: bool = False,
    ) -> AddRequestsResponse:
        """Add a batch of requests to the queue, or move already queued ones to the forefront.

        Args:
            requests: The requests to add.
            forefront: If True, new requests are stored as forefront requests and requests that are
                already queued are moved to the forefront. If False, queued requests keep their position.

        Returns:
            A response listing the processed and the unprocessed requests.
        """
        async with self._lock:
            self._is_empty_cache = None
            total_count = self._metadata.total_request_count
            pending_count = self._metadata.pending_request_count
            processed = list[ProcessedRequest]()
            unprocessed = list[UnprocessedRequest]()
            state = self._state.current_value

            # Snapshot of everything queued (forefront and regular) before this batch is applied.
            queued_before = state.forefront_requests | state.regular_requests

            def _report(key: str, *, present: bool, handled: bool) -> None:
                """Record the outcome for a single unique key."""
                processed.append(
                    ProcessedRequest(
                        unique_key=key,
                        was_already_present=present,
                        was_already_handled=handled,
                    )
                )

            # Requests that need a state change, keyed by unique key so duplicates within the batch collapse.
            to_enqueue: dict[str, Request] = {}

            # First pass: sort out requests that need no change at all.
            for request in requests:
                key = request.unique_key

                # Already handled requests are only reported.
                if key in state.handled_requests:
                    _report(key, present=True, handled=True)
                    continue

                # In-progress requests are left alone, and so are queued requests unless `forefront` is used.
                keeps_position = key in queued_before and not forefront
                if key in state.in_progress_requests or keeps_position:
                    _report(key, present=True, handled=False)
                    continue

                to_enqueue[key] = request

            # Second pass: store new requests and reposition existing ones.
            for key, request in to_enqueue.items():
                if key not in queued_before:
                    # Brand new request: assign the next sequence number of the matching group.
                    request_path = self._get_request_path(key)
                    if forefront:
                        state.forefront_requests[key] = state.forefront_sequence_counter
                        state.forefront_sequence_counter += 1
                    else:
                        state.regular_requests[key] = state.sequence_counter
                        state.sequence_counter += 1

                    # Persist the request to its own file.
                    await atomic_write(request_path, await json_dumps(request.model_dump()))

                    total_count += 1
                    pending_count += 1
                    _report(key, present=False, handled=False)

                elif forefront:
                    # Known request moved to the forefront: drop it from the regular group (if it is there)
                    # and give it a fresh forefront sequence number.
                    state.regular_requests.pop(key, None)
                    state.forefront_requests[key] = state.forefront_sequence_counter
                    state.forefront_sequence_counter += 1
                    _report(key, present=True, handled=False)

                else:
                    logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
                    unprocessed.append(
                        UnprocessedRequest(
                            unique_key=key,
                            url=request.url,
                            method=request.method,
                        )
                    )

            await self._update_metadata(
                update_modified_at=True,
                update_accessed_at=True,
                new_total_request_count=total_count,
                new_pending_request_count=pending_count,
            )

            # Forefront additions change what must be served first, so the cache has to be rebuilt.
            if forefront:
                self._request_cache_needs_refresh = True

            # Invalidate is_empty cache.
            self._is_empty_cache = None

            return AddRequestsResponse(
                processed_requests=processed,
                unprocessed_requests=unprocessed,
            )

    @override
    async def get_request(self, unique_key: str) -> Request | None:
        """Load a single request from its file.

        Args:
            unique_key: Unique key of the request to load.

        Returns:
            The parsed request, or `None` when its file could not be read or parsed.
        """
        async with self._lock:
            found = await self._parse_request_file(self._get_request_path(unique_key))

            if found is not None:
                await self._update_metadata(update_accessed_at=True)
                return found

            logger.warning(f'Request with unique key "{unique_key}" not found in the queue.')
            return None

    @override
    async def fetch_next_request(self) -> Request | None:
        """Take the next request from the cache and mark it as in progress.

        Returns:
            The next request, or `None` when the cache holds no request that is not already in progress.
        """
        async with self._lock:
            # Rebuild the cache when it was invalidated or has run dry.
            if self._request_cache_needs_refresh or not self._request_cache:
                await self._refresh_cache()

            in_progress = self._state.current_value.in_progress_requests

            # Consume the deque from the left, where the forefront requests sit.
            while self._request_cache:
                candidate = self._request_cache.popleft()

                # Skip requests that are already in progress, however this should not happen.
                if candidate.unique_key in in_progress:
                    continue

                in_progress.add(candidate.unique_key)
                return candidate

            return None

    @override
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Mark an in-progress request as handled and persist it.

        Args:
            request: The request to mark as handled. Its `handled_at` is filled in when it is not set yet.

        Returns:
            Information about the handled request, or `None` when the request is not in progress
            or its file does not exist.
        """
        async with self._lock:
            self._is_empty_cache = None
            state = self._state.current_value
            key = request.unique_key

            # Only requests that are currently in progress can be marked as handled.
            if key not in state.in_progress_requests:
                logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.')
                return None

            # Stamp the request unless it already carries a handled timestamp.
            if request.handled_at is None:
                request.handled_at = datetime.now(timezone.utc)

            request_path = self._get_request_path(key)
            file_exists = await asyncio.to_thread(request_path.exists)

            if not file_exists:
                logger.warning(f'Request file for {request.unique_key} does not exist, cannot mark as handled.')
                return None

            # Write the updated request back to its file.
            await atomic_write(request_path, await json_dumps(request.model_dump()))

            # Move the key from the in-progress set to the handled set.
            state.in_progress_requests.discard(key)
            state.handled_requests.add(key)

            # Reflect the change in the RQ metadata.
            await self._update_metadata(
                update_modified_at=True,
                update_accessed_at=True,
                new_handled_request_count=self._metadata.handled_request_count + 1,
                new_pending_request_count=self._metadata.pending_request_count - 1,
            )

            return ProcessedRequest(
                unique_key=key,
                was_already_present=True,
                was_already_handled=True,
            )

    @override
    async def reclaim_request(
        self,
        request: Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Return an in-progress request to the queue so that it can be fetched again.

        Args:
            request: The request to put back.
            forefront: If True, the request is re-queued among the forefront requests,
                otherwise among the regular ones.

        Returns:
            Information about the reclaimed request, or `None` when the request is not in progress
            or its file does not exist.
        """
        async with self._lock:
            self._is_empty_cache = None
            state = self._state.current_value
            key = request.unique_key

            # Only requests that are currently in progress can be reclaimed.
            if key not in state.in_progress_requests:
                logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')
                return None

            request_path = self._get_request_path(key)
            file_exists = await asyncio.to_thread(request_path.exists)

            if not file_exists:
                logger.warning(f'Request file for {request.unique_key} does not exist, cannot reclaim.')
                return None

            # Give the request a fresh sequence number in the target group and make sure
            # it is no longer tracked in the other group.
            if forefront:
                state.regular_requests.pop(key, None)
                state.forefront_requests[key] = state.forefront_sequence_counter
                state.forefront_sequence_counter += 1
            else:
                state.forefront_requests.pop(key, None)
                state.regular_requests[key] = state.sequence_counter
                state.sequence_counter += 1

            # Persist the request as it is now.
            await atomic_write(request_path, await json_dumps(request.model_dump()))

            # The request is no longer being processed.
            state.in_progress_requests.discard(key)

            # Update RQ metadata.
            await self._update_metadata(update_modified_at=True, update_accessed_at=True)

            # Put the request back into the cache: left end for forefront, right end otherwise.
            enqueue = self._request_cache.appendleft if forefront else self._request_cache.append
            enqueue(request)

            return ProcessedRequest(
                unique_key=key,
                was_already_present=True,
                was_already_handled=False,
            )

    @override
    async def is_empty(self) -> bool:
        """Tell whether the queue has no in-progress and no unhandled requests.

        The answer is memoized in `_is_empty_cache` and returned from there while it is set.

        Returns:
            True when nothing is in progress and every known request is handled, False otherwise.
        """
        async with self._lock:
            # A memoized answer wins.
            cached_answer = self._is_empty_cache
            if cached_answer is not None:
                return cached_answer

            state = self._state.current_value

            # Anything in progress means the queue is not empty.
            if state.in_progress_requests:
                self._is_empty_cache = False
                return False

            # Fast path: decide from the cached requests when there are any. Nothing is in progress
            # at this point, so the answer depends only on whether all cached requests are handled.
            if self._request_cache:
                all_cached_handled = all(
                    cached.unique_key in state.handled_requests for cached in self._request_cache
                )
                self._is_empty_cache = all_cached_handled
                return all_cached_handled

            # Slow path: consult the state for requests that are not handled yet.
            await self._update_metadata(update_accessed_at=True)

            known_keys = set(state.forefront_requests.keys()) | set(state.regular_requests.keys())
            everything_handled = known_keys <= state.handled_requests

            self._is_empty_cache = everything_handled
            return everything_handled

    def _get_request_path(self, unique_key: str) -> Path:
        """Get the path to a specific request file.

        Args:
            unique_key: Unique key of the request.

        Returns:
            The path to the request file.
        """
        return self.path_to_rq / f'{self._get_file_base_name_from_unique_key(unique_key)}.json'

    async def _update_metadata(
        self,
        *,
        new_handled_request_count: int | None = None,
        new_pending_request_count: int | None = None,
        new_total_request_count: int | None = None,
        update_had_multiple_clients: bool = False,
        update_accessed_at: bool = False,
        update_modified_at: bool = False,
    ) -> None:
        """Apply the requested changes to the request queue metadata and write it to the metadata file.

        The metadata file is rewritten on every call, even when no argument asks for a change.

        Args:
            new_handled_request_count: If provided, update the handled_request_count to this value.
            new_pending_request_count: If provided, update the pending_request_count to this value.
            new_total_request_count: If provided, update the total_request_count to this value.
            update_had_multiple_clients: If True, set had_multiple_clients to True.
            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.
            update_modified_at: If True, update the `modified_at` timestamp to the current time.
        """
        # One shared timestamp keeps `accessed_at` and `modified_at` identical when both are refreshed.
        now = datetime.now(timezone.utc)
        metadata = self._metadata

        if update_accessed_at:
            metadata.accessed_at = now

        if update_modified_at:
            metadata.modified_at = now

        # Counters are only overwritten when the caller passed an explicit value.
        counter_updates = (
            ('handled_request_count', new_handled_request_count),
            ('pending_request_count', new_pending_request_count),
            ('total_request_count', new_total_request_count),
        )
        for field_name, new_value in counter_updates:
            if new_value is not None:
                setattr(metadata, field_name, new_value)

        if update_had_multiple_clients:
            metadata.had_multiple_clients = True

        # Make sure the directory for the metadata file exists before writing.
        await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)

        # Serialize the metadata and write it to the file.
        serialized = await json_dumps(self._metadata.model_dump())
        await atomic_write(self.path_to_metadata, serialized)

    async def _refresh_cache(self) -> None:
        """Refresh the request cache from filesystem.

        This method loads up to _MAX_REQUESTS_IN_CACHE requests from the filesystem,
        prioritizing forefront requests and maintaining proper ordering.
        """
        self._request_cache.clear()
        state = self._state.current_value

        forefront_requests = list[tuple[Request, int]]()  # (request, sequence)
        regular_requests = list[tuple[Request, int]]()  # (request, sequence)

        request_files = await self._get_request_files(self.path_to_rq)

        for request_file in request_files:
            request = await self._parse_request_file(request_file)

            if request is None:
                continue

            # Skip handled requests
            if request.unique_key in state.handled_requests:
                continue

            # Skip in-progress requests
            if request.unique_key in state.in_progress_requests:
                continue

            # Determine if request is forefront or regular based on state
            if request.unique_key in state.forefront_requests:
                sequence = state.forefront_requests[request.unique_key]
                forefront_requests.append((request, sequence))
            elif request.unique_key in state.regular_requests:
                sequence = state.regular_requests[request.unique_key]
                regular_requests.append((request, sequence))
            else:
                # Request not in state, skip it (might be orphaned)
                logger.warning(f'Request {request.unique_key} not found in state, skipping.')
                continue

        # Sort forefront requests by sequence (newest first for LIFO behavior).
        forefront_requests.sort(key=lambda item: item[1], reverse=True)

        # Sort regular requests by sequence (oldest first for FIFO behavior).
        regular_requests.sort(key=lambda item: item[1], reverse=False)

        # Add forefront requests to the beginning of the cache (left side). Since forefront_requests are sorted
        # by sequence (newest first), we need to add them in reverse order to maintain correct priority.
        for request, _ in reversed(forefront_requests):
            if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE:
                break
            self._request_cache.appendleft(request)

        # Add regular requests to the end of the cache (right side).
        for request, _ in regular_requests:
            if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE:
                break
            self._request_cache.append(request)

        self._request_cache_needs_refresh = False

    @classmethod
    async def _get_request_files(cls, path_to_rq: Path) -> list[Path]:
        """Get all request files from the RQ.

        The request queue directory is created when it does not exist yet. The whole listing
        (directory creation, globbing and the per-entry `is_file` checks) runs inside a single
        worker thread: `Path.glob` is lazy, so the directory is only scanned while its result is
        being consumed, and that consumption must not happen on the event loop.

        Args:
            path_to_rq: The path to the request queue directory.

        Returns:
            A list of paths to all request files, i.e. regular `*.json` files other than the metadata file.
        """

        def _list_request_files() -> list[Path]:
            # Create the requests directory if it doesn't exist.
            path_to_rq.mkdir(parents=True, exist_ok=True)

            # List the json files, leaving out the metadata file and non-file entries.
            return [
                request_file
                for request_file in path_to_rq.glob('*.json')
                if request_file.is_file() and request_file.name != METADATA_FILENAME
            ]

        return await asyncio.to_thread(_list_request_files)

    @classmethod
    async def _parse_request_file(cls, file_path: Path) -> Request | None:
        """Parse a request file and return the `Request` object.

        Opening, reading and JSON-decoding the file all happen in one worker thread, so the event
        loop is not blocked by the read, and the file handle is closed by a `with` block.

        Args:
            file_path: The path to the request file.

        Returns:
            The parsed `Request` object or `None` if the file could not be read or parsed.
        """

        def _load_json() -> object:
            with file_path.open(mode='r', encoding='utf-8') as file:
                return json.load(file)

        # Read the file content and parse it as JSON.
        try:
            file_content = await asyncio.to_thread(_load_json)
        except FileNotFoundError:
            logger.warning(f'Request file "{file_path}" not found.')
            return None
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            # A file that is not valid UTF-8 is as unparsable as one that is not valid JSON.
            logger.warning(f'Failed to parse request file {file_path}: {exc!s}')
            return None

        # Validate the content against the Request model.
        try:
            return Request.model_validate(file_content)
        except ValidationError as exc:
            logger.warning(f'Failed to validate request file {file_path}: {exc!s}')
            return None

    async def _discover_existing_requests(self) -> None:
        """Discover and load existing requests into the state when opening an existing request queue."""
        request_files = await self._get_request_files(self.path_to_rq)
        state = self._state.current_value

        for request_file in request_files:
            request = await self._parse_request_file(request_file)
            if request is None:
                continue

            # Add request to state as regular request (assign sequence numbers)
            if request.unique_key not in state.regular_requests and request.unique_key not in state.forefront_requests:
                # Assign as regular request with current sequence counter
                state.regular_requests[request.unique_key] = state.sequence_counter
                state.sequence_counter += 1

                # Check if request was already handled
                if request.handled_at is not None:
                    state.handled_requests.add(request.unique_key)

    @staticmethod
    def _get_file_base_name_from_unique_key(unique_key: str) -> str:
        """Generate a deterministic file name for a unique_key.

        Args:
            unique_key: Unique key to be used to generate filename.

        Returns:
            A file name based on the unique_key.
        """
        # hexdigest produces filenames compliant strings
        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()
        name_length = 15
        # Truncate the key to the desired length
        return hashed_key[:name_length]


================================================
FILE: src/crawlee/storage_clients/_file_system/_storage_client.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee.configuration import Configuration
from crawlee.storage_clients._base import StorageClient

from ._dataset_client import FileSystemDatasetClient
from ._key_value_store_client import FileSystemKeyValueStoreClient
from ._request_queue_client import FileSystemRequestQueueClient

if TYPE_CHECKING:
    from collections.abc import Hashable


@docs_group('Storage clients')
class FileSystemStorageClient(StorageClient):
    """Storage client that keeps datasets, key-value stores and request queues on the local file system.

    Every storage type is backed by its own file system client, and each of them stores its data
    in a structured directory hierarchy.

    The data is written as JSON into predictable file paths, which makes it straightforward to inspect
    or modify it from outside of the Crawlee application.

    Stored data survives between program runs, but it is only reachable from the machine
    that holds the files.

    Warning: This storage client is not safe for concurrent access from multiple crawler processes.
    Use it only when running a single crawler process at a time.
    """

    @override
    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:
        # The storage directory is part of the key, so different client instances
        # pointing at the same `storage_dir` resolve to the same storage.
        base_key = super().get_storage_client_cache_key(configuration)
        return base_key, configuration.storage_dir

    @override
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> FileSystemDatasetClient:
        # Fall back to the global configuration when none is given.
        effective_configuration = configuration or Configuration.get_global_configuration()
        dataset_client = await FileSystemDatasetClient.open(
            id=id,
            name=name,
            alias=alias,
            configuration=effective_configuration,
        )
        await self._purge_if_needed(dataset_client, effective_configuration)
        return dataset_client

    @override
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> FileSystemKeyValueStoreClient:
        # Fall back to the global configuration when none is given.
        effective_configuration = configuration or Configuration.get_global_configuration()
        kvs_client = await FileSystemKeyValueStoreClient.open(
            id=id,
            name=name,
            alias=alias,
            configuration=effective_configuration,
        )
        await self._purge_if_needed(kvs_client, effective_configuration)
        return kvs_client

    @override
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> FileSystemRequestQueueClient:
        # Fall back to the global configuration when none is given.
        effective_configuration = configuration or Configuration.get_global_configuration()
        rq_client = await FileSystemRequestQueueClient.open(
            id=id,
            name=name,
            alias=alias,
            configuration=effective_configuration,
        )
        await self._purge_if_needed(rq_client, effective_configuration)
        return rq_client


================================================
FILE: src/crawlee/storage_clients/_file_system/_utils.py
================================================


================================================
FILE: src/crawlee/storage_clients/_file_system/py.typed
================================================


================================================
FILE: src/crawlee/storage_clients/_memory/__init__.py
================================================
from ._dataset_client import MemoryDatasetClient
from ._key_value_store_client import MemoryKeyValueStoreClient
from ._request_queue_client import MemoryRequestQueueClient
from ._storage_client import MemoryStorageClient

# Names exported by this package: the three in-memory storage clients
# and the storage client class imported alongside them above.
__all__ = [
    'MemoryDatasetClient',
    'MemoryKeyValueStoreClient',
    'MemoryRequestQueueClient',
    'MemoryStorageClient',
]


================================================
FILE: src/crawlee/storage_clients/_memory/_dataset_client.py
================================================
from __future__ import annotations

from datetime import datetime, timezone
from logging import getLogger
from typing import TYPE_CHECKING, Any

from typing_extensions import Self, override

from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import DatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

logger = getLogger(__name__)


class MemoryDatasetClient(DatasetClient):
    """Memory implementation of the dataset client.

    This client stores dataset items in memory using Python lists and dictionaries. No data is persisted
    between process runs, meaning all stored data is lost when the program terminates. This implementation
    is primarily useful for testing, development, and short-lived crawler operations where persistent
    storage is not required.

    The memory implementation provides fast access to data but is limited by available memory and
    does not support data sharing across different processes. It supports all dataset operations including
    sorting, filtering, and pagination, but performs them entirely in memory.
    """

    def __init__(
        self,
        *,
        metadata: DatasetMetadata,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `MemoryDatasetClient.open` class method to create a new instance.

        Args:
            metadata: Metadata object describing this dataset; it is kept and updated in place.
        """
        self._metadata = metadata

        self._records = list[dict[str, Any]]()
        """List to hold dataset items. Each item is a dictionary representing a record."""

    @override
    async def get_metadata(self) -> DatasetMetadata:
        return self._metadata

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
    ) -> Self:
        """Open or create a new memory dataset client.

        This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory
        datasets don't check for existing datasets with the same name or ID since all data exists only in memory
        and is lost when the process terminates.

        Alias does not have any effect on the memory storage client implementation, because unnamed storages
        are supported by default, since data are not persisted.

        Args:
            id: The ID of the dataset. If not provided, a random ID will be generated.
            name: The name of the dataset for named (global scope) storages.
            alias: The alias of the dataset for unnamed (run scope) storages.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If both name and alias are provided, or if neither id, name, nor alias is provided.
        """
        # Validate input parameters.
        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

        # Create a new dataset
        dataset_id = id or crypto_random_object_id()
        now = datetime.now(timezone.utc)

        metadata = DatasetMetadata(
            id=dataset_id,
            name=name,
            created_at=now,
            accessed_at=now,
            modified_at=now,
            item_count=0,
        )

        return cls(metadata=metadata)

    @override
    async def drop(self) -> None:
        # There is nothing outside of memory to delete, so dropping only empties the records
        # and resets the item count.
        self._records.clear()
        await self._update_metadata(
            update_accessed_at=True,
            update_modified_at=True,
            new_item_count=0,
        )

    @override
    async def purge(self) -> None:
        # Remove all records and reset the item count.
        self._records.clear()
        await self._update_metadata(
            update_accessed_at=True,
            update_modified_at=True,
            new_item_count=0,
        )

    @override
    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
        metadata = await self.get_metadata()
        new_item_count = metadata.item_count

        # A list is stored item by item; anything else is stored as a single item.
        if isinstance(data, list):
            for item in data:
                new_item_count += 1
                await self._push_item(item)
        else:
            new_item_count += 1
            await self._push_item(data)

        await self._update_metadata(
            update_accessed_at=True,
            update_modified_at=True,
            new_item_count=new_item_count,
        )

    @override
    async def get_data(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> DatasetItemsListPage:
        # Check for unsupported arguments and log a warning if found
        unsupported_args: dict[str, Any] = {
            'clean': clean,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_hidden': skip_hidden,
            'flatten': flatten,
            'view': view,
        }
        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}

        if unsupported:
            logger.warning(
                f'The arguments {list(unsupported.keys())} of get_data are not supported '
                f'by the {self.__class__.__name__} client.'
            )

        total = len(self._records)
        items = self._records.copy()

        # Apply skip_empty filter if requested
        if skip_empty:
            items = [item for item in items if item]

        # Apply sorting
        if desc:
            items = list(reversed(items))

        # Apply pagination
        sliced_items = items[offset : (offset + limit) if limit is not None else total]

        await self._update_metadata(update_accessed_at=True)

        # Report the limit that was actually requested. Only a missing limit (`None`) is replaced by the
        # number of items available from `offset` on; an explicit `limit=0` stays 0, and the substituted
        # value is never negative, even when `offset` lies beyond the end of the dataset.
        reported_limit = limit if limit is not None else max(total - offset, 0)

        return DatasetItemsListPage(
            count=len(sliced_items),
            offset=offset,
            limit=reported_limit,
            total=total,
            desc=desc,
            items=sliced_items,
        )

    @override
    async def iterate_items(
        self,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
    ) -> AsyncIterator[dict[str, Any]]:
        # Check for unsupported arguments and log a warning if found
        unsupported_args: dict[str, Any] = {
            'clean': clean,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_hidden': skip_hidden,
        }
        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}

        if unsupported:
            logger.warning(
                f'The arguments {list(unsupported.keys())} of iterate are not supported '
                f'by the {self.__class__.__name__} client.'
            )

        items = self._records.copy()

        # Apply sorting
        if desc:
            items = list(reversed(items))

        # Apply pagination
        sliced_items = items[offset : (offset + limit) if limit is not None else len(items)]

        # Yield items one by one. Empty items are skipped after pagination, so fewer than
        # `limit` items may be yielded when `skip_empty` is set.
        for item in sliced_items:
            if skip_empty and not item:
                continue
            yield item

        await self._update_metadata(update_accessed_at=True)

    async def _update_metadata(
        self,
        *,
        new_item_count: int | None = None,
        update_accessed_at: bool = False,
        update_modified_at: bool = False,
    ) -> None:
        """Update the dataset metadata with current information.

        Args:
            new_item_count: If provided, update the item count to this value.
            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.
            update_modified_at: If True, update the `modified_at` timestamp to the current time.
        """
        now = datetime.now(timezone.utc)

        if update_accessed_at:
            self._metadata.accessed_at = now
        if update_modified_at:
            self._metadata.modified_at = now
        if new_item_count is not None:
            self._metadata.item_count = new_item_count

    async def _push_item(self, item: dict[str, Any]) -> None:
        """Push a single item to the dataset.

        Args:
            item: The data item to add to the dataset.
        """
        self._records.append(item)


================================================
FILE: src/crawlee/storage_clients/_memory/_key_value_store_client.py
================================================
from __future__ import annotations

import sys
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any

from typing_extensions import Self, override

from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.file import infer_mime_type
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import KeyValueStoreClient
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata

if TYPE_CHECKING:
    from collections.abc import AsyncIterator


class MemoryKeyValueStoreClient(KeyValueStoreClient):
    """Key-value store client that keeps every record in process memory.

    Records are held in a Python dictionary owned by the client instance. Nothing is persisted
    between process runs, so all stored data is lost when the program terminates. This makes
    the client suitable mainly for tests, development, and short-lived crawler runs that do not
    need persistence.

    Access is fast, but capacity is bounded by available memory and the data cannot be shared
    with other processes.
    """

    def __init__(
        self,
        *,
        metadata: KeyValueStoreMetadata,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `MemoryKeyValueStoreClient.open` class method to create a new instance.
        """
        # Key -> record mapping holding everything stored through this client.
        self._records: dict[str, KeyValueStoreRecord] = {}

        self._metadata = metadata

    @override
    async def get_metadata(self) -> KeyValueStoreMetadata:
        return self._metadata

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
    ) -> Self:
        """Open or create a new memory key-value store client.

        A fresh in-memory store is created on every call. Unlike persistent storage
        implementations, no lookup of an existing store with the same name or ID takes place,
        because the data only ever lives in memory and disappears with the process.

        The alias has no effect for this implementation: unnamed storages are supported by
        default, since data are not persisted.

        Args:
            id: The ID of the key-value store. If not provided, a random ID will be generated.
            name: The name of the key-value store for named (global scope) storages.
            alias: The alias of the key-value store for unnamed (run scope) storages.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If both name and alias are provided.
        """
        # Reject invalid combinations of identifying arguments up front.
        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

        # All three timestamps of a brand-new store share the same creation instant.
        created = datetime.now(timezone.utc)

        return cls(
            metadata=KeyValueStoreMetadata(
                id=id or crypto_random_object_id(),
                name=name,
                created_at=created,
                accessed_at=created,
                modified_at=created,
            )
        )

    @override
    async def drop(self) -> None:
        self._records.clear()
        await self._update_metadata(update_accessed_at=True, update_modified_at=True)

    @override
    async def purge(self) -> None:
        self._records.clear()
        await self._update_metadata(update_accessed_at=True, update_modified_at=True)

    @override
    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
        # A read counts as an access even when the key turns out to be missing.
        await self._update_metadata(update_accessed_at=True)

        # Unknown keys yield None.
        return self._records.get(key)

    @override
    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
        # An explicit (truthy) content type wins; otherwise it is inferred from the value.
        self._records[key] = KeyValueStoreRecord(
            key=key,
            value=value,
            content_type=content_type or infer_mime_type(value),
            size=sys.getsizeof(value),
        )

        await self._update_metadata(update_accessed_at=True, update_modified_at=True)

    @override
    async def delete_value(self, *, key: str) -> None:
        # Deleting an unknown key is a no-op and leaves the timestamps untouched.
        if key not in self._records:
            return

        self._records.pop(key)
        await self._update_metadata(update_accessed_at=True, update_modified_at=True)

    @override
    async def iterate_keys(
        self,
        *,
        exclusive_start_key: str | None = None,
        limit: int | None = None,
    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
        await self._update_metadata(update_accessed_at=True)

        # Keys are served in sorted order.
        ordered_keys = sorted(self._records)

        # Keep only keys strictly greater than the exclusive start key.
        if exclusive_start_key is not None:
            ordered_keys = [candidate for candidate in ordered_keys if candidate > exclusive_start_key]

        # Truncate to the requested number of keys, if a limit was given.
        selected_keys = ordered_keys if limit is None else ordered_keys[:limit]

        for selected in selected_keys:
            entry = self._records[selected]
            yield KeyValueStoreRecordMetadata(
                key=selected,
                content_type=entry.content_type,
                size=entry.size,
            )

    @override
    async def get_public_url(self, *, key: str) -> str:
        raise NotImplementedError('Public URLs are not supported for memory key-value stores.')

    @override
    async def record_exists(self, *, key: str) -> bool:
        await self._update_metadata(update_accessed_at=True)
        return key in self._records

    async def _update_metadata(
        self,
        *,
        update_accessed_at: bool = False,
        update_modified_at: bool = False,
    ) -> None:
        """Refresh the requested timestamps in the store metadata.

        One UTC timestamp is taken per call and used for every field being updated.

        Args:
            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.
            update_modified_at: If True, update the `modified_at` timestamp to the current time.
        """
        metadata = self._metadata
        timestamp = datetime.now(timezone.utc)

        if update_modified_at:
            metadata.modified_at = timestamp
        if update_accessed_at:
            metadata.accessed_at = timestamp


================================================
FILE: src/crawlee/storage_clients/_memory/_request_queue_client.py
================================================
from __future__ import annotations

from collections import deque
from contextlib import suppress
from datetime import datetime, timezone
from logging import getLogger
from typing import TYPE_CHECKING

from typing_extensions import Self, override

from crawlee import Request
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import RequestQueueClient
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata

if TYPE_CHECKING:
    from collections.abc import Sequence

logger = getLogger(__name__)


class MemoryRequestQueueClient(RequestQueueClient):
    """Memory implementation of the request queue client.

    No data is persisted between process runs, which means all requests are lost when the program terminates.
    This implementation is primarily useful for testing, development, and short-lived crawler runs where
    persistence is not required.

    This client provides fast access to request data but is limited by available memory and does not support
    data sharing across different processes.
    """

    def __init__(
        self,
        *,
        metadata: RequestQueueMetadata,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `MemoryRequestQueueClient.open` class method to create a new instance.
        """
        self._metadata = metadata

        self._pending_requests = deque[Request]()
        """Pending requests are those that have been added to the queue but not yet fetched for processing."""

        self._handled_requests = dict[str, Request]()
        """Handled requests are those that have been processed and marked as handled."""

        self._in_progress_requests = dict[str, Request]()
        """In-progress requests are those that have been fetched but not yet marked as handled or reclaimed."""

        self._requests_by_unique_key = dict[str, Request]()
        """Unique key -> Request mapping for fast lookup by unique key."""

    @override
    async def get_metadata(self) -> RequestQueueMetadata:
        return self._metadata

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
    ) -> Self:
        """Open or create a new memory request queue client.

        This method creates a new in-memory request queue instance. Unlike persistent storage implementations,
        memory queues don't check for existing queues with the same name or ID since all data exists only
        in memory and is lost when the process terminates.

        Alias does not have any effect on the memory storage client implementation, because unnamed storages
        are supported by default, since data are not persisted.

        Args:
            id: The ID of the request queue. If not provided, a random ID will be generated.
            name: The name of the request queue for named (global scope) storages.
            alias: The alias of the request queue for unnamed (run scope) storages.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If both name and alias are provided.
        """
        # Validate input parameters.
        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

        # Create a new queue
        queue_id = id or crypto_random_object_id()
        now = datetime.now(timezone.utc)

        metadata = RequestQueueMetadata(
            id=queue_id,
            name=name,
            created_at=now,
            accessed_at=now,
            modified_at=now,
            had_multiple_clients=False,
            handled_request_count=0,
            pending_request_count=0,
            total_request_count=0,
        )

        return cls(metadata=metadata)

    @override
    async def drop(self) -> None:
        self._pending_requests.clear()
        self._handled_requests.clear()
        self._requests_by_unique_key.clear()
        self._in_progress_requests.clear()

        await self._update_metadata(
            update_modified_at=True,
            update_accessed_at=True,
            new_handled_request_count=0,
            new_pending_request_count=0,
            new_total_request_count=0,
        )

    @override
    async def purge(self) -> None:
        self._pending_requests.clear()
        self._handled_requests.clear()
        self._requests_by_unique_key.clear()
        self._in_progress_requests.clear()

        await self._update_metadata(
            update_modified_at=True,
            update_accessed_at=True,
            new_pending_request_count=0,
            new_handled_request_count=0,
            new_total_request_count=0,
        )

    @override
    async def add_batch_of_requests(
        self,
        requests: Sequence[Request],
        *,
        forefront: bool = False,
    ) -> AddRequestsResponse:
        """Add requests to the queue, deduplicating by `unique_key`.

        Exactly one `ProcessedRequest` is reported per input request.

        Args:
            requests: The requests to add.
            forefront: If True, new requests are placed at the front of the pending queue, and
                already-pending requests are moved to the front.

        Returns:
            The response listing the processed requests; `unprocessed_requests` is always empty.
        """
        processed_requests: list[ProcessedRequest] = []
        for request in requests:
            # Check if the request is already in the queue by unique_key.
            existing_request = self._requests_by_unique_key.get(request.unique_key)

            was_already_present = existing_request is not None
            was_already_handled = existing_request is not None and existing_request.handled_at is not None
            is_in_progress = request.unique_key in self._in_progress_requests

            # If the request is already in the queue and handled, don't add it again.
            if was_already_handled:
                processed_requests.append(
                    ProcessedRequest(
                        unique_key=request.unique_key,
                        was_already_present=True,
                        was_already_handled=True,
                    )
                )
                continue

            # If the request is already in progress, don't add it again.
            if is_in_progress:
                processed_requests.append(
                    ProcessedRequest(
                        unique_key=request.unique_key,
                        was_already_present=True,
                        was_already_handled=False,
                    )
                )
                continue

            # If the request is already in the queue but not handled, update it.
            if existing_request is not None:
                # Update indexes.
                self._requests_by_unique_key[request.unique_key] = request

                # We only update `forefront` by updating its position by shifting it to the left.
                if forefront:
                    # Remove the old request from the pending queue if it's there.
                    with suppress(ValueError):
                        self._pending_requests.remove(existing_request)

                    # Add updated request back to queue.
                    self._pending_requests.appendleft(request)

            # Add the new request to the queue.
            else:
                if forefront:
                    self._pending_requests.appendleft(request)
                else:
                    self._pending_requests.append(request)

                # Update indexes.
                self._requests_by_unique_key[request.unique_key] = request

                await self._update_metadata(
                    new_total_request_count=self._metadata.total_request_count + 1,
                    new_pending_request_count=self._metadata.pending_request_count + 1,
                )

            # Single result entry shared by the "updated" and "newly added" paths. Appending here
            # only (and not inside the branches) keeps one entry per input request.
            processed_requests.append(
                ProcessedRequest(
                    unique_key=request.unique_key,
                    was_already_present=was_already_present,
                    was_already_handled=False,
                )
            )

        await self._update_metadata(update_accessed_at=True, update_modified_at=True)

        return AddRequestsResponse(
            processed_requests=processed_requests,
            unprocessed_requests=[],
        )

    @override
    async def fetch_next_request(self) -> Request | None:
        """Pop the next pending request and mark it as in progress.

        Returns:
            The next request to process, or None when no pending request is available.
        """
        while self._pending_requests:
            request = self._pending_requests.popleft()

            # Skip if already handled (shouldn't happen, but safety check).
            if request.was_already_handled:
                continue

            # Skip if already in progress (shouldn't happen, but safety check).
            if request.unique_key in self._in_progress_requests:
                continue

            # Mark as in progress.
            self._in_progress_requests[request.unique_key] = request
            return request

        return None

    @override
    async def get_request(self, unique_key: str) -> Request | None:
        await self._update_metadata(update_accessed_at=True)
        return self._requests_by_unique_key.get(unique_key)

    @override
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Move an in-progress request to the handled set.

        Returns:
            The processed request info, or None if the request is not currently in progress.
        """
        # Check if the request is in progress.
        if request.unique_key not in self._in_progress_requests:
            return None

        # Set handled_at timestamp if not already set.
        if not request.was_already_handled:
            request.handled_at = datetime.now(timezone.utc)

        # Move request to handled storage.
        self._handled_requests[request.unique_key] = request

        # Update index (keep the request in indexes for get_request to work).
        self._requests_by_unique_key[request.unique_key] = request

        # Remove from in-progress.
        del self._in_progress_requests[request.unique_key]

        # Update metadata.
        await self._update_metadata(
            new_handled_request_count=self._metadata.handled_request_count + 1,
            new_pending_request_count=self._metadata.pending_request_count - 1,
            update_modified_at=True,
        )

        return ProcessedRequest(
            unique_key=request.unique_key,
            was_already_present=True,
            was_already_handled=True,
        )

    @override
    async def reclaim_request(
        self,
        request: Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Return an in-progress request to the pending queue.

        Args:
            request: The request to reclaim.
            forefront: If True, put the request at the front of the pending queue.

        Returns:
            The processed request info, or None if the request is not currently in progress.
        """
        # Check if the request is in progress.
        if request.unique_key not in self._in_progress_requests:
            return None

        # Remove from in-progress.
        del self._in_progress_requests[request.unique_key]

        # Add request back to pending queue.
        if forefront:
            self._pending_requests.appendleft(request)
        else:
            self._pending_requests.append(request)

        # Update metadata timestamps.
        await self._update_metadata(update_modified_at=True)

        return ProcessedRequest(
            unique_key=request.unique_key,
            was_already_present=True,
            was_already_handled=False,
        )

    @override
    async def is_empty(self) -> bool:
        """Check if the queue is empty.

        Returns:
            True if the queue is empty, False otherwise.
        """
        await self._update_metadata(update_accessed_at=True)

        # Queue is empty if there are no pending requests and no requests in progress.
        return len(self._pending_requests) == 0 and len(self._in_progress_requests) == 0

    async def _update_metadata(
        self,
        *,
        update_accessed_at: bool = False,
        update_modified_at: bool = False,
        new_handled_request_count: int | None = None,
        new_pending_request_count: int | None = None,
        new_total_request_count: int | None = None,
    ) -> None:
        """Update the request queue metadata with current information.

        Args:
            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.
            update_modified_at: If True, update the `modified_at` timestamp to the current time.
            new_handled_request_count: If provided, set the handled request count to this value.
            new_pending_request_count: If provided, set the pending request count to this value.
            new_total_request_count: If provided, set the total request count to this value.
        """
        now = datetime.now(timezone.utc)

        if update_accessed_at:
            self._metadata.accessed_at = now
        if update_modified_at:
            self._metadata.modified_at = now
        if new_handled_request_count is not None:
            self._metadata.handled_request_count = new_handled_request_count
        if new_pending_request_count is not None:
            self._metadata.pending_request_count = new_pending_request_count
        if new_total_request_count is not None:
            self._metadata.total_request_count = new_total_request_count


================================================
FILE: src/crawlee/storage_clients/_memory/_storage_client.py
================================================
from __future__ import annotations

from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee.configuration import Configuration
from crawlee.storage_clients._base import StorageClient

from ._dataset_client import MemoryDatasetClient
from ._key_value_store_client import MemoryKeyValueStoreClient
from ._request_queue_client import MemoryRequestQueueClient


@docs_group('Storage clients')
class MemoryStorageClient(StorageClient):
    """Storage client whose datasets, key-value stores, and request queues live purely in memory.

    Every storage created through this client keeps its data in Python data structures (lists and
    dictionaries). Nothing is persisted between process runs, so all stored data is lost when the
    program terminates, and no disk operations take place.

    Data access is fast, but the amount of data is limited by available memory and cannot be shared
    across different processes.

    This makes the client a good fit for testing and development environments, or for short-lived
    crawler operations where persistence is not required.
    """

    @override
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> MemoryDatasetClient:
        # Fall back to the global configuration when the caller did not supply one.
        effective_configuration = configuration or Configuration.get_global_configuration()

        dataset_client = await MemoryDatasetClient.open(id=id, name=name, alias=alias)
        await self._purge_if_needed(dataset_client, effective_configuration)

        return dataset_client

    @override
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> MemoryKeyValueStoreClient:
        # Fall back to the global configuration when the caller did not supply one.
        effective_configuration = configuration or Configuration.get_global_configuration()

        kvs_client = await MemoryKeyValueStoreClient.open(id=id, name=name, alias=alias)
        await self._purge_if_needed(kvs_client, effective_configuration)

        return kvs_client

    @override
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> MemoryRequestQueueClient:
        # Fall back to the global configuration when the caller did not supply one.
        effective_configuration = configuration or Configuration.get_global_configuration()

        rq_client = await MemoryRequestQueueClient.open(id=id, name=name, alias=alias)
        await self._purge_if_needed(rq_client, effective_configuration)

        return rq_client


================================================
FILE: src/crawlee/storage_clients/_memory/py.typed
================================================


================================================
FILE: src/crawlee/storage_clients/_redis/__init__.py
================================================
from ._dataset_client import RedisDatasetClient
from ._key_value_store_client import RedisKeyValueStoreClient
from ._request_queue_client import RedisRequestQueueClient
from ._storage_client import RedisStorageClient

__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']


================================================
FILE: src/crawlee/storage_clients/_redis/_client_mixin.py
================================================
from __future__ import annotations

import asyncio
from contextlib import asynccontextmanager
from datetime import datetime, timezone
from logging import getLogger
from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, overload

from crawlee._utils.crypto import crypto_random_object_id

from ._utils import await_redis_response, read_lua_script

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from redis.asyncio import Redis
    from redis.asyncio.client import Pipeline
    from redis.commands.core import AsyncScript
    from typing_extensions import NotRequired, Self

    from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata


logger = getLogger(__name__)


class MetadataUpdateParams(TypedDict, total=False):
    """Keyword arguments accepted when updating the common storage metadata.

    Both keys are optional flags selecting which timestamp to refresh.
    """

    update_modified_at: NotRequired[bool]
    update_accessed_at: NotRequired[bool]


class RedisClientMixin:
    """Mixin class for Redis clients.

    This mixin provides common Redis operations and basic methods for Redis storage clients.

    Keys used by this mixin (with `{main}` being `_MAIN_KEY`):
    - `{main}:{name}:metadata` - JSON document with the storage metadata.
    - `{main}:{name}:created_signal` - list used to signal that a storage has been created.
    - `{main}:id_to_name` / `{main}:name_to_id` - hashes shared by all storages of this type.
    """

    _DEFAULT_NAME = 'default'
    """Default storage name in key prefix when none provided."""

    _MAIN_KEY: ClassVar[str]
    """Main Redis key prefix for this storage type."""

    _CLIENT_TYPE: ClassVar[str]
    """Human-readable client type for error messages."""

    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:
        self._storage_name = storage_name
        self._storage_id = storage_id
        self._redis = redis

        self._scripts_loaded = False

    @property
    def redis(self) -> Redis:
        """Return the Redis client instance."""
        return self._redis

    @property
    def metadata_key(self) -> str:
        """Return the Redis key for the metadata of this storage."""
        return f'{self._MAIN_KEY}:{self._storage_name}:metadata'

    @classmethod
    async def _get_metadata_by_name(cls, name: str, redis: Redis, *, with_wait: bool = False) -> dict | None:
        """Retrieve metadata by storage name.

        Args:
            name: The name of the storage.
            redis: The Redis client instance.
            with_wait: Whether to wait for the storage to be created if it doesn't exist.

        Returns:
            The metadata dictionary, or None if no metadata is stored under the name.

        Raises:
            TypeError: If the stored metadata is not a dictionary.
        """
        if with_wait:
            signal_key = f'{cls._MAIN_KEY}:{name}:created_signal'
            # Wait for the creation signal (max 30 seconds)
            await await_redis_response(redis.blpop([signal_key], timeout=30))
            # Signal consumed, push it back for other waiters
            # NOTE(review): the signal is pushed even when the wait above timed out - confirm this is intended.
            await await_redis_response(redis.lpush(signal_key, 1))

        response = await await_redis_response(redis.json().get(f'{cls._MAIN_KEY}:{name}:metadata'))
        # The response may arrive wrapped in a list; unwrap the first element in that case.
        data = response[0] if response is not None and isinstance(response, list) else response
        if data is not None and not isinstance(data, dict):
            raise TypeError('The metadata data was received in an incorrect format.')
        return data

    @classmethod
    async def _get_metadata_name_by_id(cls, id: str, redis: Redis) -> str | None:
        """Retrieve storage name by ID from id_to_name index.

        Args:
            id: The ID of the storage.
            redis: The Redis client instance.

        Returns:
            The storage name, or None if the ID is unknown or the value has an unexpected type.
        """
        name = await await_redis_response(redis.hget(f'{cls._MAIN_KEY}:id_to_name', id))
        if isinstance(name, str) or name is None:
            return name
        if isinstance(name, bytes):
            return name.decode('utf-8')
        return None

    @classmethod
    async def _open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
        redis: Redis,
        extra_metadata_fields: dict[str, Any],
        instance_kwargs: dict[str, Any],
    ) -> Self:
        """Open or create a new Redis storage client.

        Args:
            id: The ID of the storage. If not provided, a random ID will be generated.
            name: The name of the storage for named (global scope) storages.
            alias: The alias of the storage for unnamed (run scope) storages.
            redis: Redis client instance.
            metadata_model: Pydantic model for metadata validation.
            extra_metadata_fields: Storage-specific metadata fields.
            instance_kwargs: Additional arguments for the client constructor.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If `id` is provided but no storage with that ID exists.
        """
        internal_name = name or alias or cls._DEFAULT_NAME
        storage_id: str | None = None
        # Determine if storage exists by ID or name
        if id:
            storage_name = await cls._get_metadata_name_by_id(id=id, redis=redis)
            storage_id = id
            if storage_name is None:
                raise ValueError(f'{cls._CLIENT_TYPE} with ID "{id}" does not exist.')
        else:
            metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis)
            storage_name = internal_name if metadata_data is not None else None
            storage_id = metadata_data['id'] if metadata_data is not None else None
        # If both storage_name and storage_id are found, open existing storage
        if storage_name and storage_id:
            client = cls(storage_name=storage_name, storage_id=storage_id, redis=redis, **instance_kwargs)
            async with client._get_pipeline() as pipe:
                await client._update_metadata(pipe, update_accessed_at=True)
        # Otherwise, create a new storage
        else:
            now = datetime.now(timezone.utc)
            metadata = metadata_model(
                id=crypto_random_object_id(),
                name=name,
                created_at=now,
                accessed_at=now,
                modified_at=now,
                **extra_metadata_fields,
            )
            client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis, **instance_kwargs)
            created = await client._create_metadata_and_storage(internal_name, metadata.model_dump())
            # The client was probably not created due to a race condition. Let's try to open it using the name.
            if not created:
                metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis, with_wait=True)
                # Use the ID that was actually persisted by the winning creator. The locally generated
                # `metadata.id` was never stored; it is only kept as a fallback when no metadata could be read.
                existing_id = metadata_data['id'] if metadata_data is not None else metadata.id
                client = cls(storage_name=internal_name, storage_id=existing_id, redis=redis, **instance_kwargs)

        # Ensure Lua scripts are loaded
        await client._ensure_scripts_loaded()
        return client

    async def _load_scripts(self) -> None:
        """Load Lua scripts in Redis.

        The base implementation does nothing.
        """
        return

    async def _ensure_scripts_loaded(self) -> None:
        """Ensure Lua scripts are loaded in Redis."""
        if not self._scripts_loaded:
            await self._load_scripts()
            self._scripts_loaded = True

    @asynccontextmanager
    async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pipeline]:
        """Create a new Redis pipeline.

        Args:
            with_execute: If True, `execute()` is called on the pipeline when the context exits
                (this happens in a `finally` block, so also when the body raised).
        """
        async with self._redis.pipeline() as pipe:
            try:
                pipe.multi()
                yield pipe
            finally:
                if with_execute:
                    await pipe.execute()

    async def _create_storage(self, pipeline: Pipeline) -> None:
        """Create the actual storage structure in Redis."""

    async def _create_script(self, script_name: str) -> AsyncScript:
        """Load a Lua script from a file and return a Script object."""
        # Reading the script is delegated to a worker thread.
        script_content = await asyncio.to_thread(read_lua_script, script_name)

        return self._redis.register_script(script_content)

    async def _create_metadata_and_storage(self, storage_name: str, metadata: dict) -> bool:
        """Register the storage in the indexes and create its metadata and structure.

        Args:
            storage_name: The name under which the storage is registered.
            metadata: The metadata dictionary; its `created_at`, `accessed_at` and `modified_at`
                values must provide `isoformat()`. The dictionary itself is not modified.

        Returns:
            True if the storage was created, False if the name was already registered.
        """
        index_id_to_name = f'{self._MAIN_KEY}:id_to_name'
        index_name_to_id = f'{self._MAIN_KEY}:name_to_id'

        # Serialize timestamps on a copy so the caller's dictionary is left untouched.
        serialized = dict(metadata)
        for field in ('created_at', 'accessed_at', 'modified_at'):
            serialized[field] = serialized[field].isoformat()

        # Try to create name_to_id index entry, if it already exists, return False.
        name_to_id = await await_redis_response(self._redis.hsetnx(index_name_to_id, storage_name, serialized['id']))
        # If name already exists, return False. Probably an attempt at parallel creation.
        if not name_to_id:
            return False

        # Create id_to_name index entry, metadata, and storage structure in a transaction.
        async with self._get_pipeline() as pipe:
            await await_redis_response(pipe.hsetnx(index_id_to_name, serialized['id'], storage_name))
            await await_redis_response(pipe.json().set(self.metadata_key, '$', serialized))
            await await_redis_response(pipe.lpush(f'{self._MAIN_KEY}:{storage_name}:created_signal', 1))

            await self._create_storage(pipe)

        return True

    async def _drop(self, extra_keys: list[str]) -> None:
        """Delete this storage's metadata, index entries, creation signal and extra keys.

        Args:
            extra_keys: Additional Redis keys belonging to this storage that should be deleted.
        """
        async with self._get_pipeline() as pipe:
            await pipe.delete(self.metadata_key)
            # The indexes are hashes shared by all storages of this type: remove only this storage's
            # field with HDEL. DEL would drop the whole hash (and treat the id/name as a key).
            await await_redis_response(pipe.hdel(f'{self._MAIN_KEY}:id_to_name', self._storage_id))
            await await_redis_response(pipe.hdel(f'{self._MAIN_KEY}:name_to_id', self._storage_name))
            await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:created_signal')
            for key in extra_keys:
                await pipe.delete(key)

    async def _purge(self, extra_keys: list[str], metadata_kwargs: MetadataUpdateParams) -> None:
        """Delete the given keys, update metadata and re-create the storage structure.

        Args:
            extra_keys: Redis keys holding the storage data that should be deleted.
            metadata_kwargs: Keyword arguments forwarded to `_update_metadata`.
        """
        async with self._get_pipeline() as pipe:
            for key in extra_keys:
                await pipe.delete(key)
            await self._update_metadata(pipe, **metadata_kwargs)
            await self._create_storage(pipe)

    @overload
    async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ...
    @overload
    async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ...
    @overload
    async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ...

    async def _get_metadata(
        self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata]
    ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:
        """Retrieve client metadata.

        The metadata is read first and `accessed_at` is updated afterwards, so the returned model
        carries the values from before this access.

        Raises:
            ValueError: If no metadata is stored for this storage name.
        """
        metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis)
        if metadata_dict is None:
            raise ValueError(f'{self._CLIENT_TYPE} with name "{self._storage_name}" does not exist.')
        async with self._get_pipeline() as pipe:
            await self._update_metadata(pipe, update_accessed_at=True)

        return metadata_model.model_validate(metadata_dict)

    async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> None:
        """Pipeline operations storage-specific metadata updates.

        Must be implemented by concrete classes.

        Args:
            pipeline: The Redis pipeline to use for the update.
            **kwargs: Storage-specific update parameters.
        """

    async def _update_metadata(
        self,
        pipeline: Pipeline,
        *,
        update_accessed_at: bool = False,
        update_modified_at: bool = False,
        **kwargs: Any,
    ) -> None:
        """Update storage metadata combining common and specific fields.

        Args:
            pipeline: The Redis pipeline to use for the update.
            update_accessed_at: Whether to update accessed_at timestamp.
            update_modified_at: Whether to update modified_at timestamp.
            **kwargs: Additional arguments for _specific_update_metadata.
        """
        now = datetime.now(timezone.utc)

        # `xx=True` only overwrites an existing path, so a missing metadata document is not created here.
        if update_accessed_at:
            await await_redis_response(
                pipeline.json().set(self.metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True)
            )
        if update_modified_at:
            await await_redis_response(
                pipeline.json().set(self.metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True)
            )

        await self._specific_update_metadata(pipeline, **kwargs)


================================================
FILE: src/crawlee/storage_clients/_redis/_dataset_client.py
================================================
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING, Any, cast

from typing_extensions import NotRequired, override

from crawlee.storage_clients._base import DatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

from ._client_mixin import MetadataUpdateParams, RedisClientMixin
from ._utils import await_redis_response

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from redis.asyncio import Redis
    from redis.asyncio.client import Pipeline

logger = getLogger(__name__)


class _DatasetMetadataUpdateParams(MetadataUpdateParams):
    """Keyword arguments accepted by the dataset client's metadata update.

    Extends the shared `MetadataUpdateParams` with the item-count options consumed by
    `RedisDatasetClient._specific_update_metadata`. Both keys are optional.
    """

    # Absolute value to store in `item_count`; wins over `delta_item_count` when both are supplied.
    new_item_count: NotRequired[int]
    # Amount by which the stored `item_count` is incremented.
    delta_item_count: NotRequired[int]


class RedisDatasetClient(DatasetClient, RedisClientMixin):
    """Redis implementation of the dataset client.

    This client persists dataset items to Redis using a JSON array for efficient storage and retrieval.
    Items are appended to the array, so insertion order is preserved.

    The dataset data is stored in Redis using the following key pattern:
    - `datasets:{name}:items` - Redis JSON array containing all dataset items.
    - `datasets:{name}:metadata` - Redis JSON object containing dataset metadata.

    Items must be JSON-serializable dictionaries. Single items or lists of items can be pushed to the dataset.
    Write operations and their metadata updates are issued together through a Redis pipeline.
    """

    _DEFAULT_NAME = 'default'
    """Default Dataset name key prefix when none provided."""

    _MAIN_KEY = 'datasets'
    """Main Redis key prefix for Dataset."""

    _CLIENT_TYPE = 'Dataset'
    """Human-readable client type for error messages."""

    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:
        """Initialize a new instance.

        Preferably use the `RedisDatasetClient.open` class method to create a new instance.

        Args:
            storage_name: Internal storage name used for Redis keys.
            storage_id: Unique identifier for the dataset.
            redis: Redis client instance.
        """
        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)

    @property
    def _items_key(self) -> str:
        """Return the Redis key for the items of this dataset."""
        return f'{self._MAIN_KEY}:{self._storage_name}:items'

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        redis: Redis,
    ) -> RedisDatasetClient:
        """Open or create a new Redis dataset client.

        This method attempts to open an existing dataset from the Redis database. If a dataset with the specified
        ID or name exists, it loads the metadata from the database. If no existing dataset is found, a new one
        is created.

        Args:
            id: The ID of the dataset. If not provided, a random ID will be generated.
            name: The name of the dataset for named (global scope) storages.
            alias: The alias of the dataset for unnamed (run scope) storages.
            redis: Redis client instance.

        Returns:
            An instance for the opened or created storage client.
        """
        return await cls._open(
            id=id,
            name=name,
            alias=alias,
            redis=redis,
            metadata_model=DatasetMetadata,
            extra_metadata_fields={'item_count': 0},
            instance_kwargs={},
        )

    @override
    async def get_metadata(self) -> DatasetMetadata:
        return await self._get_metadata(DatasetMetadata)

    @override
    async def drop(self) -> None:
        await self._drop(extra_keys=[self._items_key])

    @override
    async def purge(self) -> None:
        await self._purge(
            extra_keys=[self._items_key],
            metadata_kwargs=_DatasetMetadataUpdateParams(
                new_item_count=0, update_accessed_at=True, update_modified_at=True
            ),
        )

    @override
    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
        """Append one item or a list of items to the dataset and update its metadata.

        Args:
            data: A single item or a list of items. An empty list only refreshes the timestamps.
        """
        if isinstance(data, dict):
            data = [data]

        async with self._get_pipeline() as pipe:
            # JSON.ARRAPPEND requires at least one value; sending it for an empty list would make
            # the whole pipeline fail, so the append is skipped when there is nothing to add.
            if data:
                pipe.json().arrappend(self._items_key, '$', *data)
            await self._update_metadata(
                pipe,
                **_DatasetMetadataUpdateParams(
                    update_accessed_at=True, update_modified_at=True, delta_item_count=len(data)
                ),
            )

    @staticmethod
    def _build_items_json_path(*, desc: bool, offset: int, limit: int | None) -> str:
        """Build the JSONPath slice that selects the requested page of the items array.

        For descending order the slice is expressed with negative indices counted from the end of
        the array; the caller reverses the fetched items afterwards.

        Args:
            desc: Whether the page is taken from the end of the array.
            offset: Number of items to skip (from the start, or from the end when `desc` is set).
            limit: Maximum number of items, or `None` for no upper bound.

        Returns:
            A JSONPath expression for the items array.
        """
        if limit == 0:
            # `-0` is just `0`, so a zero limit cannot be expressed with negative indices
            # (`$[-0:]` would select everything). Use an explicit empty slice instead.
            return '$[0:0]'

        if desc:
            start = '' if limit is None else f'-{offset + limit}'
            # A zero offset must leave the end open: `$[:-0]` would be the empty slice `$[:0]`.
            end = f'-{offset}' if offset else ''
        else:
            start = str(offset) if offset else ''
            end = '' if limit is None else str(offset + limit)

        if not start and not end:
            return '$[*]'
        return f'$[{start}:{end}]'

    @override
    async def get_data(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> DatasetItemsListPage:
        """Return one page of dataset items.

        Only `offset`, `limit`, `desc` and `skip_empty` are honoured by this client; a warning is
        logged for every other argument that is set.

        Args:
            offset: Number of items to skip.
            limit: Maximum number of items to return, or `None` for no limit.
            clean: Not supported by this client.
            desc: Return items from newest to oldest.
            fields: Not supported by this client.
            omit: Not supported by this client.
            unwind: Not supported by this client.
            skip_empty: Drop empty items from the result.
            skip_hidden: Not supported by this client.
            flatten: Not supported by this client.
            view: Not supported by this client.

        Returns:
            The requested page together with pagination information.
        """
        # Check for unsupported arguments and log a warning if found
        unsupported_args: dict[str, Any] = {
            'clean': clean,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_hidden': skip_hidden,
            'flatten': flatten,
            'view': view,
        }
        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}

        if unsupported:
            logger.warning(
                f'The arguments {list(unsupported.keys())} of get_data are not supported '
                f'by the {self.__class__.__name__} client.'
            )

        metadata = await self.get_metadata()
        total = metadata.item_count

        # Apply sorting and pagination through a JSONPath slice.
        json_path = self._build_items_json_path(desc=desc, offset=offset, limit=limit)

        data = await await_redis_response(self._redis.json().get(self._items_key, json_path))

        if data is None:
            data = []

        data = [item for item in data if isinstance(item, dict)]

        if skip_empty:
            data = [item for item in data if item]

        if desc:
            # The slice was taken in storage order; flip it to newest-first.
            data.reverse()

        async with self._get_pipeline() as pipe:
            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))

        return DatasetItemsListPage(
            count=len(data),
            offset=offset,
            limit=limit or (total - offset),
            total=total,
            desc=desc,
            items=data,
        )

    @override
    async def iterate_items(
        self,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
    ) -> AsyncIterator[dict[str, Any]]:
        """Iterate over dataset items one by one.

        Items are fetched from Redis in fixed-size batches and yielded individually, so the whole
        dataset is never loaded at once. Only `offset`, `limit`, `desc` and `skip_empty` are
        honoured; a warning is logged for every other argument that is set.
        """
        # Log warnings for unsupported arguments
        unsupported_args: dict[str, Any] = {
            'clean': clean,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_hidden': skip_hidden,
        }
        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}

        if unsupported:
            logger.warning(
                f'The arguments {list(unsupported.keys())} of iterate_items are not supported '
                f'by the {self.__class__.__name__} client.'
            )

        metadata = await self.get_metadata()
        total_items = metadata.item_count

        # Calculate actual range based on parameters
        start_idx = offset
        end_idx = min(total_items, offset + limit) if limit is not None else total_items

        # Update accessed_at timestamp
        async with self._get_pipeline() as pipe:
            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))

        # Process items in batches for better network efficiency
        batch_size = 100

        for batch_start in range(start_idx, end_idx, batch_size):
            batch_end = min(batch_start + batch_size, end_idx)

            # Build JsonPath for batch slice
            if desc:
                # For descending order the logical range is mirrored onto the stored array.
                desc_batch_start = total_items - batch_end
                desc_batch_end = total_items - batch_start
                json_path = f'$[{desc_batch_start}:{desc_batch_end}]'
            else:
                json_path = f'$[{batch_start}:{batch_end}]'

            # Get batch of items
            batch_items = await await_redis_response(self._redis.json().get(self._items_key, json_path))

            # Nothing stored under the key for this slice
            if batch_items is None:
                continue

            # Reverse batch if desc order (since we got items in normal order but need desc)
            items_iter = reversed(batch_items) if desc else iter(batch_items)

            # Yield items from batch
            for item in items_iter:
                # Apply skip_empty filter
                if skip_empty and not item:
                    continue

                yield cast('dict[str, Any]', item)

        async with self._get_pipeline() as pipe:
            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))

    @override
    async def _create_storage(self, pipeline: Pipeline) -> None:
        """Create the main dataset keys in Redis."""
        # Create an empty JSON array for items
        await await_redis_response(pipeline.json().set(self._items_key, '$', []))

    @override
    async def _specific_update_metadata(
        self,
        pipeline: Pipeline,
        *,
        new_item_count: int | None = None,
        delta_item_count: int | None = None,
        **_kwargs: Any,
    ) -> None:
        """Update the dataset metadata in the database.

        Args:
            pipeline: The Redis pipeline to use for the update.
            new_item_count: If provided, update the item count to this value.
            delta_item_count: If provided, increment the item count by this value. Ignored when
                `new_item_count` is also provided.
        """
        if new_item_count is not None:
            await await_redis_response(
                pipeline.json().set(self.metadata_key, '$.item_count', new_item_count, nx=False, xx=True)
            )
        elif delta_item_count is not None:
            await await_redis_response(pipeline.json().numincrby(self.metadata_key, '$.item_count', delta_item_count))


================================================
FILE: src/crawlee/storage_clients/_redis/_key_value_store_client.py
================================================
from __future__ import annotations

import json
from logging import getLogger
from typing import TYPE_CHECKING, Any

from typing_extensions import override

from crawlee._utils.file import infer_mime_type
from crawlee.storage_clients._base import KeyValueStoreClient
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata

from ._client_mixin import MetadataUpdateParams, RedisClientMixin
from ._utils import await_redis_response

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from redis.asyncio import Redis

logger = getLogger(__name__)


class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
    """Redis implementation of the key-value store client.

    This client persists key-value data to Redis using hash data structures for efficient storage and retrieval.
    Keys are mapped to values with automatic content type detection and size tracking for metadata management.

    The key-value store data is stored in Redis using the following key pattern:
    - `key_value_stores:{name}:items` - Redis hash containing key-value pairs (values stored as binary data).
    - `key_value_stores:{name}:metadata_items` - Redis hash containing metadata for each key.
    - `key_value_stores:{name}:metadata` - Redis JSON object containing store metadata.

    Values are serialized based on their type: JSON objects are stored as UTF-8 encoded JSON strings,
    text values as UTF-8 encoded strings, and binary data as-is. The implementation automatically handles
    content type detection and maintains metadata about each record including size and MIME type information.

    Writes to the value hash, the per-record metadata hash and the store metadata are issued together
    through a Redis pipeline.
    """

    _DEFAULT_NAME = 'default'
    """Default Key-Value Store name key prefix when none provided."""

    _MAIN_KEY = 'key_value_stores'
    """Main Redis key prefix for Key-Value Store."""

    _CLIENT_TYPE = 'Key-value store'
    """Human-readable client type for error messages."""

    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:
        """Initialize a new instance.

        Preferably use the `RedisKeyValueStoreClient.open` class method to create a new instance.

        Args:
            storage_name: Internal storage name used for Redis keys.
            storage_id: Unique identifier for the key-value store.
            redis: Redis client instance.
        """
        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)

    @property
    def _items_key(self) -> str:
        """Return the Redis key for the items of KVS."""
        return f'{self._MAIN_KEY}:{self._storage_name}:items'

    @property
    def _metadata_items_key(self) -> str:
        """Return the Redis key for the items metadata of KVS."""
        return f'{self._MAIN_KEY}:{self._storage_name}:metadata_items'

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        redis: Redis,
    ) -> RedisKeyValueStoreClient:
        """Open or create a new Redis key-value store client.

        This method attempts to open an existing key-value store from the Redis database. If a store with the specified
        ID or name exists, it loads the metadata from the database. If no existing store is found, a new one
        is created.

        Args:
            id: The ID of the key-value store. If not provided, a random ID will be generated.
            name: The name of the key-value store for named (global scope) storages.
            alias: The alias of the key-value store for unnamed (run scope) storages.
            redis: Redis client instance.

        Returns:
            An instance for the opened or created storage client.
        """
        return await cls._open(
            id=id,
            name=name,
            alias=alias,
            redis=redis,
            metadata_model=KeyValueStoreMetadata,
            extra_metadata_fields={},
            instance_kwargs={},
        )

    @override
    async def get_metadata(self) -> KeyValueStoreMetadata:
        return await self._get_metadata(KeyValueStoreMetadata)

    @override
    async def drop(self) -> None:
        await self._drop(extra_keys=[self._items_key, self._metadata_items_key])

    @override
    async def purge(self) -> None:
        await self._purge(
            extra_keys=[self._items_key, self._metadata_items_key],
            metadata_kwargs=MetadataUpdateParams(update_accessed_at=True, update_modified_at=True),
        )

    @override
    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
        """Store a value and its record metadata under the given key.

        Args:
            key: Key of the record.
            value: Value to store. `None` is stored as an empty payload with a marker content type.
            content_type: MIME type of the value; inferred from the value when omitted.
        """
        # Special handling for None values
        if value is None:
            content_type = 'application/x-none'  # Special content type to identify None values
            value_bytes = b''
        else:
            content_type = content_type or infer_mime_type(value)

            # Serialize the value to bytes.
            if 'application/json' in content_type:
                value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8')
            elif isinstance(value, str):
                value_bytes = value.encode('utf-8')
            elif isinstance(value, (bytes, bytearray)):
                # redis-py encodes `bytes` but rejects `bytearray`, so normalise to immutable bytes.
                value_bytes = bytes(value)
            else:
                # Fallback: attempt to convert to string and encode.
                value_bytes = str(value).encode('utf-8')

        size = len(value_bytes)
        item_metadata = KeyValueStoreRecordMetadata(
            key=key,
            content_type=content_type,
            size=size,
        )

        async with self._get_pipeline() as pipe:
            # redis-py typing issue
            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # ty: ignore[invalid-argument-type]

            await await_redis_response(
                pipe.hset(
                    self._metadata_items_key,
                    key,
                    item_metadata.model_dump_json(),
                )
            )
            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))

    @override
    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
        """Load the record stored under the given key.

        Returns:
            The decoded record, or `None` when the record metadata or value is missing or the
            stored payload cannot be decoded according to its content type.
        """
        serialized_metadata_item = await await_redis_response(self._redis.hget(self._metadata_items_key, key))

        async with self._get_pipeline() as pipe:
            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True))

        if not isinstance(serialized_metadata_item, (str, bytes, bytearray)):
            logger.warning(f'Metadata for key "{key}" is missing or invalid.')
            return None

        metadata_item = KeyValueStoreRecordMetadata.model_validate_json(serialized_metadata_item)

        # Handle None values
        if metadata_item.content_type == 'application/x-none':
            return KeyValueStoreRecord(value=None, **metadata_item.model_dump())

        # Query the record by key
        # redis-py typing issue
        value_bytes: bytes | None = await await_redis_response(self._redis.hget(self._items_key, key))  # ty: ignore[invalid-assignment]

        if value_bytes is None:
            logger.warning(f'Value for key "{key}" is missing.')
            return None

        # Handle JSON values
        if 'application/json' in metadata_item.content_type:
            try:
                value = json.loads(value_bytes.decode('utf-8'))
            except (json.JSONDecodeError, UnicodeDecodeError):
                logger.warning(f'Failed to decode JSON value for key "{key}"')
                return None
        # Handle text values
        elif metadata_item.content_type.startswith('text/'):
            try:
                value = value_bytes.decode('utf-8')
            except UnicodeDecodeError:
                logger.warning(f'Failed to decode text value for key "{key}"')
                return None
        # Handle binary values
        else:
            value = value_bytes

        return KeyValueStoreRecord(value=value, **metadata_item.model_dump())

    @override
    async def delete_value(self, *, key: str) -> None:
        async with self._get_pipeline() as pipe:
            await await_redis_response(pipe.hdel(self._items_key, key))
            await await_redis_response(pipe.hdel(self._metadata_items_key, key))
            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))

    @override
    async def iterate_keys(
        self,
        *,
        exclusive_start_key: str | None = None,
        limit: int | None = None,
    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
        """Yield the metadata of stored records in ascending key order.

        Args:
            exclusive_start_key: When given, only keys strictly greater than it are yielded.
            limit: Maximum number of records to yield.

        Raises:
            TypeError: If Redis returns the metadata hash or one of its values in an unexpected type.
        """
        items_data = await await_redis_response(self._redis.hgetall(self._metadata_items_key))

        # Record the access right away. Code placed after the last `yield` never runs when the
        # consumer stops iterating early, and it used to be skipped for an empty store as well.
        async with self._get_pipeline() as pipe:
            await self._update_metadata(
                pipe,
                **MetadataUpdateParams(update_accessed_at=True),
            )

        if not items_data:
            return  # No items to iterate over

        if not isinstance(items_data, dict):
            raise TypeError('The items data was received in an incorrect format.')

        def _as_bytes(raw_key: Any) -> bytes:
            # Hash fields arrive as bytes or, with a decoding connection, as str. Comparing on
            # UTF-8 bytes works for both and keeps the same ordering as comparing the strings.
            return raw_key if isinstance(raw_key, bytes) else str(raw_key).encode()

        # Get all keys, sorted alphabetically
        keys = sorted(items_data.keys(), key=_as_bytes)

        # Apply exclusive_start_key filter if provided
        if exclusive_start_key is not None:
            bytes_exclusive_start_key = exclusive_start_key.encode()
            keys = [k for k in keys if _as_bytes(k) > bytes_exclusive_start_key]

        # Apply limit if provided
        if limit is not None:
            keys = keys[:limit]

        # Yield metadata for each key
        for key in keys:
            record = items_data[key]
            if not isinstance(record, (str, bytes)):
                raise TypeError(f'Expected str or bytes, got {type(record)}')
            yield KeyValueStoreRecordMetadata.model_validate_json(record)

    @override
    async def get_public_url(self, *, key: str) -> str:
        """Public URLs are not available for this client.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('Public URLs are not supported for Redis key-value stores.')

    @override
    async def record_exists(self, *, key: str) -> bool:
        """Return whether a value is stored under the given key."""
        # The pipeline is executed manually so that the HEXISTS reply can be read from the results.
        async with self._get_pipeline(with_execute=False) as pipe:
            await await_redis_response(pipe.hexists(self._items_key, key))
            await self._update_metadata(
                pipe,
                **MetadataUpdateParams(update_accessed_at=True),
            )
            results = await pipe.execute()

        # HEXISTS was queued first, so its reply is the first result.
        return bool(results[0])


================================================
FILE: src/crawlee/storage_clients/_redis/_request_queue_client.py
================================================
from __future__ import annotations

import json
from collections import deque
from datetime import datetime, timedelta, timezone
from logging import getLogger
from typing import TYPE_CHECKING, Any, Literal

from typing_extensions import NotRequired, override

from crawlee import Request
from crawlee._utils.crypto import crypto_random_object_id
from crawlee.storage_clients._base import RequestQueueClient
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata

from ._client_mixin import MetadataUpdateParams, RedisClientMixin
from ._utils import await_redis_response

if TYPE_CHECKING:
    from collections.abc import Sequence

    from redis.asyncio import Redis
    from redis.asyncio.client import Pipeline
    from redis.commands.core import AsyncScript

logger = getLogger(__name__)


class _QueueMetadataUpdateParams(MetadataUpdateParams):
    """Parameters for updating queue metadata.

    Extends the shared `MetadataUpdateParams` with request-queue specific options. Every key is
    optional. The code that consumes these keys is not part of this section, so the notes below
    are inferred from the names and from how `purge` uses them - confirm against
    `_specific_update_metadata` of the queue client.
    """

    # `new_*`: presumably absolute values written to the matching counter (`purge` resets all three to 0).
    new_handled_request_count: NotRequired[int]
    new_pending_request_count: NotRequired[int]
    new_total_request_count: NotRequired[int]
    # `delta_*`: presumably amounts added to the matching counter - TODO confirm.
    delta_handled_request_count: NotRequired[int]
    delta_pending_request_count: NotRequired[int]
    delta_total_request_count: NotRequired[int]
    # NOTE(review): semantics of `recalculate` are not visible here - verify against the consumer.
    recalculate: NotRequired[bool]
    # Presumably requests an update of the `had_multiple_clients` metadata flag - TODO confirm.
    update_had_multiple_clients: NotRequired[bool]


class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin):
    """Redis implementation of the request queue client.

    This client persists requests to Redis using multiple data structures for efficient queue operations,
    deduplication, and concurrent access safety. Requests are stored with FIFO ordering and support
    both regular and forefront (high-priority) insertion modes.

    The implementation uses Bloom filters for efficient request deduplication and Redis lists for
    queue operations. Request blocking and client coordination is handled through Redis hashes
    with timestamp-based expiration for stale request recovery.

    The request queue data is stored in Redis using the following key patterns:
    - `request_queues:{name}:queue` - Redis list for FIFO request ordering
    - `request_queues:{name}:data` - Redis hash storing serialized Request objects by unique_key
    - `request_queues:{name}:in_progress` - Redis hash tracking requests currently being processed
    - `request_queues:{name}:added_bloom_filter` - Bloom filter for added request deduplication (`bloom` dedup_strategy)
    - `request_queues:{name}:handled_bloom_filter` - Bloom filter for completed request tracking (`bloom`
        dedup_strategy)
    - `request_queues:{name}:pending_set` - Redis set for added request deduplication (`default` dedup_strategy)
    - `request_queues:{name}:handled_set` - Redis set for completed request tracking (`default` dedup_strategy)
    - `request_queues:{name}:metadata` - Redis JSON object containing queue metadata

    Requests are serialized to JSON for storage and maintain proper FIFO ordering through Redis list
    operations. The implementation provides concurrent access safety through atomic Lua scripts,
    Bloom filter operations, and Redis's built-in atomicity guarantees for individual operations.
    """

    _DEFAULT_NAME = 'default'
    """Default Request Queue name key prefix when none provided."""

    # First segment of every Redis key built by the `*_key` properties of this client.
    _MAIN_KEY = 'request_queues'
    """Main Redis key prefix for Request Queue."""

    _CLIENT_TYPE = 'Request queue'
    """Human-readable client type for error messages."""

    _MAX_BATCH_FETCH_SIZE = 10
    """Maximum number of requests to fetch in a single batch operation."""

    _BLOCK_REQUEST_TIME = 300_000  # milliseconds (5 minutes)
    """Time in milliseconds to block a fetched request for other clients before it can be autoreclaimed."""

    _RECLAIM_INTERVAL = timedelta(seconds=30)
    """Interval to check for stale requests to reclaim."""

    def __init__(
        self,
        storage_name: str,
        storage_id: str,
        redis: Redis,
        dedup_strategy: Literal['default', 'bloom'] = 'default',
        bloom_error_rate: float = 1e-7,
    ) -> None:
        """Set up the in-memory state of a request queue client.

        Preferably use the `RedisRequestQueueClient.open` class method to create a new instance.

        Args:
            storage_name: Internal storage name used for Redis keys.
            storage_id: Unique identifier for the request queue.
            redis: Redis client instance.
            dedup_strategy: Deduplication strategy, either `'default'` or `'bloom'`.
            bloom_error_rate: Desired false positive rate for the Bloom filters.
        """
        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)

        self.client_key = crypto_random_object_id(length=32)[:32]
        """Unique identifier for this client instance."""

        self._dedup_strategy = dedup_strategy
        """Deduplication strategy for the queue."""

        self._bloom_error_rate = bloom_error_rate
        """Desired false positive rate for Bloom filters."""

        self._pending_fetch_cache: deque[Request] = deque()
        """Cache for requests: ordered by sequence number."""

        self._next_reclaim_stale: datetime | None = None

        # Lua scripts for atomic operations; they stay unset until they are loaded.
        self._add_requests_script: AsyncScript | None = None
        self._fetch_script: AsyncScript | None = None
        self._reclaim_stale_script: AsyncScript | None = None

    @property
    def _added_filter_key(self) -> str:
        """Return the Redis key for the added requests Bloom filter."""
        if self._dedup_strategy != 'bloom':
            raise RuntimeError('The added requests filter is only available with the bloom deduplication strategy.')
        return f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter'

    @property
    def _handled_filter_key(self) -> str:
        """Return the Redis key for the handled requests Bloom filter."""
        if self._dedup_strategy != 'bloom':
            raise RuntimeError('The handled requests filter is only available with the bloom deduplication strategy.')
        return f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter'

    @property
    def _pending_set_key(self) -> str:
        """Return the Redis key for the pending requests set."""
        if self._dedup_strategy != 'default':
            raise RuntimeError('The pending requests set is only available with the default deduplication strategy.')
        return f'{self._MAIN_KEY}:{self._storage_name}:pending_set'

    @property
    def _handled_set_key(self) -> str:
        """Return the Redis key for the handled requests set."""
        if self._dedup_strategy != 'default':
            raise RuntimeError('The handled requests set is only available with the default deduplication strategy.')
        return f'{self._MAIN_KEY}:{self._storage_name}:handled_set'

    @property
    def _queue_key(self) -> str:
        """Return the Redis key for the request queue."""
        return f'{self._MAIN_KEY}:{self._storage_name}:queue'

    @property
    def _data_key(self) -> str:
        """Return the Redis key for the request data hash."""
        return f'{self._MAIN_KEY}:{self._storage_name}:data'

    @property
    def _in_progress_key(self) -> str:
        """Return the Redis key for the in-progress requests hash."""
        return f'{self._MAIN_KEY}:{self._storage_name}:in_progress'

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        redis: Redis,
        dedup_strategy: Literal['default', 'bloom'] = 'default',
        bloom_error_rate: float = 1e-7,
    ) -> RedisRequestQueueClient:
        """Open an existing Redis request queue or create a new one.

        The actual lookup and creation is delegated to the shared `_open` helper. This method only
        supplies the request-queue specific metadata model, the initial metadata fields and the
        constructor options.

        Args:
            id: The ID of the request queue. If not provided, a random ID will be generated.
            name: The name of the request queue for named (global scope) storages.
            alias: The alias of the request queue for unnamed (run scope) storages.
            redis: Redis client instance.
            dedup_strategy: Strategy for request queue deduplication. Options are:
                - 'default': Uses Redis sets for exact deduplication.
                - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage.
                    With this approach a request may be skipped with a probability given by
                    `bloom_error_rate`.
            bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if
                `dedup_strategy` is set to 'bloom'.

        Returns:
            An instance for the opened or created storage client.
        """
        # Metadata fields a freshly created queue starts with.
        initial_metadata_fields = {
            'had_multiple_clients': False,
            'handled_request_count': 0,
            'pending_request_count': 0,
            'total_request_count': 0,
        }
        # Extra constructor arguments for the client instance.
        constructor_options = {'dedup_strategy': dedup_strategy, 'bloom_error_rate': bloom_error_rate}

        return await cls._open(
            id=id,
            name=name,
            alias=alias,
            redis=redis,
            metadata_model=RequestQueueMetadata,
            extra_metadata_fields=initial_metadata_fields,
            instance_kwargs=constructor_options,
        )

    @override
    async def get_metadata(self) -> RequestQueueMetadata:
        """Load the queue metadata and validate it as `RequestQueueMetadata`."""
        queue_metadata = await self._get_metadata(RequestQueueMetadata)
        return queue_metadata

    @override
    async def drop(self) -> None:
        """Drop the request queue together with all of its auxiliary Redis keys.

        The deduplication keys depend on the configured strategy (Bloom filters for 'bloom', sets for
        'default'); the queue list, request data hash and in-progress hash are always included.

        Raises:
            RuntimeError: If the configured deduplication strategy is neither 'bloom' nor 'default'.
        """
        strategy = self._dedup_strategy
        if strategy not in ('bloom', 'default'):
            raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}')

        if strategy == 'bloom':
            dedup_keys = [self._added_filter_key, self._handled_filter_key]
        else:
            dedup_keys = [self._pending_set_key, self._handled_set_key]

        # Deduplication keys come first, followed by the keys shared by both strategies.
        keys_to_remove = [*dedup_keys, self._queue_key, self._data_key, self._in_progress_key]
        await self._drop(extra_keys=keys_to_remove)

    @override
    async def purge(self) -> None:
        """Purge the request queue contents and reset its request counters.

        The same set of keys as in `drop` is handed to `_purge`, together with metadata parameters that
        refresh both timestamps and set the pending, handled and total request counts to zero.

        Raises:
            RuntimeError: If the configured deduplication strategy is neither 'bloom' nor 'default'.
        """
        strategy = self._dedup_strategy
        if strategy not in ('bloom', 'default'):
            raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}')

        if strategy == 'bloom':
            dedup_keys = [self._added_filter_key, self._handled_filter_key]
        else:
            dedup_keys = [self._pending_set_key, self._handled_set_key]

        # Deduplication keys come first, followed by the keys shared by both strategies.
        keys_to_clear = [*dedup_keys, self._queue_key, self._data_key, self._in_progress_key]

        reset_counters = _QueueMetadataUpdateParams(
            update_accessed_at=True,
            update_modified_at=True,
            new_pending_request_count=0,
            new_handled_request_count=0,
            new_total_request_count=0,
        )
        await self._purge(extra_keys=keys_to_clear, metadata_kwargs=reset_counters)

    @override
    async def add_batch_of_requests(
        self,
        requests: Sequence[Request],
        *,
        forefront: bool = False,
    ) -> AddRequestsResponse:
        """Add a batch of requests to the queue, skipping ones that are already known.

        Requests are first deduplicated within the batch by `unique_key` (the last occurrence wins). A
        pipelined membership check then classifies each key as already handled, already pending, or new.
        New keys are passed to the "add requests" Lua script, which reports the keys it actually inserted;
        the pending and total counters are increased by that number.

        Args:
            requests: Requests to add.
            forefront: If True, new requests are pushed to the head of the queue instead of the tail.

        Returns:
            A response with one `ProcessedRequest` per distinct unique key. `unprocessed_requests` is
            always empty.

        Raises:
            RuntimeError: If the Lua scripts have not been loaded, or if the configured deduplication
                strategy is unknown.
        """
        if self._add_requests_script is None:
            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')

        # Fail with the same explicit error as `drop`/`purge` instead of an IndexError on empty pipeline results.
        if self._dedup_strategy not in ('default', 'bloom'):
            raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}')

        requests_by_unique_key = {req.unique_key: req for req in requests}
        unique_keys = list(requests_by_unique_key)

        # SMISMEMBER / BF.MEXISTS require at least one member, so an empty batch must not reach Redis.
        if not unique_keys:
            return AddRequestsResponse(processed_requests=[], unprocessed_requests=[])

        processed_requests = []

        # Check which requests are already added or handled
        async with self._get_pipeline(with_execute=False) as pipe:
            if self._dedup_strategy == 'default':
                await await_redis_response(pipe.smismember(self._pending_set_key, unique_keys))
                await await_redis_response(pipe.smismember(self._handled_set_key, unique_keys))
            else:
                await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys))
                await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys))

            pipe_results = await pipe.execute()

        added_pending_flags = pipe_results[0]
        handled_flags = pipe_results[1]

        new_unique_keys = []
        new_request_data = {}
        delta_pending = 0
        delta_total = 0

        for i, unique_key in enumerate(unique_keys):
            # Already handled - skip
            if handled_flags[i]:
                processed_requests.append(
                    ProcessedRequest(
                        unique_key=unique_key,
                        was_already_present=True,
                        was_already_handled=True,
                    )
                )
                continue

            # Already in queue - skip
            if added_pending_flags[i]:
                processed_requests.append(
                    ProcessedRequest(
                        unique_key=unique_key,
                        was_already_present=True,
                        was_already_handled=False,
                    )
                )
                continue

            # New request - will add to queue
            request = requests_by_unique_key[unique_key]

            new_unique_keys.append(unique_key)
            new_request_data[unique_key] = request.model_dump_json()

        if new_unique_keys:
            # Add new requests to the queue atomically, get back which were actually added
            script_results = await self._add_requests_script(
                keys=[
                    self._added_filter_key if self._dedup_strategy == 'bloom' else self._pending_set_key,
                    self._queue_key,
                    self._data_key,
                ],
                args=[int(forefront), json.dumps(new_unique_keys), json.dumps(new_request_data)],
            )
            actually_added = set(json.loads(script_results))

            delta_pending = len(actually_added)
            delta_total = len(actually_added)

            # A key the script did not insert was added concurrently (or hit a Bloom false positive).
            processed_requests.extend(
                [
                    ProcessedRequest(
                        unique_key=unique_key,
                        was_already_present=unique_key not in actually_added,
                        was_already_handled=False,
                    )
                    for unique_key in new_unique_keys
                ]
            )

        async with self._get_pipeline() as pipe:
            await self._update_metadata(
                pipe,
                **_QueueMetadataUpdateParams(
                    update_accessed_at=True,
                    update_modified_at=True,
                    delta_pending_request_count=delta_pending,
                    delta_total_request_count=delta_total,
                ),
            )

        return AddRequestsResponse(
            processed_requests=processed_requests,
            unprocessed_requests=[],
        )

    @override
    async def fetch_next_request(self) -> Request | None:
        """Return the next request to process, or None when nothing could be fetched.

        Requests left over from a previous batch fetch are served from the local cache first. Otherwise the
        fetch script is called with the queue, in-progress and data keys; the first returned request is
        handed out and the rest are cached locally.

        Raises:
            RuntimeError: If the Lua scripts have not been loaded.
        """
        if self._pending_fetch_cache:
            return self._pending_fetch_cache.popleft()

        fetch_script = self._fetch_script
        if fetch_script is None:
            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')

        now_ms = int(datetime.now(tz=timezone.utc).timestamp() * 1000)
        blocked_until_timestamp = now_ms + self._BLOCK_REQUEST_TIME

        # The script retrieves requests from the queue and places them in the in_progress hash.
        raw_requests = await fetch_script(
            keys=[self._queue_key, self._in_progress_key, self._data_key],
            args=[self.client_key, blocked_until_timestamp, self._MAX_BATCH_FETCH_SIZE],
        )

        # The access timestamp is refreshed even when the script returned nothing.
        async with self._get_pipeline() as pipe:
            await self._update_metadata(pipe, **_QueueMetadataUpdateParams(update_accessed_at=True))

        if not raw_requests:
            return None

        first_request, *remaining = [Request.model_validate_json(raw) for raw in raw_requests]
        self._pending_fetch_cache.extend(remaining)
        return first_request

    @override
    async def get_request(self, unique_key: str) -> Request | None:
        """Look up a request by its unique key in the request data hash.

        Args:
            unique_key: Unique key of the request to load.

        Returns:
            The parsed request, or None when the hash lookup did not yield a str/bytes/bytearray value.
        """
        stored = await await_redis_response(self._redis.hget(self._data_key, unique_key))

        if not isinstance(stored, (str, bytes, bytearray)):
            return None

        return Request.model_validate_json(stored)

    @override
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Mark an in-progress request as handled.

        In one pipeline the request is recorded as handled in the deduplication structure (and, for the
        'default' strategy, removed from the pending set), removed from the in-progress hash, and its
        stored data is overwritten with the current state. The handled counter is incremented and the
        pending counter decremented.

        Args:
            request: The request to mark as handled. Its `handled_at` is set to the current UTC time if unset.

        Returns:
            Information about the processed request, or None if the request is not in progress.
        """
        unique_key = request.unique_key

        # Only requests tracked in the in-progress hash may be marked as handled.
        is_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, unique_key))
        if not is_in_progress:
            logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.')
            return None

        # Stamp the request before it is serialized below.
        if request.handled_at is None:
            request.handled_at = datetime.now(timezone.utc)

        counters_update = _QueueMetadataUpdateParams(
            update_accessed_at=True,
            update_modified_at=True,
            delta_handled_request_count=1,
            delta_pending_request_count=-1,
        )

        async with self._get_pipeline() as pipe:
            strategy = self._dedup_strategy
            if strategy == 'default':
                await await_redis_response(pipe.sadd(self._handled_set_key, unique_key))
                await await_redis_response(pipe.srem(self._pending_set_key, unique_key))
            elif strategy == 'bloom':
                await await_redis_response(pipe.bf().add(self._handled_filter_key, unique_key))

            await await_redis_response(pipe.hdel(self._in_progress_key, unique_key))
            await await_redis_response(pipe.hset(self._data_key, unique_key, request.model_dump_json()))

            await self._update_metadata(pipe, **counters_update)

        return ProcessedRequest(
            unique_key=unique_key,
            was_already_present=True,
            was_already_handled=True,
        )

    @override
    async def reclaim_request(
        self,
        request: Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Return an in-progress request to the queue so that it can be processed again.

        Without `forefront`, the unique key is pushed to the tail of the queue list, the stored data is
        refreshed and the in-progress entry is removed. With `forefront`, the request stays in the
        in-progress hash under this client with a renewed block timestamp, its stored data is refreshed,
        and it is placed at the front of the local fetch cache.

        Args:
            request: The request to reclaim.
            forefront: Whether the request should be the next one served by this client.

        Returns:
            Information about the processed request, or None if the request is not in progress.
        """
        unique_key = request.unique_key

        is_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, unique_key))
        if not is_in_progress:
            logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')
            return None

        async with self._get_pipeline() as pipe:
            if not forefront:
                await await_redis_response(pipe.rpush(self._queue_key, unique_key))
                await await_redis_response(pipe.hset(self._data_key, unique_key, request.model_dump_json()))
                await await_redis_response(pipe.hdel(self._in_progress_key, unique_key))
            else:
                now_ms = int(datetime.now(tz=timezone.utc).timestamp() * 1000)
                blocked_until_timestamp = now_ms + self._BLOCK_REQUEST_TIME

                # Same JSON layout as the in-progress entries written by the fetch script.
                lock_entry = f'{{"client_id":"{self.client_key}","blocked_until_timestamp":{blocked_until_timestamp}}}'

                await await_redis_response(pipe.hset(self._in_progress_key, unique_key, lock_entry))
                await await_redis_response(pipe.hset(self._data_key, unique_key, request.model_dump_json()))
                self._pending_fetch_cache.appendleft(request)

            await self._update_metadata(
                pipe,
                **_QueueMetadataUpdateParams(
                    update_modified_at=True,
                    update_accessed_at=True,
                ),
            )

        return ProcessedRequest(
            unique_key=unique_key,
            was_already_present=True,
            was_already_handled=False,
        )

    @override
    async def is_empty(self) -> bool:
        """Tell whether the queue has no pending requests.

        A non-empty local fetch cache short-circuits to False. Otherwise stale in-progress requests are
        reclaimed if the reclaim interval has elapsed (or no reclaim has happened yet), and the answer is
        derived from the `pending_request_count` stored in the metadata.

        Returns:
            True if the queue is empty, False otherwise.
        """
        if self._pending_fetch_cache:
            return False

        # Reclaim stale requests if needed
        reclaim_due_at = self._next_reclaim_stale
        if reclaim_due_at is None or datetime.now(tz=timezone.utc) >= reclaim_due_at:
            await self._reclaim_stale_requests()
            self._next_reclaim_stale = datetime.now(tz=timezone.utc) + self._RECLAIM_INTERVAL

        queue_metadata = await self.get_metadata()
        pending_count = queue_metadata.pending_request_count
        return pending_count == 0

    async def _load_scripts(self) -> None:
        """Ensure Lua scripts are loaded in Redis."""
        self._fetch_script = await self._create_script('atomic_fetch_request.lua')
        self._reclaim_stale_script = await self._create_script('reclaim_stale_requests.lua')
        if self._dedup_strategy == 'bloom':
            self._add_requests_script = await self._create_script('atomic_bloom_add_requests.lua')
        elif self._dedup_strategy == 'default':
            self._add_requests_script = await self._create_script('atomic_set_add_requests.lua')

    @override
    async def _create_storage(self, pipeline: Pipeline) -> None:
        """Queue creation of the Bloom filters when the 'bloom' deduplication strategy is used.

        Two filters are created, one for added and one for handled requests, both with the configured
        error rate, a capacity of 100000 and an expansion factor of 10. Nothing is done for other strategies.

        Args:
            pipeline: The Redis pipeline the creation commands are issued on.
        """
        if self._dedup_strategy != 'bloom':
            return

        for filter_key in (self._added_filter_key, self._handled_filter_key):
            await await_redis_response(
                pipeline.bf().create(filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10)
            )

    async def _reclaim_stale_requests(self) -> None:
        """Reclaim requests that have been in progress for too long."""
        if self._reclaim_stale_script is None:
            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')

        current_time = int(datetime.now(tz=timezone.utc).timestamp() * 1000)

        await self._reclaim_stale_script(
            keys=[self._in_progress_key, self._queue_key, self._data_key], args=[current_time]
        )

    @override
    async def _specific_update_metadata(
        self,
        pipeline: Pipeline,
        *,
        delta_handled_request_count: int | None = None,
        new_handled_request_count: int | None = None,
        delta_pending_request_count: int | None = None,
        new_pending_request_count: int | None = None,
        delta_total_request_count: int | None = None,
        new_total_request_count: int | None = None,
        update_had_multiple_clients: bool = False,
        **_kwargs: Any,
    ) -> None:
        """Update the request queue specific metadata fields.

        For each counter an absolute `new_*` value takes precedence over the corresponding `delta_*` value;
        when neither is given the counter is left untouched.

        Args:
            pipeline: The Redis pipeline to use for the update.
            new_handled_request_count: If provided, update the handled_request_count to this value.
            new_pending_request_count: If provided, update the pending_request_count to this value.
            new_total_request_count: If provided, update the total_request_count to this value.
            delta_handled_request_count: If provided, add this value to the handled_request_count.
            delta_pending_request_count: If provided, add this value to the pending_request_count.
            delta_total_request_count: If provided, add this value to the total_request_count.
            update_had_multiple_clients: If True, set had_multiple_clients to True.
        """
        # (JSON path, absolute value, increment) for every counter, in the order the commands are issued.
        counter_updates = (
            ('$.pending_request_count', new_pending_request_count, delta_pending_request_count),
            ('$.handled_request_count', new_handled_request_count, delta_handled_request_count),
            ('$.total_request_count', new_total_request_count, delta_total_request_count),
        )

        for json_path, absolute_value, increment in counter_updates:
            if absolute_value is not None:
                await await_redis_response(
                    pipeline.json().set(self.metadata_key, json_path, absolute_value, nx=False, xx=True)
                )
            elif increment is not None:
                await await_redis_response(pipeline.json().numincrby(self.metadata_key, json_path, increment))

        if update_had_multiple_clients:
            await await_redis_response(
                pipeline.json().set(
                    self.metadata_key, '$.had_multiple_clients', update_had_multiple_clients, nx=False, xx=True
                )
            )


================================================
FILE: src/crawlee/storage_clients/_redis/_storage_client.py
================================================
from __future__ import annotations

import warnings
from typing import Literal

from redis.asyncio import Redis
from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee.configuration import Configuration
from crawlee.storage_clients._base import StorageClient

from ._dataset_client import RedisDatasetClient
from ._key_value_store_client import RedisKeyValueStoreClient
from ._request_queue_client import RedisRequestQueueClient


@docs_group('Storage clients')
class RedisStorageClient(StorageClient):
    """Redis implementation of the storage client.

    This storage client provides access to datasets, key-value stores, and request queues that persist data
    to a Redis database v8.0+. Each storage type uses Redis-specific data structures and key patterns for
    efficient storage and retrieval.

    The client accepts either a Redis connection string or a pre-configured Redis client instance.
    Exactly one of these parameters must be provided during initialization.

    Storage types use the following Redis data structures:
    - **Datasets**: Redis JSON arrays for item storage with metadata in JSON objects
    - **Key-value stores**: Redis hashes for key-value pairs with separate metadata storage
    - **Request queues**: Redis lists for FIFO queuing, hashes for request data and in-progress tracking,
      and Redis sets (default) or Bloom filters for request deduplication

    Warning:
        This is an experimental feature. The behavior and interface may change in future versions.
    """

    def __init__(
        self,
        *,
        connection_string: str | None = None,
        redis: Redis | None = None,
        queue_dedup_strategy: Literal['default', 'bloom'] = 'default',
        queue_bloom_error_rate: float = 1e-7,
    ) -> None:
        """Initialize the Redis storage client.

        Args:
            connection_string: Redis connection string (e.g., "redis://localhost:6379").
                Supports standard Redis URL format with optional database selection.
            redis: Pre-configured Redis client instance.
            queue_dedup_strategy: Strategy for request queue deduplication. Options are:
                - 'default': Uses Redis sets for exact deduplication.
                - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using
                    this approach, a new request is falsely considered a duplicate (and skipped) with a probability
                    of about `queue_bloom_error_rate`, i.e. roughly 1 in 10 million requests with the default value.
            queue_bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if
                `queue_dedup_strategy` is set to 'bloom'.

        Raises:
            ValueError: If neither or both of `redis` and `connection_string` are provided.
        """
        if redis is not None:
            if connection_string is not None:
                raise ValueError('Either redis or connection_string must be provided, not both.')
            # Accept any provided client object, so `_redis` is always set once validation passes.
            self._redis: Redis = redis
        elif connection_string is not None:
            self._redis = Redis.from_url(connection_string)
        else:
            raise ValueError('Either redis or connection_string must be provided.')

        self._queue_dedup_strategy = queue_dedup_strategy
        self._queue_bloom_error_rate = queue_bloom_error_rate

        # Warn about the experimental status whenever a client is instantiated.
        warnings.warn(
            (
                'RedisStorageClient is experimental and its API, behavior, and key structure may change in future '
                'releases.'
            ),
            category=UserWarning,
            stacklevel=2,
        )

    @override
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> RedisDatasetClient:
        """Open a Redis dataset client and purge it if the configuration requires it.

        Args:
            id: ID of the dataset to open.
            name: Name of the dataset to open.
            alias: Alias of the dataset to open.
            configuration: Configuration to use; the global configuration is used when omitted.

        Returns:
            The opened dataset client.
        """
        configuration = configuration or Configuration.get_global_configuration()

        client = await RedisDatasetClient.open(
            id=id,
            name=name,
            alias=alias,
            redis=self._redis,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> RedisKeyValueStoreClient:
        """Open a Redis key-value store client and purge it if the configuration requires it.

        Args:
            id: ID of the key-value store to open.
            name: Name of the key-value store to open.
            alias: Alias of the key-value store to open.
            configuration: Configuration to use; the global configuration is used when omitted.

        Returns:
            The opened key-value store client.
        """
        configuration = configuration or Configuration.get_global_configuration()

        client = await RedisKeyValueStoreClient.open(
            id=id,
            name=name,
            alias=alias,
            redis=self._redis,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> RedisRequestQueueClient:
        """Open a Redis request queue client and purge it if the configuration requires it.

        The deduplication strategy and Bloom filter error rate given to the constructor are forwarded
        to the request queue client.

        Args:
            id: ID of the request queue to open.
            name: Name of the request queue to open.
            alias: Alias of the request queue to open.
            configuration: Configuration to use; the global configuration is used when omitted.

        Returns:
            The opened request queue client.
        """
        configuration = configuration or Configuration.get_global_configuration()

        client = await RedisRequestQueueClient.open(
            id=id,
            name=name,
            alias=alias,
            redis=self._redis,
            dedup_strategy=self._queue_dedup_strategy,
            bloom_error_rate=self._queue_bloom_error_rate,
        )

        await self._purge_if_needed(client, configuration)
        return client


================================================
FILE: src/crawlee/storage_clients/_redis/_utils.py
================================================
from collections.abc import Awaitable
from pathlib import Path
from typing import TypeVar, cast, overload

T = TypeVar('T')


@overload
async def await_redis_response(response: Awaitable[T]) -> T: ...
@overload
async def await_redis_response(response: T) -> T: ...


async def await_redis_response(response: Awaitable[T] | T) -> T:
    """Return the result of a redis call regardless of whether it was awaitable.

    Redis client methods are typed as returning either a value or an awaitable of that value; this helper
    awaits the response when needed so that callers always receive the plain value.

    Args:
        response: A redis command result, or an awaitable producing it.

    Returns:
        The awaited result for awaitables, otherwise the response itself.
    """
    if not isinstance(response, Awaitable):
        return response

    resolved = await response
    return cast('T', resolved)


def read_lua_script(script_name: str) -> str:
    """Load the source of a Lua script from the `lua_scripts` directory next to this module.

    Args:
        script_name: File name of the script inside the `lua_scripts` directory.

    Returns:
        The script content decoded as UTF-8.
    """
    scripts_dir = Path(__file__).parent / 'lua_scripts'
    return (scripts_dir / script_name).read_text(encoding='utf-8')


================================================
FILE: src/crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua
================================================
-- Atomically registers new requests using a Bloom filter for deduplication.
-- KEYS: [1] Bloom filter of added requests, [2] queue list, [3] request data hash.
-- ARGV: [1] '1' to push to the head of the queue, [2] JSON array of unique keys,
--       [3] JSON object mapping unique key -> serialized request.
-- Returns a JSON-encoded list of the unique keys that were actually added.
local added_filter_key, queue_key, data_key = KEYS[1], KEYS[2], KEYS[3]

local forefront = ARGV[1] == '1'
local unique_keys = cjson.decode(ARGV[2])
local requests_data = cjson.decode(ARGV[3])

-- BF.MADD both inserts the keys and reports, per key, whether it was new
local bf_results = redis.call('bf.madd', added_filter_key, unpack(unique_keys))

local actually_added = {}
local hset_args = {}

-- Keep only the keys the filter reported as newly inserted
for i, unique_key in ipairs(unique_keys) do
    if bf_results[i] == 1 then
        hset_args[#hset_args + 1] = unique_key
        hset_args[#hset_args + 1] = requests_data[unique_key]
        actually_added[#actually_added + 1] = unique_key
    end
end

-- Nothing new: report the empty result without touching the data hash or the queue
if #actually_added == 0 then
    return cjson.encode(actually_added)
end

redis.call('hset', data_key, unpack(hset_args))

local push_command = forefront and 'lpush' or 'rpush'
redis.call(push_command, queue_key, unpack(actually_added))

return cjson.encode(actually_added)


================================================
FILE: src/crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua
================================================
-- Atomically pops a batch of unique keys from the queue, loads their request data and
-- registers every request that has data in the in-progress hash.
-- KEYS: [1] queue list, [2] in-progress hash, [3] request data hash.
-- ARGV: [1] client id, [2] blocked-until timestamp, [3] maximum batch size.
-- Returns the serialized requests, or nil when the queue is empty.
local queue_key = KEYS[1]
local in_progress_key = KEYS[2]
local data_key = KEYS[3]
local client_id = ARGV[1]
local blocked_until_timestamp = ARGV[2]
local batch_size = tonumber(ARGV[3])

-- Pop batch unique_key from queue
local batch_result = redis.call('LMPOP', 1, queue_key, 'LEFT', 'COUNT', batch_size)
if not batch_result then
    return nil
end
local unique_keys = batch_result[2]

-- Get requests data. HMGET always returns one entry per requested field; fields without
-- a value come back as false, which is handled per entry in the loop below.
local requests_data = redis.call('HMGET', data_key, unpack(unique_keys))

-- Prepare results and update in_progress
local final_result = {}
local in_progress_fields = {}
local in_progress_data = cjson.encode({
    client_id = client_id,
    blocked_until_timestamp = tonumber(blocked_until_timestamp)
})
for i = 1, #unique_keys do
    local unique_key = unique_keys[i]
    local request_data = requests_data[i]

    -- Keys whose data is missing are skipped (neither returned nor marked in progress)
    if request_data then
        -- Add to in_progress hash
        table.insert(in_progress_fields, unique_key)
        table.insert(in_progress_fields, in_progress_data)

        table.insert(final_result, request_data)
    end
end

-- Update in_progress hash (HSET accepts multiple field/value pairs; HMSET is deprecated)
if #in_progress_fields > 0 then
    redis.call('HSET', in_progress_key, unpack(in_progress_fields))
end

-- Return result with requests data
return final_result


================================================
FILE: src/crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua
================================================
-- Atomically registers new requests using a Redis set for deduplication.
-- KEYS: [1] set of known (pending) unique keys, [2] queue list, [3] request data hash.
-- ARGV: [1] '1' to push to the head of the queue, [2] JSON array of unique keys,
--       [3] JSON object mapping unique key -> serialized request.
-- Returns a JSON-encoded list of the unique keys that were actually added.
local pending_set_key, queue_key, data_key = KEYS[1], KEYS[2], KEYS[3]

local forefront = ARGV[1] == '1'
local unique_keys = cjson.decode(ARGV[2])
local requests_data = cjson.decode(ARGV[3])

local actually_added = {}
local hset_args = {}

for _, unique_key in ipairs(unique_keys) do
    -- SADD returns 1 only when the member was not in the set before
    local was_new = redis.call('sadd', pending_set_key, unique_key) == 1

    if was_new then
        hset_args[#hset_args + 1] = unique_key
        hset_args[#hset_args + 1] = requests_data[unique_key]
        actually_added[#actually_added + 1] = unique_key
    end
end

-- Nothing new: report the empty result without touching the data hash or the queue
if #actually_added == 0 then
    return cjson.encode(actually_added)
end

redis.call('hset', data_key, unpack(hset_args))

local push_command = forefront and 'lpush' or 'rpush'
redis.call(push_command, queue_key, unpack(actually_added))

return cjson.encode(actually_added)


================================================
FILE: src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua
================================================
-- Moves in-progress requests whose block timestamp has passed back to the tail of the queue.
-- KEYS: [1] in-progress hash, [2] queue list, [3] request data hash (not used by this script).
-- ARGV: [1] current time, compared against each entry's blocked_until_timestamp.
-- Returns the number of reclaimed requests (at most 1000 per invocation).
local in_progress_key = KEYS[1]
local queue_key = KEYS[2]
local data_key = KEYS[3]
local now = tonumber(ARGV[1])

local reclaim_limit = 1000

local scan_cursor = "0"
local reclaimed = 0

repeat
    local page = redis.call('hscan', in_progress_key, scan_cursor, 'COUNT', 100)
    scan_cursor = page[1]
    -- Flat array of alternating field names and values
    local fields = page[2]

    local idx = 1
    while idx <= #fields and reclaimed < reclaim_limit do
        local unique_key = fields[idx]
        local lock_info = cjson.decode(fields[idx + 1])

        -- The block has expired: drop the in-progress entry and requeue the request
        if now > lock_info.blocked_until_timestamp then
            redis.call('hdel', in_progress_key, unique_key)
            redis.call('rpush', queue_key, unique_key)
            reclaimed = reclaimed + 1
        end

        idx = idx + 2
    end
until scan_cursor == "0" or reclaimed >= reclaim_limit

return reclaimed


================================================
FILE: src/crawlee/storage_clients/_redis/py.typed
================================================


================================================
FILE: src/crawlee/storage_clients/_sql/__init__.py
================================================
from ._dataset_client import SqlDatasetClient
from ._key_value_store_client import SqlKeyValueStoreClient
from ._request_queue_client import SqlRequestQueueClient
from ._storage_client import SqlStorageClient

# Public names re-exported by the SQL storage client package.
__all__ = [
    'SqlDatasetClient',
    'SqlKeyValueStoreClient',
    'SqlRequestQueueClient',
    'SqlStorageClient',
]


================================================
FILE: src/crawlee/storage_clients/_sql/_client_mixin.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from datetime import datetime, timedelta, timezone
from logging import getLogger
from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, cast, overload

from sqlalchemy import CursorResult, delete, select, text, update
from sqlalchemy import func as sql_func
from sqlalchemy.dialects.mysql import insert as mysql_insert
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.dialects.sqlite import insert as lite_insert
from sqlalchemy.exc import OperationalError, SQLAlchemyError

from crawlee._utils.crypto import crypto_random_object_id

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from sqlalchemy import Insert
    from sqlalchemy.ext.asyncio import AsyncSession
    from sqlalchemy.orm import DeclarativeBase
    from typing_extensions import NotRequired, Self

    from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata

    from ._db_models import (
        DatasetItemDb,
        DatasetMetadataBufferDb,
        DatasetMetadataDb,
        KeyValueStoreMetadataBufferDb,
        KeyValueStoreMetadataDb,
        KeyValueStoreRecordDb,
        RequestDb,
        RequestQueueMetadataBufferDb,
        RequestQueueMetadataDb,
    )
    from ._storage_client import SqlStorageClient


logger = getLogger(__name__)


class MetadataUpdateParams(TypedDict, total=False):
    """Parameters for updating metadata.

    Both keys are optional: the class is declared with `total=False` and each field is additionally
    marked `NotRequired`.
    """

    # New value for the access timestamp — presumably maps to the storage's `accessed_at`; confirm with consumers.
    accessed_at: NotRequired[datetime]
    # New value for the modification timestamp — presumably maps to the storage's `modified_at`; confirm.
    modified_at: NotRequired[datetime]


class SqlClientMixin(ABC):
    """Mixin class for SQL clients.

    This mixin provides common SQL operations and basic methods for SQL storage clients.

    Concrete clients are expected to assign the class-level attributes declared below and to
    implement the abstract hooks `_specific_update_metadata`, `_prepare_buffer_data` and
    `_apply_buffer_updates`.
    """

    _DEFAULT_NAME: ClassVar[str]
    """Default name when none provided."""

    _METADATA_TABLE: ClassVar[type[DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb]]
    """SQLAlchemy model for metadata."""

    _BUFFER_TABLE: ClassVar[
        type[KeyValueStoreMetadataBufferDb | DatasetMetadataBufferDb | RequestQueueMetadataBufferDb]
    ]
    """SQLAlchemy model for metadata buffer."""

    _ITEM_TABLE: ClassVar[type[DatasetItemDb | KeyValueStoreRecordDb | RequestDb]]
    """SQLAlchemy model for items."""

    _CLIENT_TYPE: ClassVar[str]
    """Human-readable client type for error messages."""

    _BLOCK_BUFFER_TIME = timedelta(seconds=1)
    """Time interval that blocks buffer reading to update metadata."""

    def __init__(self, *, id: str, storage_client: SqlStorageClient) -> None:
        self._id = id
        self._storage_client = storage_client

    @classmethod
    async def _open(
        cls,
        *,
        id: str | None,
        name: str | None,
        internal_name: str,
        storage_client: SqlStorageClient,
        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
        session: AsyncSession,
        extra_metadata_fields: dict[str, Any],
    ) -> Self:
        """Open existing storage or create new one.

        Internal method used by _safely_open.

        Args:
            id: Storage ID to open (takes precedence over name).
            name: The name of the storage.
            internal_name: The database name for the storage based on name or alias.
            storage_client: SQL storage client instance.
            metadata_model: Pydantic model for metadata validation.
            session: Active database session.
            extra_metadata_fields: Storage-specific metadata fields.
        """
        orm_metadata: DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None = None
        if id:
            orm_metadata = await session.get(cls._METADATA_TABLE, id)
            if not orm_metadata:
                raise ValueError(f'{cls._CLIENT_TYPE} with ID "{id}" not found.')
        else:
            stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
            result = await session.execute(stmt)
            orm_metadata = result.scalar_one_or_none()

        if orm_metadata:
            client = cls(id=orm_metadata.id, storage_client=storage_client)
            await client._add_buffer_record(session)
            # Ensure any pending buffer updates are processed
            await client._process_buffers()
        else:
            now = datetime.now(timezone.utc)
            metadata = metadata_model(
                id=crypto_random_object_id(),
                name=name,
                created_at=now,
                accessed_at=now,
                modified_at=now,
                **extra_metadata_fields,
            )
            client = cls(id=metadata.id, storage_client=storage_client)
            session.add(cls._METADATA_TABLE(**metadata.model_dump(), internal_name=internal_name))

        return client

    @classmethod
    async def _safely_open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None = None,
        storage_client: SqlStorageClient,
        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
        extra_metadata_fields: dict[str, Any],
    ) -> Self:
        """Safely open storage with transaction handling.

        Args:
            id: Storage ID to open (takes precedence over name).
            name: The name of the storage for named (global scope) storages.
            alias: The alias of the storage for unnamed (run scope) storages.
            storage_client: SQL storage client instance.
            client_class: Concrete client class to instantiate.
            metadata_model: Pydantic model for metadata validation.
            extra_metadata_fields: Storage-specific metadata fields.
        """
        # Validate input parameters.
        specified_params = sum(1 for param in [id, name, alias] if param is not None)
        if specified_params > 1:
            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

        internal_name = name or alias or cls._DEFAULT_NAME

        async with storage_client.create_session() as session:
            try:
                client = await cls._open(
                    id=id,
                    name=name,
                    internal_name=internal_name,
                    storage_client=storage_client,
                    metadata_model=metadata_model,
                    session=session,
                    extra_metadata_fields=extra_metadata_fields,
                )
                await session.commit()
            except SQLAlchemyError:
                await session.rollback()

                stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
                result = await session.execute(stmt)
                orm_metadata: DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None
                orm_metadata = cast(
                    'DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None',
                    result.scalar_one_or_none(),
                )

                if not orm_metadata:
                    raise ValueError(f'{cls._CLIENT_TYPE} with Name "{internal_name}" not found.') from None

                client = cls(id=orm_metadata.id, storage_client=storage_client)

        return client

    @asynccontextmanager
    async def get_session(self, *, with_simple_commit: bool = False) -> AsyncIterator[AsyncSession]:
        """Create a new SQLAlchemy session for this storage."""
        async with self._storage_client.create_session() as session:
            # For operations where a final commit is mandatory and does not require specific processing conditions
            if with_simple_commit:
                try:
                    yield session
                    await session.commit()
                except SQLAlchemyError as e:
                    logger.warning(f'Error occurred during session transaction: {e}')
                    await session.rollback()
            else:
                yield session

    def _build_insert_stmt_with_ignore(
        self, table_model: type[DeclarativeBase], insert_values: dict[str, Any] | list[dict[str, Any]]
    ) -> Insert:
        """Build an insert statement with ignore for the SQL dialect.

        Args:
            table_model: SQLAlchemy table model.
            insert_values: Single dict or list of dicts to insert.
        """
        if isinstance(insert_values, dict):
            insert_values = [insert_values]

        dialect = self._storage_client.get_dialect_name()

        if dialect == 'postgresql':
            return pg_insert(table_model).values(insert_values).on_conflict_do_nothing()

        if dialect == 'sqlite':
            return lite_insert(table_model).values(insert_values).on_conflict_do_nothing()

        if dialect in {'mysql', 'mariadb'}:
            return mysql_insert(table_model).values(insert_values).prefix_with('IGNORE')

        raise NotImplementedError(f'Insert with ignore not supported for dialect: {dialect}')

    def _build_upsert_stmt(
        self,
        table_model: type[DeclarativeBase],
        insert_values: dict[str, Any] | list[dict[str, Any]],
        update_columns: list[str],
        conflict_cols: list[str] | None = None,
    ) -> Insert:
        """Build an upsert statement for the SQL dialect.

        Args:
            table_model: SQLAlchemy table model.
            insert_values: Single dict or list of dicts to upsert.
            update_columns: Column names to update on conflict.
            conflict_cols: Column names that define uniqueness (for PostgreSQL/SQLite).

        """
        if isinstance(insert_values, dict):
            insert_values = [insert_values]

        dialect = self._storage_client.get_dialect_name()

        if dialect == 'postgresql':
            pg_stmt = pg_insert(table_model).values(insert_values)
            set_ = {col: getattr(pg_stmt.excluded, col) for col in update_columns}
            return pg_stmt.on_conflict_do_update(index_elements=conflict_cols, set_=set_)

        if dialect == 'sqlite':
            lite_stmt = lite_insert(table_model).values(insert_values)
            set_ = {col: getattr(lite_stmt.excluded, col) for col in update_columns}
            return lite_stmt.on_conflict_do_update(index_elements=conflict_cols, set_=set_)

        if dialect in {'mysql', 'mariadb'}:
            mysql_stmt = mysql_insert(table_model).values(insert_values)
            set_ = {col: getattr(mysql_stmt.inserted, col) for col in update_columns}
            return mysql_stmt.on_duplicate_key_update(**set_)

        raise NotImplementedError(f'Upsert not supported for dialect: {dialect}')

    async def _purge(self, metadata_kwargs: MetadataUpdateParams) -> None:
        """Delete every item of this storage and update its metadata in one transaction.

        Args:
            metadata_kwargs: Arguments to pass to _update_metadata.
        """
        # Fold pending buffer rows into the metadata first so the update below starts from current values.
        await self._process_buffers()

        items = self._ITEM_TABLE
        async with self.get_session(with_simple_commit=True) as session:
            await session.execute(delete(items).where(items.storage_id == self._id))
            await self._update_metadata(session, **metadata_kwargs)

    async def _drop(self) -> None:
        """Delete this storage's metadata row and its buffer rows.

        This operation is irreversible. Item rows are expected to disappear through CASCADE
        deletion from the metadata row; buffer rows are removed by a separate statement.
        """
        metadata, buffer = self._METADATA_TABLE, self._BUFFER_TABLE
        drop_metadata = delete(metadata).where(metadata.id == self._id)
        # Delete the buffer records with a separate query, since tables don't link via foreign key.
        drop_buffer = delete(buffer).where(buffer.storage_id == self._id)
        needs_fk_pragma = self._storage_client.get_dialect_name() == 'sqlite'

        async with self.get_session(with_simple_commit=True) as session:
            if needs_fk_pragma:
                # foreign_keys=ON is set at the connection level. Required for cascade deletion.
                await session.execute(text('PRAGMA foreign_keys=ON'))
            await session.execute(drop_metadata)
            await session.execute(drop_buffer)

    @overload
    async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ...
    @overload
    async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ...
    @overload
    async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ...

    async def _get_metadata(
        self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata]
    ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:
        """Retrieve client metadata."""
        # Process any pending buffer updates first
        await self._process_buffers()

        async with self.get_session() as session:
            orm_metadata = await session.get(self._METADATA_TABLE, self._id)
            if not orm_metadata:
                raise ValueError(f'{self._CLIENT_TYPE} with ID "{self._id}" not found.')

            return metadata_model.model_validate(orm_metadata)

    @abstractmethod
    def _specific_update_metadata(self, **kwargs: Any) -> dict[str, Any]:
        """Prepare storage-specific metadata updates.

        Must be implemented by concrete classes. The returned mapping is merged by
        `_update_metadata` into the values of its UPDATE statement; a `custom_stmt` entry,
        if present, is removed from the mapping and used as the statement instead of the
        default UPDATE.

        Args:
            **kwargs: Storage-specific update parameters.

        Returns:
            Mapping of values to set on the metadata row.
        """

    @abstractmethod
    def _prepare_buffer_data(self, **kwargs: Any) -> dict[str, Any]:
        """Prepare storage-specific buffer data. Must be implemented by concrete classes.

        The returned mapping is merged by `_add_buffer_record` into the values of the new
        buffer row, on top of `storage_id`, `accessed_at` and `modified_at`.

        Args:
            **kwargs: Storage-specific values forwarded from `_add_buffer_record`.
        """

    @abstractmethod
    async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None:
        """Apply aggregated buffer updates to metadata. Must be implemented by concrete classes.

        Called by `_process_buffers` after the buffer lock was acquired and before the
        processed buffer rows (those with an ID up to `max_buffer_id`) are deleted.

        Args:
            session: Active database session.
            max_buffer_id: Maximum buffer record ID to process.
        """

    async def _update_metadata(
        self,
        session: AsyncSession,
        *,
        accessed_at: datetime | None = None,
        modified_at: datetime | None = None,
        **kwargs: Any,
    ) -> None:
        """Directly update storage metadata combining common and specific fields.

        Args:
            session: Active database session.
            accessed_at: Datetime to set as accessed_at timestamp.
            modified_at: Datetime to set as modified_at timestamp.
            **kwargs: Additional arguments for _specific_update_metadata.
        """
        values_to_set: dict[str, Any] = {}

        if accessed_at is not None:
            values_to_set['accessed_at'] = accessed_at

        if modified_at is not None:
            values_to_set['modified_at'] = modified_at

        values_to_set.update(self._specific_update_metadata(**kwargs))

        if values_to_set:
            if (stmt := values_to_set.pop('custom_stmt', None)) is None:
                stmt = update(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id)

            stmt = stmt.values(**values_to_set)
            await session.execute(stmt)

    async def _add_buffer_record(
        self,
        session: AsyncSession,
        *,
        update_modified_at: bool = False,
        **kwargs: Any,
    ) -> None:
        """Add a record to the buffer table and update metadata.

        Args:
            session: Active database session.
            update_modified_at: Whether to update modified_at timestamp.
            **kwargs: Additional arguments for _prepare_buffer_data.
        """
        now = datetime.now(timezone.utc)
        values_to_set = {
            'storage_id': self._id,
            'accessed_at': now,  # All entries in the buffer require updating `accessed_at`
            'modified_at': now if update_modified_at else None,
        }
        values_to_set.update(self._prepare_buffer_data(**kwargs))

        session.add(self._BUFFER_TABLE(**values_to_set))

    async def _try_acquire_buffer_lock(self, session: AsyncSession) -> bool:
        """Try to acquire buffer processing lock for a short period.

        The lock is the `buffer_locked_until` column of the metadata row. It is taken only when
        the column is NULL or in the past and at least one buffer row exists for this storage.
        On PostgreSQL/MySQL/MariaDB the row is first selected with `FOR UPDATE SKIP LOCKED`.

        Args:
            session: Active database session.

        Returns:
            True if lock was acquired, False if already locked by another process.

        Raises:
            OperationalError: If the row-lock query fails for any reason other than MariaDB
                error 1020.
        """
        capture_error_code = 1020  # MariaDB error code for "Record has changed since last read"
        now = datetime.now(timezone.utc)
        lock_until = now + self._BLOCK_BUFFER_TIME
        dialect = self._storage_client.get_dialect_name()

        if dialect in {'postgresql', 'mysql', 'mariadb'}:
            select_stmt = (
                select(self._METADATA_TABLE)
                .where(
                    self._METADATA_TABLE.id == self._id,
                    (self._METADATA_TABLE.buffer_locked_until.is_(None))
                    | (self._METADATA_TABLE.buffer_locked_until < now),
                    select(self._BUFFER_TABLE.id).where(self._BUFFER_TABLE.storage_id == self._id).exists(),
                )
                .with_for_update(skip_locked=True)
            )

            try:
                result = await session.execute(select_stmt)
            except OperationalError as e:
                # MariaDB raises error 1020 ("Record has changed since last read") instead of
                # silently skipping locked rows like MySQL/PostgreSQL. Treat it as lock not acquired.
                # The driver exception may be missing or carry empty `args`; fall back to a placeholder
                # so that reading the code can never raise and hide the original error.
                orig_args = getattr(e.orig, 'args', None) or (None,)
                if orig_args[0] == capture_error_code:
                    return False
                raise

            metadata_row = result.scalar_one_or_none()

            if metadata_row is None:
                # Either conditions not met OR row is locked by another process
                return False

        # Acquire lock only if not currently locked or lock has expired
        update_stmt = (
            update(self._METADATA_TABLE)
            .where(
                self._METADATA_TABLE.id == self._id,
                (self._METADATA_TABLE.buffer_locked_until.is_(None)) | (self._METADATA_TABLE.buffer_locked_until < now),
                select(self._BUFFER_TABLE.id).where(self._BUFFER_TABLE.storage_id == self._id).exists(),
            )
            .values(buffer_locked_until=lock_until)
        )

        result = await session.execute(update_stmt)
        result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result

        # A positive row count means this session is the one that moved `buffer_locked_until` forward.
        if result.rowcount > 0:
            await session.flush()
            return True

        return False

    async def _release_buffer_lock(self, session: AsyncSession) -> None:
        """Clear the buffer processing lock by resetting `buffer_locked_until` to NULL.

        Args:
            session: Active database session.
        """
        metadata = self._METADATA_TABLE
        unlock_stmt = update(metadata).where(metadata.id == self._id)
        await session.execute(unlock_stmt.values(buffer_locked_until=None))

    async def _has_pending_buffer_updates(self, session: AsyncSession) -> bool:
        """Tell whether this storage's `buffer_locked_until` column is currently set.

        Returns False only when buffer_locked_until is NULL (metadata is consistent).

        Args:
            session: Active database session.

        Returns:
            True if metadata might be inconsistent due to pending buffer updates.
        """
        metadata = self._METADATA_TABLE
        lock_stmt = select(metadata.buffer_locked_until).where(metadata.id == self._id)
        lock_value = (await session.execute(lock_stmt)).scalar()

        # Any non-NULL value means there are pending updates
        return lock_value is not None

    async def _process_buffers(self) -> None:
        """Process pending buffer updates and apply them to metadata."""
        async with self.get_session(with_simple_commit=True) as session:
            # Try to acquire buffer processing lock
            if not await self._try_acquire_buffer_lock(session):
                # Another process is currently processing buffers or lock acquisition failed
                return

            # Get the maximum buffer ID at this moment
            # This creates a consistent snapshot - records added during processing won't be included
            max_buffer_id_stmt = select(sql_func.max(self._BUFFER_TABLE.id)).where(
                self._BUFFER_TABLE.storage_id == self._id
            )

            result = await session.execute(max_buffer_id_stmt)
            max_buffer_id = result.scalar()

            if max_buffer_id is None:
                # No buffer records to process. Release the lock and exit.
                await self._release_buffer_lock(session)
                return

            # Apply aggregated buffer updates to metadata using only records <= max_buffer_id
            # This method is implemented by concrete storage classes
            await self._apply_buffer_updates(session, max_buffer_id=max_buffer_id)

            # Clean up only the processed buffer records (those <= max_buffer_id)
            delete_stmt = delete(self._BUFFER_TABLE).where(
                self._BUFFER_TABLE.storage_id == self._id, self._BUFFER_TABLE.id <= max_buffer_id
            )

            await session.execute(delete_stmt)

            # Release the lock after successful processing
            await self._release_buffer_lock(session)


================================================
FILE: src/crawlee/storage_clients/_sql/_dataset_client.py
================================================
from __future__ import annotations

from datetime import datetime, timezone
from logging import getLogger
from typing import TYPE_CHECKING, Any

from sqlalchemy import Select, insert, select
from sqlalchemy import func as sql_func
from typing_extensions import Self, override

from crawlee.storage_clients._base import DatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

from ._client_mixin import MetadataUpdateParams, SqlClientMixin
from ._db_models import DatasetItemDb, DatasetMetadataBufferDb, DatasetMetadataDb

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from sqlalchemy import Select
    from sqlalchemy.ext.asyncio import AsyncSession
    from typing_extensions import NotRequired

    from ._storage_client import SqlStorageClient


logger = getLogger(__name__)


class _DatasetMetadataUpdateParams(MetadataUpdateParams):
    """Parameters for updating dataset metadata.

    Extends the common timestamp parameters with the two ways of changing the item count
    understood by `SqlDatasetClient._specific_update_metadata`.
    """

    # Absolute value to store as the item count; takes precedence over `delta_item_count`.
    new_item_count: NotRequired[int]
    # Amount to add to the item count currently stored in the database.
    delta_item_count: NotRequired[int]


class SqlDatasetClient(DatasetClient, SqlClientMixin):
    """SQL implementation of the dataset client.

    This client persists dataset items to a SQL database using three tables for storage,
    retrieval and metadata buffering. Items are stored as JSON with automatic ordering preservation.

    The dataset data is stored in SQL database tables following the pattern:
    - `datasets` table: Contains dataset metadata (id, name, timestamps, item_count)
    - `dataset_records` table: Contains individual items with JSON data and auto-increment ordering
    - `dataset_metadata_buffer` table: Buffers metadata updates for performance optimization

    Items are stored as a JSON object in SQLite and as JSONB in PostgreSQL. These objects must be JSON-serializable.
    The `item_id` auto-increment primary key ensures insertion order is preserved.
    All operations are wrapped in database transactions with CASCADE deletion support.
    """

    _DEFAULT_NAME = 'default'
    """Default dataset name used when no name is provided."""

    _METADATA_TABLE = DatasetMetadataDb
    """SQLAlchemy model for dataset metadata."""

    _ITEM_TABLE = DatasetItemDb
    """SQLAlchemy model for dataset items."""

    _CLIENT_TYPE = 'Dataset'
    """Human-readable client type for error messages."""

    _BUFFER_TABLE = DatasetMetadataBufferDb
    """SQLAlchemy model for metadata buffer."""

    def __init__(
        self,
        *,
        id: str,
        storage_client: SqlStorageClient,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `SqlDatasetClient.open` class method to create a new instance.

        Args:
            id: ID of the dataset this client operates on.
            storage_client: The SQL storage client instance that provides database sessions.
        """
        super().__init__(id=id, storage_client=storage_client)

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        storage_client: SqlStorageClient,
    ) -> Self:
        """Open an existing dataset or create a new one.

        Delegates to `_safely_open`; a newly created dataset starts with `item_count` set to 0.

        Args:
            id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
            name: The name of the dataset for named (global scope) storages.
            alias: The alias of the dataset for unnamed (run scope) storages.
            storage_client: The SQL storage client instance.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If a dataset with the specified ID is not found, or if more than one of
                `id`, `name` and `alias` is specified.
        """
        return await cls._safely_open(
            id=id,
            name=name,
            alias=alias,
            storage_client=storage_client,
            metadata_model=DatasetMetadata,
            extra_metadata_fields={'item_count': 0},
        )

    @override
    async def get_metadata(self) -> DatasetMetadata:
        """Return the dataset metadata read from the database and validated as `DatasetMetadata`."""
        # The database is a single place of truth
        return await self._get_metadata(DatasetMetadata)

    @override
    async def drop(self) -> None:
        """Delete this dataset and all its items from the database.

        This operation is irreversible. Uses CASCADE deletion to remove all related items.
        The dataset's metadata buffer records are deleted as well (see `_drop`).
        """
        await self._drop()

    @override
    async def purge(self) -> None:
        """Remove all items from this dataset while keeping the dataset structure.

        Resets item_count to 0 and deletes all records from dataset_records table.
        `accessed_at` and `modified_at` are both set to the current UTC time.
        """
        # One timestamp for both fields so they are identical after the purge.
        now = datetime.now(timezone.utc)
        await self._purge(
            metadata_kwargs=_DatasetMetadataUpdateParams(
                new_item_count=0,
                accessed_at=now,
                modified_at=now,
            )
        )

    @override
    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
        """Insert one item or a list of items and buffer the matching metadata change.

        The items and a buffer record (carrying the number of inserted items and a new
        `modified_at`) are written in the same transaction. An empty list is a no-op.

        Args:
            data: A single JSON-serializable item or a list of such items.
        """
        if not isinstance(data, list):
            data = [data]

        if not data:
            # Nothing to store. Passing an empty list to `insert(...).values()` would not produce a no-op
            # statement, so skip the database round trip entirely.
            return

        db_items = [{'dataset_id': self._id, 'data': item} for item in data]
        stmt = insert(self._ITEM_TABLE).values(db_items)

        async with self.get_session(with_simple_commit=True) as session:
            await session.execute(stmt)

            await self._add_buffer_record(session, update_modified_at=True, delta_item_count=len(data))

    @override
    async def get_data(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> DatasetItemsListPage:
        """Return one page of dataset items together with paging information.

        `offset`, `limit`, `desc` and `skip_empty` shape the query. The remaining arguments are
        forwarded to `_prepare_get_stmt`, which logs a warning for those it does not support.

        Returns:
            A page whose `total` is the dataset's stored item count and whose `count` is the
            number of items actually returned.
        """
        stmt = self._prepare_get_stmt(
            offset=offset,
            limit=limit,
            clean=clean,
            desc=desc,
            fields=fields,
            omit=omit,
            unwind=unwind,
            skip_empty=skip_empty,
            skip_hidden=skip_hidden,
            flatten=flatten,
            view=view,
        )

        # Start from an empty page: `get_session(with_simple_commit=True)` logs and suppresses SQLAlchemy
        # errors, so the block below can exit before any rows were read.
        items: list[dict[str, Any]] = []

        async with self.get_session(with_simple_commit=True) as session:
            result = await session.execute(stmt)
            items = [db_item.data for db_item in result.scalars().all()]

            # Record the read access in the metadata buffer.
            await self._add_buffer_record(session)

        metadata = await self.get_metadata()
        return DatasetItemsListPage(
            items=items,
            count=len(items),
            desc=desc,
            limit=limit or 0,
            offset=offset or 0,
            total=metadata.item_count,
        )

    @override
    async def iterate_items(
        self,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
    ) -> AsyncIterator[dict[str, Any]]:
        """Yield the `data` payload of each matching dataset item.

        `offset`, `limit`, `desc` and `skip_empty` shape the query. The remaining arguments are
        forwarded to `_prepare_get_stmt`, which logs a warning for those it does not support.
        """
        stmt = self._prepare_get_stmt(
            offset=offset,
            limit=limit,
            clean=clean,
            desc=desc,
            fields=fields,
            omit=omit,
            unwind=unwind,
            skip_empty=skip_empty,
            skip_hidden=skip_hidden,
        )

        async with self.get_session(with_simple_commit=True) as session:
            # Rows are consumed through an async stream rather than fetched with `.all()`.
            db_items = await session.stream_scalars(stmt)

            async for db_item in db_items:
                yield db_item.data

            # Reached only once the stream is exhausted; records the read access in the metadata buffer.
            await self._add_buffer_record(session)

    def _prepare_get_stmt(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> Select:
        """Build the SELECT used to read items of this dataset.

        Only `offset`, `limit`, `desc` and `skip_empty` influence the statement. Any other
        argument that is set to something other than `False`/`None` is reported in a warning.
        """
        not_implemented = (
            ('clean', clean),
            ('fields', fields),
            ('omit', omit),
            ('unwind', unwind),
            ('skip_hidden', skip_hidden),
            ('flatten', flatten),
            ('view', view),
        )
        ignored_names = [arg_name for arg_name, arg_value in not_implemented if arg_value not in (False, None)]

        if ignored_names:
            logger.warning(
                f'The arguments {ignored_names} of get_data are not supported by the '
                f'{self.__class__.__name__} client.'
            )

        items = self._ITEM_TABLE
        query = select(items).where(items.dataset_id == self._id)

        if skip_empty:
            # Skip items that are empty JSON objects
            query = query.where(items.data != {})

        # `item_id` reflects insertion order, so it doubles as the sort key.
        ordering = items.item_id.desc() if desc else items.item_id.asc()

        return query.order_by(ordering).offset(offset).limit(limit)

    @override
    def _specific_update_metadata(
        self,
        new_item_count: int | None = None,
        delta_item_count: int | None = None,
        **_kwargs: Any,
    ) -> dict[str, Any]:
        """Prepare the dataset-specific values for a metadata update.

        Nothing is written here; the returned mapping is merged into the UPDATE statement
        built by `_update_metadata`.

        Args:
            new_item_count: If provided, set item count to this value.
            delta_item_count: If provided, add this value to the current item count. Ignored
                when `new_item_count` is given or when the delta is zero.
            **_kwargs: Extra keyword arguments; accepted and ignored.

        Returns:
            Mapping with an `item_count` entry when the count should change, otherwise empty.
        """
        values_to_set: dict[str, Any] = {}

        if new_item_count is not None:
            values_to_set['item_count'] = new_item_count
        elif delta_item_count:
            # Use database-level for atomic updates
            values_to_set['item_count'] = self._METADATA_TABLE.item_count + delta_item_count

        return values_to_set

    @override
    def _prepare_buffer_data(self, delta_item_count: int | None = None, **_kwargs: Any) -> dict[str, Any]:
        """Build the dataset-specific columns of a metadata buffer row.

        Args:
            delta_item_count: If provided, add this value to the current item count.

        Returns:
            A mapping with `delta_item_count` when a value was given, otherwise an empty mapping.
        """
        if delta_item_count is None:
            return {}

        return {'delta_item_count': delta_item_count}

    @override
    async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None:
        """Aggregate this dataset's buffer rows up to `max_buffer_id` and apply them to the metadata.

        The latest `accessed_at`/`modified_at` and the summed `delta_item_count` are passed
        to `_update_metadata`.

        Args:
            session: Active database session.
            max_buffer_id: Maximum buffer record ID to process.
        """
        buffer = self._BUFFER_TABLE
        totals_stmt = select(
            sql_func.max(buffer.accessed_at).label('max_accessed_at'),
            sql_func.max(buffer.modified_at).label('max_modified_at'),
            sql_func.sum(buffer.delta_item_count).label('delta_item_count'),
        ).where(buffer.storage_id == self._id, buffer.id <= max_buffer_id)

        totals = (await session.execute(totals_stmt)).first()

        if totals:
            update_params = _DatasetMetadataUpdateParams(
                accessed_at=totals.max_accessed_at,
                modified_at=totals.max_modified_at,
                delta_item_count=totals.delta_item_count,
            )
            await self._update_metadata(session, **update_params)


================================================
FILE: src/crawlee/storage_clients/_sql/_db_models.py
================================================
from __future__ import annotations

from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any

from sqlalchemy import JSON, BigInteger, Boolean, ForeignKey, Index, Integer, LargeBinary, String, Text, text
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship, synonym
from sqlalchemy.types import DateTime, TypeDecorator
from typing_extensions import override

if TYPE_CHECKING:
    from sqlalchemy.engine import Dialect
    from sqlalchemy.types import TypeEngine


class AwareDateTime(TypeDecorator):
    """SQLAlchemy column type that always hands back timezone-aware datetimes.

    Values read from the database without timezone information are tagged as UTC;
    values that already carry a timezone, and `None`, are returned untouched.
    """

    impl = DateTime(timezone=True)
    cache_ok = True

    @override
    def process_result_value(self, value: datetime | None, dialect: Dialect) -> datetime | None:
        """Attach UTC to a naive datetime loaded from the database."""
        # Nothing to do for NULLs and for values that are already timezone-aware.
        if value is None or value.tzinfo is not None:
            return value
        return value.replace(tzinfo=timezone.utc)


class JsonField(TypeDecorator):
    """JSON column type that maps to JSONB on PostgreSQL and to the generic JSON type elsewhere."""

    impl = JSON
    cache_ok = True

    def load_dialect_impl(self, dialect: Dialect) -> TypeEngine[JSON | JSONB]:
        """Return the dialect-level type descriptor matching the connected database."""
        json_type = JSONB() if dialect.name == 'postgresql' else JSON()
        return dialect.type_descriptor(json_type)


class Base(DeclarativeBase):
    """Declarative base class shared by the SQL storage models in this module.

    Subclassing SQLAlchemy's `DeclarativeBase` here gives the models correct type annotations.
    """


class StorageMetadataDb:
    """Base database model for storage metadata.

    A plain mixin (it does not inherit from `Base`) that holds the columns shared by the dataset,
    key-value store and request queue metadata tables: naming, timestamps and the buffer lock.
    """

    internal_name: Mapped[str] = mapped_column(String(255), nullable=False, index=True, unique=True)
    """Internal unique name for a storage instance based on a name or alias."""

    # NOTE(review): this column is nullable and nothing in the model replaces None with 'default';
    # the remark below presumably describes caller behavior (or is outdated) - confirm.
    name: Mapped[str | None] = mapped_column(String(255), nullable=True, unique=True)
    """Human-readable name. None becomes 'default' in database to enforce uniqueness."""

    accessed_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)
    """Last access datetime for usage tracking."""

    created_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)
    """Creation datetime."""

    modified_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)
    """Last modification datetime."""

    buffer_locked_until: Mapped[datetime | None] = mapped_column(AwareDateTime, nullable=True)
    """Timestamp until which buffer processing is locked for this storage. NULL = unlocked."""


class DatasetMetadataDb(StorageMetadataDb, Base):
    """Metadata table for datasets.

    Combines the shared metadata columns from `StorageMetadataDb` with the dataset id and item count.
    """

    __tablename__ = 'datasets'

    dataset_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True)
    """Unique identifier for the dataset."""

    item_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    """Number of items in the dataset."""

    # Relationship to dataset items with cascade deletion.
    # `lazy='noload'` means accessing this attribute does not load the items from the database.
    items: Mapped[list[DatasetItemDb]] = relationship(
        back_populates='dataset', cascade='all, delete-orphan', lazy='noload'
    )

    id = synonym('dataset_id')
    """Alias for dataset_id to match Pydantic expectations."""


class RequestQueueMetadataDb(StorageMetadataDb, Base):
    """Metadata table for request queues.

    Combines the shared metadata columns from `StorageMetadataDb` with the queue id, the request
    counters and the multi-client flag.
    """

    __tablename__ = 'request_queues'

    request_queue_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True)
    """Unique identifier for the request queue."""

    had_multiple_clients: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    """Flag indicating if multiple clients have accessed this queue."""

    handled_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    """Number of requests processed."""

    pending_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    """Number of requests waiting to be processed."""

    total_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    """Total number of requests ever added to this queue."""

    # Relationship to queue requests with cascade deletion.
    # `lazy='noload'` means accessing this attribute does not load the requests from the database.
    requests: Mapped[list[RequestDb]] = relationship(
        back_populates='queue', cascade='all, delete-orphan', lazy='noload'
    )
    # Relationship to queue state. Annotated as a single object (not a list), i.e. one state row per queue.
    state: Mapped[RequestQueueStateDb] = relationship(
        back_populates='queue', cascade='all, delete-orphan', lazy='noload'
    )

    id = synonym('request_queue_id')
    """Alias for request_queue_id to match Pydantic expectations."""


class KeyValueStoreMetadataDb(StorageMetadataDb, Base):
    """Metadata table for key-value stores.

    Adds only the store id to the shared metadata columns from `StorageMetadataDb`.
    """

    __tablename__ = 'key_value_stores'

    key_value_store_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True)
    """Unique identifier for the key-value store."""

    # Relationship to store records with cascade deletion.
    # `lazy='noload'` means accessing this attribute does not load the records from the database.
    records: Mapped[list[KeyValueStoreRecordDb]] = relationship(
        back_populates='kvs', cascade='all, delete-orphan', lazy='noload'
    )

    id = synonym('key_value_store_id')
    """Alias for key_value_store_id to match Pydantic expectations."""


class KeyValueStoreRecordDb(Base):
    """Records table for key-value stores.

    Each row is one key-value pair. The primary key is the composite
    (`key_value_store_id`, `key`), so a key is unique within its store.
    """

    __tablename__ = 'key_value_store_records'

    key_value_store_id: Mapped[str] = mapped_column(
        String(20),
        ForeignKey('key_value_stores.key_value_store_id', ondelete='CASCADE'),
        primary_key=True,
        index=True,
        nullable=False,
    )
    """Foreign key to metadata key-value store record."""

    key: Mapped[str] = mapped_column(String(255), primary_key=True)
    """The key part of the key-value pair."""

    value: Mapped[bytes] = mapped_column(LargeBinary, nullable=False)
    """Value stored as binary data to support any content type."""

    content_type: Mapped[str] = mapped_column(String(50), nullable=False)
    """MIME type for proper value deserialization."""

    # The column is NOT NULL with a default of 0, so the Python-side type is a plain `int`.
    size: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    """Size of stored value in bytes."""

    # Relationship back to parent store
    kvs: Mapped[KeyValueStoreMetadataDb] = relationship(back_populates='records')

    storage_id = synonym('key_value_store_id')
    """Alias for key_value_store_id to match SqlClientMixin expectations."""


class DatasetItemDb(Base):
    """Items table for datasets.

    Each row stores one JSON item together with the id of the dataset it belongs to.
    """

    __tablename__ = 'dataset_records'

    item_id: Mapped[int] = mapped_column(Integer, primary_key=True)
    """Auto-increment primary key preserving insertion order."""

    # Indexed so that items can be looked up by dataset; rows are removed with the parent (ON DELETE CASCADE).
    dataset_id: Mapped[str] = mapped_column(
        String(20),
        ForeignKey('datasets.dataset_id', ondelete='CASCADE'),
        index=True,
    )
    """Foreign key to metadata dataset record."""

    # Stored through `JsonField`, i.e. JSONB on PostgreSQL and JSON on other databases.
    data: Mapped[list[dict[str, Any]] | dict[str, Any]] = mapped_column(JsonField, nullable=False)
    """JSON serializable item data."""

    # Relationship back to parent dataset
    dataset: Mapped[DatasetMetadataDb] = relationship(back_populates='items')

    storage_id = synonym('dataset_id')
    """Alias for dataset_id to match SqlClientMixin expectations."""


class RequestDb(Base):
    """Requests table for request queues.

    The primary key is the composite (`request_id`, `request_queue_id`). Two extra indexes are declared
    in `__table_args__`.
    """

    __tablename__ = 'request_queue_records'
    __table_args__ = (
        # Index over (queue, handled flag, sequence number). On PostgreSQL it is a partial index
        # restricted to unhandled rows; other dialects ignore `postgresql_where`.
        Index(
            'idx_fetch_available',
            'request_queue_id',
            'is_handled',
            'sequence_number',
            postgresql_where=text('is_handled = false'),
        ),
        # Index over (queue, handled flag); judging by its name, presumably serves count aggregation.
        Index(
            'idx_count_aggregate',
            'request_queue_id',
            'is_handled',
        ),
    )

    request_id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    """Unique identifier for the request representing the unique_key."""

    request_queue_id: Mapped[str] = mapped_column(
        String(20), ForeignKey('request_queues.request_queue_id', ondelete='CASCADE'), primary_key=True
    )
    """Foreign key to metadata request queue record."""

    data: Mapped[str] = mapped_column(Text, nullable=False)
    """JSON-serialized Request object."""

    sequence_number: Mapped[int] = mapped_column(Integer, nullable=False)
    """Ordering sequence: negative for forefront, positive for regular."""

    is_handled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    """Processing status flag."""

    time_blocked_until: Mapped[datetime | None] = mapped_column(AwareDateTime, nullable=True)
    """Timestamp until which this request is considered blocked for processing by other clients."""

    client_key: Mapped[str | None] = mapped_column(String(32), nullable=True)
    """Identifier of the client that has currently locked this request for processing."""

    # Relationship back to metadata table
    queue: Mapped[RequestQueueMetadataDb] = relationship(back_populates='requests')

    storage_id = synonym('request_queue_id')
    """Alias for request_queue_id to match SqlClientMixin expectations."""


class RequestQueueStateDb(Base):
    """State table for request queues.

    Holds one row per queue (the queue id is the primary key) with the two sequence counters.
    """

    __tablename__ = 'request_queue_state'

    request_queue_id: Mapped[str] = mapped_column(
        String(20), ForeignKey('request_queues.request_queue_id', ondelete='CASCADE'), primary_key=True
    )
    """Foreign key to metadata request queue record."""

    # Starts at 1.
    sequence_counter: Mapped[int] = mapped_column(Integer, nullable=False, default=1)
    """Counter for regular request ordering (positive)."""

    # Starts at -1.
    forefront_sequence_counter: Mapped[int] = mapped_column(Integer, nullable=False, default=-1)
    """Counter for forefront request ordering (negative)."""

    # Relationship back to metadata table
    queue: Mapped[RequestQueueMetadataDb] = relationship(back_populates='state')


class VersionDb(Base):
    """Table for storing the database schema version."""

    __tablename__ = 'version'

    # Version string of at most 10 characters; it doubles as the table's primary key.
    version: Mapped[str] = mapped_column(String(10), nullable=False, primary_key=True)


class MetadataBufferDb:
    """Base model for metadata update buffer tables.

    A plain mixin (it does not inherit from `Base`) with the columns common to every buffer table.
    """

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    """Auto-increment primary key for ordering."""

    # Timestamp fields - use max value when aggregating
    # NOTE(review): `accessed_at` is NOT NULL, so every buffer row carries one even though the
    # description below reads as optional; only `modified_at` may be NULL.
    accessed_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)
    """New accessed_at timestamp, if being updated."""

    modified_at: Mapped[datetime | None] = mapped_column(AwareDateTime, nullable=True)
    """New modified_at timestamp, if being updated."""


class KeyValueStoreMetadataBufferDb(MetadataBufferDb, Base):
    """Buffer table for deferred key-value store metadata updates to reduce concurrent access issues.

    Besides the shared buffer columns it stores only the id of the store the update belongs to.
    """

    __tablename__ = 'key_value_store_metadata_buffer'

    # Don't use foreign key constraint to avoid DB locks on high concurrency.
    # The column is indexed so buffered rows can be selected per store.
    key_value_store_id: Mapped[str] = mapped_column(String(20), nullable=False, index=True)
    """ID of the key-value store being updated."""

    storage_id = synonym('key_value_store_id')
    """Alias for key_value_store_id to match SqlClientMixin expectations."""


class DatasetMetadataBufferDb(MetadataBufferDb, Base):
    """Buffer table for deferred dataset metadata updates to reduce concurrent access issues.

    Extends the shared buffer columns with the dataset id and an optional item count delta.
    """

    __tablename__ = 'dataset_metadata_buffer'

    # Don't use foreign key constraint to avoid DB locks on high concurrency.
    # The column is indexed so buffered rows can be selected per dataset.
    dataset_id: Mapped[str] = mapped_column(String(20), nullable=False, index=True)
    """ID of the dataset being updated."""

    # Counter deltas - use SUM when aggregating.
    # Nullable: a buffered row does not have to carry a count change.
    delta_item_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    """Delta for dataset item_count."""

    storage_id = synonym('dataset_id')
    """Alias for dataset_id to match SqlClientMixin expectations."""


class RequestQueueMetadataBufferDb(MetadataBufferDb, Base):
    """Buffer table for deferred request queue metadata updates to reduce concurrent access issues.

    Extends the shared buffer columns with the queue id, the id of the client that produced the update,
    optional counter deltas and a recalculation flag.
    """

    __tablename__ = 'request_queue_metadata_buffer'

    # Composite index over (request_queue_id, client_id); `request_queue_id` also has its own
    # single-column index via `index=True` below.
    __table_args__ = (Index('idx_rq_client', 'request_queue_id', 'client_id'),)

    # Don't use foreign key constraint to avoid DB locks on high concurrency.
    request_queue_id: Mapped[str] = mapped_column(String(20), nullable=False, index=True)
    """ID of the request queue being updated."""

    client_id: Mapped[str] = mapped_column(String(32), nullable=False)
    """Identifier of the client making this update."""

    # Counter deltas - use SUM when aggregating.
    # All three are nullable: a buffered row does not have to change every counter.
    delta_handled_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    """Delta for handled_request_count."""

    delta_pending_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    """Delta for pending_request_count."""

    delta_total_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    """Delta for total_request_count."""

    need_recalc: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    """Flag indicating that counters need recalculation from actual data."""

    storage_id = synonym('request_queue_id')
    """Alias for request_queue_id to match SqlClientMixin expectations."""


================================================
FILE: src/crawlee/storage_clients/_sql/_key_value_store_client.py
================================================
from __future__ import annotations

import json
from datetime import datetime, timezone
from logging import getLogger
from typing import TYPE_CHECKING, Any, cast

from sqlalchemy import CursorResult, delete, select
from sqlalchemy import func as sql_func
from typing_extensions import Self, override

from crawlee._utils.file import infer_mime_type
from crawlee.storage_clients._base import KeyValueStoreClient
from crawlee.storage_clients.models import (
    KeyValueStoreMetadata,
    KeyValueStoreRecord,
    KeyValueStoreRecordMetadata,
)

from ._client_mixin import MetadataUpdateParams, SqlClientMixin
from ._db_models import KeyValueStoreMetadataBufferDb, KeyValueStoreMetadataDb, KeyValueStoreRecordDb

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from sqlalchemy.ext.asyncio import AsyncSession

    from ._storage_client import SqlStorageClient


# Module-level logger named after this module; used below to warn about values that fail to decode.
logger = getLogger(__name__)


class SqlKeyValueStoreClient(KeyValueStoreClient, SqlClientMixin):
    """SQL implementation of the key-value store client.

    This client persists key-value data to a SQL database with transaction support and
    concurrent access safety. Keys are mapped to rows in database tables with proper indexing
    for efficient retrieval.

    The key-value store data is stored in SQL database tables following the pattern:
    - `key_value_stores` table: Contains store metadata (id, name, timestamps)
    - `key_value_store_records` table: Contains individual key-value pairs with binary value storage, content type,
    and size information
    - `key_value_store_metadata_buffer` table: Buffers metadata updates for performance optimization

    Values are serialized based on their type: values with a JSON content type are stored as UTF-8 encoded
    JSON text, text values as UTF-8 encoded strings, and binary data as-is in the `LargeBinary` column.
    `None` is stored as an empty value under the special `application/x-none` content type.
    The implementation automatically handles content type detection and maintains metadata
    about each record including size and MIME type information.

    All database operations are wrapped in transactions with proper error handling and rollback
    mechanisms. The client supports atomic upsert operations and handles race conditions when
    multiple clients access the same store using composite primary keys (key_value_store_id, key).
    """

    _DEFAULT_NAME = 'default'
    """Default key-value store name used when no name is provided."""

    _METADATA_TABLE = KeyValueStoreMetadataDb
    """SQLAlchemy model for key-value store metadata."""

    _ITEM_TABLE = KeyValueStoreRecordDb
    """SQLAlchemy model for key-value store items."""

    _CLIENT_TYPE = 'Key-value store'
    """Human-readable client type for error messages."""

    _BUFFER_TABLE = KeyValueStoreMetadataBufferDb
    """SQLAlchemy model for metadata buffer."""

    def __init__(
        self,
        *,
        storage_client: SqlStorageClient,
        id: str,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `SqlKeyValueStoreClient.open` class method to create a new instance.

        Args:
            storage_client: The SQL storage client used to access the database.
            id: The ID of the key-value store this client operates on.
        """
        super().__init__(id=id, storage_client=storage_client)

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        storage_client: SqlStorageClient,
    ) -> Self:
        """Open or create a SQL key-value store client.

        This method attempts to open an existing key-value store from the SQL database. If a KVS with the specified
        ID or name exists, it loads the metadata from the database. If no existing store is found, a new one
        is created.

        Args:
            id: The ID of the key-value store to open. If provided, searches for existing store by ID.
            name: The name of the key-value store for named (global scope) storages.
            alias: The alias of the key-value store for unnamed (run scope) storages.
            storage_client: The SQL storage client used to access the database.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If a store with the specified ID is not found, or if metadata is invalid.
        """
        return await cls._safely_open(
            id=id,
            name=name,
            alias=alias,
            storage_client=storage_client,
            metadata_model=KeyValueStoreMetadata,
            extra_metadata_fields={},
        )

    @override
    async def get_metadata(self) -> KeyValueStoreMetadata:
        """Return the store metadata as a `KeyValueStoreMetadata` model."""
        # The database is a single place of truth
        return await self._get_metadata(KeyValueStoreMetadata)

    @override
    async def drop(self) -> None:
        """Delete this key-value store and all its records from the database.

        This operation is irreversible. Uses CASCADE deletion to remove all related records.
        """
        await self._drop()

    @override
    async def purge(self) -> None:
        """Remove all items from this key-value store while keeping the key-value store structure.

        Remove all records from key_value_store_records table.
        """
        # Both timestamps are set to the same instant of the purge.
        now = datetime.now(timezone.utc)
        await self._purge(metadata_kwargs=MetadataUpdateParams(accessed_at=now, modified_at=now))

    @override
    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
        """Store `value` under `key`, inserting a new record or overwriting the existing one.

        Args:
            key: Key of the record within this store.
            value: Value to store. `None` is stored as an empty value with the `application/x-none` content type.
            content_type: MIME type of the value. When omitted, it is inferred from the value.
        """
        # Special handling for None values
        if value is None:
            content_type = 'application/x-none'  # Special content type to identify None values
            value_bytes = b''
        else:
            content_type = content_type or infer_mime_type(value)

            # Serialize the value to bytes.
            if 'application/json' in content_type:
                value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8')
            elif isinstance(value, str):
                value_bytes = value.encode('utf-8')
            elif isinstance(value, (bytes, bytearray)):
                value_bytes = value
            else:
                # Fallback: attempt to convert to string and encode.
                value_bytes = str(value).encode('utf-8')

        size = len(value_bytes)
        insert_values = {
            'key_value_store_id': self._id,
            'key': key,
            'value': value_bytes,
            'content_type': content_type,
            'size': size,
        }

        # On a conflict on (store id, key) only the payload columns are overwritten.
        upsert_stmt = self._build_upsert_stmt(
            self._ITEM_TABLE,
            insert_values=insert_values,
            update_columns=['value', 'content_type', 'size'],
            conflict_cols=['key_value_store_id', 'key'],
        )

        async with self.get_session(with_simple_commit=True) as session:
            await session.execute(upsert_stmt)

            # A write changes the store, so the buffered metadata update also touches `modified_at`.
            await self._add_buffer_record(session, update_modified_at=True)

    @override
    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
        """Load the record stored under `key` and deserialize its value according to its content type.

        Args:
            key: Key of the record to read.

        Returns:
            The record, or `None` when the key does not exist or when a JSON or text value cannot be decoded
            (a warning is logged in the latter case).
        """
        # Query the record by key
        stmt = select(self._ITEM_TABLE).where(
            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key
        )
        async with self.get_session(with_simple_commit=True) as session:
            result = await session.execute(stmt)
            record_db = result.scalar_one_or_none()

            # The buffer record is added whether or not the key was found.
            await self._add_buffer_record(session)

        if not record_db:
            return None

        # Deserialize the value based on content type
        value_bytes = record_db.value

        # Handle None values
        if record_db.content_type == 'application/x-none':
            value = None
        # Handle JSON values
        elif 'application/json' in record_db.content_type:
            try:
                value = json.loads(value_bytes.decode('utf-8'))
            except (json.JSONDecodeError, UnicodeDecodeError):
                logger.warning(f'Failed to decode JSON value for key "{key}"')
                return None
        # Handle text values
        elif record_db.content_type.startswith('text/'):
            try:
                value = value_bytes.decode('utf-8')
            except UnicodeDecodeError:
                logger.warning(f'Failed to decode text value for key "{key}"')
                return None
        # Handle binary values
        else:
            value = value_bytes

        return KeyValueStoreRecord(
            key=record_db.key,
            value=value,
            content_type=record_db.content_type,
            size=record_db.size,
        )

    @override
    async def delete_value(self, *, key: str) -> None:
        """Delete the record stored under `key`; does nothing when the key does not exist.

        Args:
            key: Key of the record to delete.
        """
        stmt = delete(self._ITEM_TABLE).where(
            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key
        )
        async with self.get_session(with_simple_commit=True) as session:
            # Delete the record if it exists
            result = await session.execute(stmt)
            # `cast` is a no-op at runtime, so both branches yield the same object; the expression exists
            # only to give `result` the `CursorResult` type (needed for `rowcount`).
            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result

            # Update metadata if we actually deleted something
            if result.rowcount > 0:
                await self._add_buffer_record(session, update_modified_at=True)

    @override
    async def iterate_keys(
        self,
        *,
        exclusive_start_key: str | None = None,
        limit: int | None = None,
    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
        """Iterate over the metadata of the records in this store, ordered by key.

        Args:
            exclusive_start_key: When given, only keys strictly greater than this one are returned.
            limit: When given, the maximum number of records to return.

        Yields:
            The key, content type and size of each matching record.
        """
        # Build query for record metadata
        stmt = (
            select(self._ITEM_TABLE.key, self._ITEM_TABLE.content_type, self._ITEM_TABLE.size)
            .where(self._ITEM_TABLE.key_value_store_id == self._id)
            .order_by(self._ITEM_TABLE.key)
        )

        # Apply exclusive_start_key filter
        if exclusive_start_key is not None:
            stmt = stmt.where(self._ITEM_TABLE.key > exclusive_start_key)

        # Apply limit
        if limit is not None:
            stmt = stmt.limit(limit)

        async with self.get_session(with_simple_commit=True) as session:
            # Rows are streamed instead of being fetched all at once.
            result = await session.stream(stmt.execution_options(stream_results=True))

            async for row in result:
                yield KeyValueStoreRecordMetadata(
                    key=row.key,
                    content_type=row.content_type,
                    size=row.size,
                )

            # Reached only after the stream has been consumed completely.
            await self._add_buffer_record(session)

    @override
    async def record_exists(self, *, key: str) -> bool:
        """Tell whether a record is stored under `key`.

        Args:
            key: Key to look up.

        Returns:
            `True` when a record with this key exists in the store, `False` otherwise.
        """
        stmt = select(self._ITEM_TABLE.key).where(
            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key
        )
        async with self.get_session(with_simple_commit=True) as session:
            # Check if record exists
            result = await session.execute(stmt)

            await self._add_buffer_record(session)

            return result.scalar_one_or_none() is not None

    @override
    async def get_public_url(self, *, key: str) -> str:
        """Public URLs are not available for this storage.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('Public URLs are not supported for SQL key-value stores.')

    @override
    def _specific_update_metadata(self, **_kwargs: Any) -> dict[str, Any]:
        """Return the store-specific metadata updates; a key-value store has none, so the dict is empty."""
        return {}

    @override
    def _prepare_buffer_data(self, **_kwargs: Any) -> dict[str, Any]:
        """Prepare key-value store specific buffer data.

        For KeyValueStore, we don't have specific metadata fields to track in buffer,
        so we just return empty dict. The base buffer will handle accessed_at/modified_at.
        """
        return {}

    @override
    async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None:
        """Aggregate this store's buffered metadata updates and apply them in one metadata update.

        Only buffer rows belonging to this storage with an id not greater than `max_buffer_id` are
        considered; both timestamps are combined with MAX.

        Args:
            session: Session used both for the aggregation query and for the metadata update.
            max_buffer_id: Highest buffer row id to include in the aggregation.
        """
        aggregation_stmt = select(
            sql_func.max(self._BUFFER_TABLE.accessed_at).label('max_accessed_at'),
            sql_func.max(self._BUFFER_TABLE.modified_at).label('max_modified_at'),
        ).where(self._BUFFER_TABLE.storage_id == self._id, self._BUFFER_TABLE.id <= max_buffer_id)

        result = await session.execute(aggregation_stmt)
        row = result.first()

        if not row:
            return

        await self._update_metadata(
            session,
            **MetadataUpdateParams(
                accessed_at=row.max_accessed_at,
                modified_at=row.max_modified_at,
            ),
        )


================================================
FILE: src/crawlee/storage_clients/_sql/_request_queue_client.py
================================================
from __future__ import annotations

from collections import deque
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from hashlib import sha256
from logging import getLogger
from typing import TYPE_CHECKING, Any, cast

from sqlalchemy import CursorResult, exists, func, or_, select, update
from sqlalchemy import func as sql_func
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import load_only
from typing_extensions import NotRequired, Self, override

from crawlee import Request
from crawlee._utils.crypto import crypto_random_object_id
from crawlee.storage_clients._base import RequestQueueClient
from crawlee.storage_clients.models import (
    AddRequestsResponse,
    ProcessedRequest,
    RequestQueueMetadata,
    UnprocessedRequest,
)

from ._client_mixin import MetadataUpdateParams, SqlClientMixin
from ._db_models import RequestDb, RequestQueueMetadataBufferDb, RequestQueueMetadataDb, RequestQueueStateDb

if TYPE_CHECKING:
    from collections.abc import Sequence

    from sqlalchemy.ext.asyncio import AsyncSession
    from sqlalchemy.sql import ColumnElement

    from ._storage_client import SqlStorageClient


# Module-level logger named after this module.
logger = getLogger(__name__)


class _QueueMetadataUpdateParams(MetadataUpdateParams):
    """Parameters for updating queue metadata.

    Extends `MetadataUpdateParams` with request-queue specific keys. Every key is optional
    (`NotRequired`).
    """

    # Judging by the names, `new_*` keys presumably carry absolute counter values and `delta_*` keys
    # increments to the current counters - confirm against the code that consumes them.
    new_handled_request_count: NotRequired[int]
    new_pending_request_count: NotRequired[int]
    new_total_request_count: NotRequired[int]
    delta_handled_request_count: NotRequired[int]
    delta_pending_request_count: NotRequired[int]
    delta_total_request_count: NotRequired[int]
    # Presumably requests recomputing the counters from the stored requests - TODO confirm.
    recalculate: NotRequired[bool]
    # Presumably asks for the `had_multiple_clients` flag to be updated - TODO confirm.
    update_had_multiple_clients: NotRequired[bool]


class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
    """SQL implementation of the request queue client.

    This client persists requests to a SQL database with transaction handling and
    concurrent access safety. Requests are stored with sequence-based ordering and
    efficient querying capabilities.

    The implementation uses negative sequence numbers for forefront (high-priority) requests
    and positive sequence numbers for regular requests, allowing for efficient single-query
    ordering. A cache mechanism reduces database queries.

    The request queue data is stored in SQL database tables following the pattern:
    - `request_queues` table: Contains queue metadata (id, name, timestamps, request counts, multi-client flag)
    - `request_queue_records` table: Contains individual requests with JSON data, unique keys for deduplication,
    sequence numbers for ordering, and processing status flags
    - `request_queue_state` table: Maintains counters for sequence numbers to ensure proper ordering of requests.
    - `request_queue_metadata_buffer` table: Buffers metadata updates for performance optimization

    Requests are serialized to JSON for storage and maintain proper ordering through sequence
    numbers. The implementation provides concurrent access safety through transaction
    handling, locking mechanisms, and optimized database indexes for efficient querying.
    """

    _DEFAULT_NAME = 'default'
    """Default request queue name used when no name is provided."""

    _MAX_BATCH_FETCH_SIZE = 10
    """Maximum number of requests to fetch from the database in a single batch operation.

    Used to limit the number of requests loaded and locked for processing at once (improves efficiency and reduces
    database load).
    """

    _METADATA_TABLE = RequestQueueMetadataDb
    """SQLAlchemy model for request queue metadata."""

    _ITEM_TABLE = RequestDb
    """SQLAlchemy model for request items."""

    _CLIENT_TYPE = 'Request queue'
    """Human-readable client type for error messages."""

    _BLOCK_REQUEST_TIME = 300
    """Number of seconds for which a request is considered blocked in the database after being fetched for processing.
    """

    _BUFFER_TABLE = RequestQueueMetadataBufferDb
    """SQLAlchemy model for metadata buffer."""

    def __init__(
        self,
        *,
        id: str,
        storage_client: SqlStorageClient,
    ) -> None:
        """Set up the in-memory state of a request queue client.

        Prefer the `SqlRequestQueueClient.open` class method over calling this constructor directly.

        Args:
            id: Identifier of the request queue; forwarded to the parent initializer.
            storage_client: SQL storage client; forwarded to the parent initializer.
        """
        super().__init__(id=id, storage_client=storage_client)

        # Unique identifier for this client instance, limited to 32 characters.
        random_key = crypto_random_object_id(length=32)
        self.client_key = random_key[:32]

        # Whether the queue has been accessed by multiple clients (refreshed by `get_metadata`).
        self._had_multiple_clients = False

        # Requests already reserved by this client and waiting to be handed out,
        # ordered by sequence number.
        self._pending_fetch_cache: deque[Request] = deque()

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        storage_client: SqlStorageClient,
    ) -> Self:
        """Open an existing request queue or create a new one.

        The work is delegated to `_safely_open`. An existing queue is looked up by ID or name and a
        client for it is returned; when none is found, a new queue is created with all request counters
        set to zero and the multi-client flag cleared.

        Args:
            id: The ID of the request queue to open. Takes precedence over name.
            name: The name of the request queue for named (global scope) storages.
            alias: The alias of the request queue for unnamed (run scope) storages.
            storage_client: The SQL storage client used to access the database.

        Returns:
            An instance for the opened or created request queue.

        Raises:
            ValueError: If a queue with the specified ID is not found.
        """
        # Initial values of the queue-specific metadata columns for a freshly created queue.
        initial_queue_fields = {
            'had_multiple_clients': False,
            'handled_request_count': 0,
            'pending_request_count': 0,
            'total_request_count': 0,
        }

        client = await cls._safely_open(
            id=id,
            name=name,
            alias=alias,
            storage_client=storage_client,
            metadata_model=RequestQueueMetadata,
            extra_metadata_fields=initial_queue_fields,
        )
        return client

    @override
    async def get_metadata(self) -> RequestQueueMetadata:
        """Load the queue metadata and refresh the locally kept multi-client flag.

        Returns:
            The metadata of this request queue as returned by `_get_metadata`.
        """
        current = await self._get_metadata(RequestQueueMetadata)
        # The database is the single source of truth, so mirror its flag on this instance.
        self._had_multiple_clients = current.had_multiple_clients
        return current

    @override
    async def drop(self) -> None:
        """Delete this request queue and all its records from the database.

        This operation is irreversible. Uses CASCADE deletion to remove all related records.
        """
        await self._drop()

        self._pending_fetch_cache.clear()

    @override
    async def purge(self) -> None:
        """Remove all requests from this request queue while keeping the queue itself.

        Delegates to `_purge`, asking it to reset `pending_request_count`, `handled_request_count` and
        `total_request_count` to 0 and to stamp both the accessed and modified timestamps with the current
        UTC time. The records of the `request_queue_records` table are deleted.
        """
        timestamp = datetime.now(timezone.utc)
        reset_params = _QueueMetadataUpdateParams(
            accessed_at=timestamp,
            modified_at=timestamp,
            new_pending_request_count=0,
            new_handled_request_count=0,
            new_total_request_count=0,
        )
        await self._purge(metadata_kwargs=reset_params)

        # The locally reserved requests were purged as well, so forget them.
        self._pending_fetch_cache.clear()

    @override
    async def add_batch_of_requests(
        self,
        requests: Sequence[Request],
        *,
        forefront: bool = False,
    ) -> AddRequestsResponse:
        """Add a batch of requests to the queue.

        Requests are deduplicated by the integer ID derived from their `unique_key`; when the batch contains
        the same key more than once, the first occurrence is used. Every remaining request is classified
        against the rows already stored for this queue:

        - not stored yet: inserted with the next forefront or regular sequence number,
        - stored and handled: reported as already present and handled, left untouched,
        - stored with `time_blocked_until` set: reported as already present, left untouched,
        - stored and pending: moved to the front when `forefront` is set, otherwise left in place.

        All writes happen in one transaction. If it fails with `SQLAlchemyError`, it is rolled back, a
        recalculation of the queue counters is scheduled, and the requests that were part of the
        transaction are reported as unprocessed.

        Args:
            requests: Requests to add.
            forefront: If True, new and still-pending requests get a forefront sequence number.

        Returns:
            The processed and unprocessed requests of this batch.
        """
        if not requests:
            return AddRequestsResponse(processed_requests=[], unprocessed_requests=[])

        processed_requests = []
        unprocessed_requests = []
        # Results that become valid only once the transaction below has been committed.
        transaction_processed_requests = []
        transaction_processed_requests_unique_keys = set()

        approximate_new_request = 0

        # Deduplicate requests upfront. The mapping is keyed by the integer request ID, so membership has
        # to be tested with that ID (testing the string `unique_key` would never match).
        unique_requests: dict[int, Request] = {}
        for req in requests:
            request_id = self._get_int_id_from_unique_key(req.unique_key)
            if request_id not in unique_requests:
                unique_requests[request_id] = req

        # Get existing requests by their IDs, loading only the columns needed for classification.
        stmt = (
            select(self._ITEM_TABLE)
            .where(
                self._ITEM_TABLE.request_queue_id == self._id,
                self._ITEM_TABLE.request_id.in_(set(unique_requests.keys())),
            )
            .options(
                load_only(
                    self._ITEM_TABLE.request_id,
                    self._ITEM_TABLE.is_handled,
                    self._ITEM_TABLE.time_blocked_until,
                )
            )
        )

        async with self.get_session() as session:
            result = await session.execute(stmt)
            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
            existing_requests = {req.request_id: req for req in result.scalars()}
            state = await self._get_state(session)
            insert_values: list[dict] = []

            # Iterate in request ID order so that rows are always written in the same order.
            for request_id, request in sorted(unique_requests.items()):
                existing_req_db = existing_requests.get(request_id)

                if existing_req_db is None:
                    # New request, add it with the next sequence number of the chosen end of the queue.
                    if forefront:
                        sequence_number = state.forefront_sequence_counter
                        state.forefront_sequence_counter -= 1
                    else:
                        sequence_number = state.sequence_counter
                        state.sequence_counter += 1

                    insert_values.append(
                        {
                            'request_id': request_id,
                            'request_queue_id': self._id,
                            'data': request.model_dump_json(),
                            'is_handled': False,
                            'sequence_number': sequence_number,
                        }
                    )
                    transaction_processed_requests.append(
                        ProcessedRequest(
                            unique_key=request.unique_key,
                            was_already_present=False,
                            was_already_handled=False,
                        )
                    )
                    transaction_processed_requests_unique_keys.add(request.unique_key)
                elif existing_req_db.is_handled:
                    # Already handled request, skip adding.
                    processed_requests.append(
                        ProcessedRequest(
                            unique_key=request.unique_key,
                            was_already_present=True,
                            was_already_handled=True,
                        )
                    )
                elif existing_req_db.time_blocked_until:
                    # Already in progress in one of the clients.
                    processed_requests.append(
                        ProcessedRequest(
                            unique_key=request.unique_key,
                            was_already_present=True,
                            was_already_handled=False,
                        )
                    )
                elif forefront:
                    # Stored, not handled and not in progress: move it to the front of the queue.
                    insert_values.append(
                        {
                            'request_queue_id': self._id,
                            'request_id': request_id,
                            'sequence_number': state.forefront_sequence_counter,
                            'data': request.model_dump_json(),
                            'is_handled': False,
                        }
                    )
                    state.forefront_sequence_counter -= 1
                    transaction_processed_requests.append(
                        ProcessedRequest(
                            unique_key=request.unique_key,
                            was_already_present=True,
                            was_already_handled=False,
                        )
                    )
                    transaction_processed_requests_unique_keys.add(request.unique_key)
                else:
                    # Stored, not handled and not in progress: a regular request keeps its position.
                    processed_requests.append(
                        ProcessedRequest(
                            unique_key=request.unique_key,
                            was_already_present=True,
                            was_already_handled=False,
                        )
                    )

            try:
                if insert_values:
                    if forefront:
                        # If the request already exists in the database, we update the sequence_number
                        # by shifting request to the left.
                        upsert_stmt = self._build_upsert_stmt(
                            self._ITEM_TABLE,
                            insert_values,
                            update_columns=['sequence_number'],
                            conflict_cols=['request_id', 'request_queue_id'],
                        )
                        result = await session.execute(upsert_stmt)
                    else:
                        # If the request already exists in the database, we ignore this request when inserting.
                        insert_stmt_with_ignore = self._build_insert_stmt_with_ignore(self._ITEM_TABLE, insert_values)
                        result = await session.execute(insert_stmt_with_ignore)

                    result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
                    # NOTE(review): for the upsert the reported row count presumably also covers rows that
                    # were updated rather than inserted, hence "approximate" - confirm per dialect.
                    approximate_new_request += result.rowcount

                await self._add_buffer_record(
                    session,
                    update_modified_at=True,
                    delta_pending_request_count=approximate_new_request,
                    delta_total_request_count=approximate_new_request,
                )

                await session.commit()
                processed_requests.extend(transaction_processed_requests)
            except SQLAlchemyError as e:
                await session.rollback()
                logger.debug(f'Failed add requests to DB with error: {e}')
                # The counters may no longer match the table, so ask for them to be recalculated.
                await self._add_buffer_record(
                    session,
                    update_modified_at=True,
                    recalculate=True,
                )
                await session.commit()
                transaction_processed_requests.clear()
                # Report every request of the failed transaction exactly once, even if the caller passed
                # the same unique key several times.
                unprocessed_requests.extend(
                    [
                        UnprocessedRequest(
                            unique_key=request.unique_key,
                            url=request.url,
                            method=request.method,
                        )
                        for request in unique_requests.values()
                        if request.unique_key in transaction_processed_requests_unique_keys
                    ]
                )

        return AddRequestsResponse(
            processed_requests=processed_requests,
            unprocessed_requests=unprocessed_requests,
        )

    @override
    async def get_request(self, unique_key: str) -> Request | None:
        """Look up a single request of this queue by its unique key.

        Args:
            unique_key: Unique key of the request to retrieve.

        Returns:
            The request deserialized from its stored JSON, or None (after logging a warning) when the
            queue holds no such request.
        """
        request_id = self._get_int_id_from_unique_key(unique_key)
        lookup_stmt = select(self._ITEM_TABLE).where(
            self._ITEM_TABLE.request_queue_id == self._id,
            self._ITEM_TABLE.request_id == request_id,
        )

        async with self.get_session(with_simple_commit=True) as session:
            record = (await session.execute(lookup_stmt)).scalar_one_or_none()

            if record is None:
                logger.warning(f'Request with ID "{unique_key}" not found in the queue.')
                return None

            # Register the access in the metadata buffer.
            await self._add_buffer_record(session)

        return Request.model_validate_json(record.data)

    @override
    async def fetch_next_request(self) -> Request | None:
        if self._pending_fetch_cache:
            return self._pending_fetch_cache.popleft()

        now = datetime.now(timezone.utc)
        block_until = now + timedelta(seconds=self._BLOCK_REQUEST_TIME)
        dialect = self._storage_client.get_dialect_name()

        # Get available requests not blocked by another client
        stmt = (
            select(self._ITEM_TABLE)
            .where(
                self._ITEM_TABLE.request_queue_id == self._id,
                self._ITEM_TABLE.is_handled == False,  # noqa: E712
                or_(self._ITEM_TABLE.time_blocked_until.is_(None), self._ITEM_TABLE.time_blocked_until < now),
            )
            .order_by(self._ITEM_TABLE.sequence_number.asc())
            .limit(self._MAX_BATCH_FETCH_SIZE)
        )

        async with self.get_session(with_simple_commit=True) as session:
            # We use the `skip_locked` database mechanism to prevent the 'interception' of requests by another client
            if dialect in {'postgresql', 'mysql', 'mariadb'}:
                stmt = stmt.with_for_update(skip_locked=True)
                result = await session.execute(stmt)
                requests_db = result.scalars().all()

                if not requests_db:
                    return None

                # All requests received have already been reserved for update with the help of `skip_locked`.
                request_ids = {r.request_id for r in requests_db}

                update_stmt = (
                    update(self._ITEM_TABLE)
                    .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id.in_(request_ids))
                    .values(time_blocked_until=block_until, client_key=self.client_key)
                )
                await session.execute(update_stmt)

                blocked_ids = request_ids
            else:
                # For other databases, we first select the requests, then try to update them to be blocked.
                result = await session.execute(stmt)
                requests_db = result.scalars().all()

                if not requests_db:
                    return None

                request_ids = {r.request_id for r in requests_db}

                update_stmt = (
                    update(self._ITEM_TABLE)
                    .where(
                        self._ITEM_TABLE.request_queue_id == self._id,
                        self._ITEM_TABLE.request_id.in_(request_ids),
                        self._ITEM_TABLE.is_handled == False,  # noqa: E712
                        or_(self._ITEM_TABLE.time_blocked_until.is_(None), self._ITEM_TABLE.time_blocked_until < now),
                    )
                    .values(time_blocked_until=block_until, client_key=self.client_key)
                    .returning(self._ITEM_TABLE.request_id)
                )

                update_result = await session.execute(update_stmt)
                blocked_ids = {row[0] for row in update_result.fetchall()}

                if not blocked_ids:
                    await session.rollback()
                    return None

            await self._add_buffer_record(session)

        requests = [Request.model_validate_json(r.data) for r in requests_db if r.request_id in blocked_ids]

        if not requests:
            return None

        self._pending_fetch_cache.extend(requests[1:])

        return requests[0]

    @override
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Flag a request as handled and release its block.

        Sets `request.handled_at` to the current UTC time when it is not set yet, then stores the
        request as handled with its block and client key cleared.

        Args:
            request: The request to mark as handled.

        Returns:
            Information about the processed request, or None (after logging a warning) when no row of
            this queue matches the request.
        """
        request_id = self._get_int_id_from_unique_key(request.unique_key)

        # The timestamp has to be set before the request is serialized below.
        if request.handled_at is None:
            request.handled_at = datetime.now(timezone.utc)

        mark_stmt = (
            update(self._ITEM_TABLE)
            .where(
                self._ITEM_TABLE.request_queue_id == self._id,
                self._ITEM_TABLE.request_id == request_id,
            )
            .values(
                is_handled=True,
                time_blocked_until=None,
                client_key=None,
                data=request.model_dump_json(),
            )
        )

        async with self.get_session(with_simple_commit=True) as session:
            outcome = await session.execute(mark_stmt)
            if not isinstance(outcome, CursorResult):
                outcome = cast('CursorResult', outcome)

            if outcome.rowcount == 0:
                logger.warning(f'Request {request.unique_key} not found in database.')
                return None

            # One request moved from pending to handled.
            await self._add_buffer_record(
                session,
                update_modified_at=True,
                delta_pending_request_count=-1,
                delta_handled_request_count=1,
            )

        return ProcessedRequest(
            unique_key=request.unique_key,
            was_already_present=True,
            was_already_handled=True,
        )

    @override
    async def reclaim_request(
        self,
        request: Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Return a request to the queue so that it can be processed again.

        The request receives a new sequence number. A forefront request stays blocked by this client
        for another `_BLOCK_REQUEST_TIME` seconds and is put at the beginning of the local cache; a
        regular request is unblocked and goes to the end of the queue.

        Args:
            request: The request to reclaim.
            forefront: If True, reclaim the request to the front of the queue.

        Returns:
            Information about the processed request, or None (after logging a warning) when no row of
            this queue matches the request.
        """
        request_id = self._get_int_id_from_unique_key(request.unique_key)

        async with self.get_session(with_simple_commit=True) as session:
            state = await self._get_state(session)

            if forefront:
                sequence_number = state.forefront_sequence_counter
                state.forefront_sequence_counter -= 1
                # Extend blocking for forefront request, it is considered blocked by the current client.
                blocked_until = datetime.now(timezone.utc) + timedelta(seconds=self._BLOCK_REQUEST_TIME)
                owner_key = self.client_key
            else:
                sequence_number = state.sequence_counter
                state.sequence_counter += 1
                blocked_until = None
                owner_key = None

            reclaim_stmt = (
                update(self._ITEM_TABLE)
                .where(
                    self._ITEM_TABLE.request_queue_id == self._id,
                    self._ITEM_TABLE.request_id == request_id,
                )
                .values(
                    sequence_number=sequence_number,
                    time_blocked_until=blocked_until,
                    client_key=owner_key,
                    data=request.model_dump_json(),
                )
            )

            outcome = await session.execute(reclaim_stmt)
            if not isinstance(outcome, CursorResult):
                outcome = cast('CursorResult', outcome)

            if outcome.rowcount == 0:
                logger.warning(f'Request {request.unique_key} not found in database.')
                return None

            await self._add_buffer_record(session, update_modified_at=True)

        # Put the forefront request at the beginning of the cache.
        if forefront:
            self._pending_fetch_cache.appendleft(request)

        return ProcessedRequest(
            unique_key=request.unique_key,
            was_already_present=True,
            was_already_handled=False,
        )

    @override
    async def is_empty(self) -> bool:
        # Check in-memory cache for requests
        if self._pending_fetch_cache:
            return False

        metadata = await self.get_metadata()

        async with self.get_session(with_simple_commit=True) as session:
            # If there are no pending requests, check if there are any buffered updates
            if metadata.pending_request_count == 0:
                # Check for active buffer lock (indicates pending buffer processing)
                buffer_lock_stmt = select(self._METADATA_TABLE.buffer_locked_until).where(
                    self._METADATA_TABLE.id == self._id
                )
                buffer_lock_result = await session.execute(buffer_lock_stmt)
                buffer_locked_until = buffer_lock_result.scalar()

                # If buffer is locked, there are pending updates being processed
                if buffer_locked_until is not None:
                    await self._add_buffer_record(session)
                    return False

                # Check if there are any buffered updates that might change the pending count
                buffer_check_stmt = select(
                    exists().where(
                        (self._BUFFER_TABLE.storage_id == self._id)
                        & (
                            (self._BUFFER_TABLE.delta_pending_count != 0) | (self._BUFFER_TABLE.need_recalc == True)  # noqa: E712
                        )
                    )
                )
                buffer_result = await session.execute(buffer_check_stmt)
                has_pending_buffer_updates = buffer_result.scalar()

                await self._add_buffer_record(session)
                # If there are no pending requests and no buffered updates, the queue is empty
                return not has_pending_buffer_updates

            # There are pending requests (may be inaccurate), ensure recalculated metadata
            await self._add_buffer_record(session, update_modified_at=True, recalculate=True)

        return False

    async def _get_state(self, session: AsyncSession) -> RequestQueueStateDb:
        """Fetch the state row of this request queue, creating it on first use.

        Args:
            session: The SQLAlchemy session to use for database operations.

        Returns:
            The ORM state object of this queue.

        Raises:
            RuntimeError: If the state row is still missing after the insert attempt.
        """
        existing_state: RequestQueueStateDb | None = await session.get(RequestQueueStateDb, self._id)
        if existing_state:
            return existing_state

        # Create a new state if it doesn't exist. The insert ignores conflicts as a safeguard against
        # race conditions where multiple clients might try to create the state simultaneously.
        create_stmt = self._build_insert_stmt_with_ignore(RequestQueueStateDb, {'request_queue_id': self._id})
        await session.execute(create_stmt)
        await session.flush()

        created_state = await session.get(RequestQueueStateDb, self._id)
        if not created_state:
            raise RuntimeError(f'Failed to create or retrieve state for queue {self._id}')
        return created_state

    @override
    def _specific_update_metadata(
        self,
        new_handled_request_count: int | None = None,
        new_pending_request_count: int | None = None,
        new_total_request_count: int | None = None,
        delta_handled_request_count: int | None = None,
        delta_pending_request_count: int | None = None,
        delta_total_request_count: int | None = None,
        *,
        recalculate: bool = False,
        update_had_multiple_clients: bool = False,
        **_kwargs: dict[str, Any],
    ) -> dict[str, Any]:
        """Update the request queue metadata in the database.

        Args:
            session: The SQLAlchemy session to use for database operations.
            new_handled_request_count: If provided, update the handled_request_count to this value.
            new_pending_request_count: If provided, update the pending_request_count to this value.
            new_total_request_count: If provided, update the total_request_count to this value.
            delta_handled_request_count: If provided, add this value to the handled_request_count.
            delta_pending_request_count: If provided, add this value to the pending_request_count.
            delta_total_request_count: If provided, add this value to the total_request_count.
            recalculate: If True, recalculate the pending_request_count, and total_request_count on request table.
            update_had_multiple_clients: If True, set had_multiple_clients to True.
        """
        values_to_set: dict[str, Any] = {}

        if update_had_multiple_clients:
            values_to_set['had_multiple_clients'] = True

        if recalculate:
            stmt = (
                update(self._METADATA_TABLE)
                .where(self._METADATA_TABLE.request_queue_id == self._id)
                .values(
                    pending_request_count=(
                        select(func.count())
                        .select_from(self._ITEM_TABLE)
                        .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.is_handled.is_(False))
                        .scalar_subquery()
                    ),
                    total_request_count=(
                        select(func.count())
                        .select_from(self._ITEM_TABLE)
                        .where(self._ITEM_TABLE.request_queue_id == self._id)
                        .scalar_subquery()
                    ),
                    handled_request_count=(
                        select(func.count())
                        .select_from(self._ITEM_TABLE)
                        .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.is_handled.is_(True))
                        .scalar_subquery()
                    ),
                )
            )

            values_to_set['custom_stmt'] = stmt

        else:
            if new_handled_request_count is not None:
                values_to_set['handled_request_count'] = new_handled_request_count
            elif delta_handled_request_count is not None:
                values_to_set['handled_request_count'] = (
                    self._METADATA_TABLE.handled_request_count + delta_handled_request_count
                )

            if new_pending_request_count is not None:
                values_to_set['pending_request_count'] = new_pending_request_count
            elif delta_pending_request_count is not None:
                values_to_set['pending_request_count'] = (
                    self._METADATA_TABLE.pending_request_count + delta_pending_request_count
                )

            if new_total_request_count is not None:
                values_to_set['total_request_count'] = new_total_request_count
            elif delta_total_request_count is not None:
                values_to_set['total_request_count'] = (
                    self._METADATA_TABLE.total_request_count + delta_total_request_count
                )

        return values_to_set

    @staticmethod
    @lru_cache(maxsize=10000)
    def _get_int_id_from_unique_key(unique_key: str) -> int:
        """Generate a deterministic integer ID for a unique_key.

        Args:
            unique_key: Unique key to be used to generate ID.

        Returns:
            An integer ID based on the unique_key.
        """
        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()
        name_length = 15
        return int(hashed_key[:name_length], 16)

    @override
    def _prepare_buffer_data(
        self,
        delta_handled_request_count: int | None = None,
        delta_pending_request_count: int | None = None,
        delta_total_request_count: int | None = None,
        *,
        recalculate: bool = False,
        **_kwargs: Any,
    ) -> dict[str, Any]:
        """Prepare request queue specific buffer data.

        Args:
            delta_handled_request_count: If provided, add this value to the handled_request_count.
            delta_pending_request_count: If provided, add this value to the pending_request_count.
            delta_total_request_count: If provided, add this value to the total_request_count.
            recalculate: If True, recalculate the pending_request_count, and total_request_count on request table.
        """
        buffer_data: dict[str, Any] = {
            'client_id': self.client_key,
        }

        if delta_handled_request_count:
            buffer_data['delta_handled_count'] = delta_handled_request_count

        if delta_pending_request_count:
            buffer_data['delta_pending_count'] = delta_pending_request_count

        if delta_total_request_count:
            buffer_data['delta_total_count'] = delta_total_request_count

        if recalculate:
            buffer_data['need_recalc'] = True

        return buffer_data

    @override
    async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None:
        """Aggregate the buffered metadata updates of this queue and apply them to its metadata.

        Args:
            session: The SQLAlchemy session to use for database operations.
            max_buffer_id: Only buffer records with an ID up to and including this value are aggregated.
        """
        buffer = self._BUFFER_TABLE

        # PostgreSQL gets `bool_or` for the boolean flag; the other dialects use `max`.
        if self._storage_client.get_dialect_name() == 'postgresql':
            recalc_aggregate = sql_func.bool_or
        else:
            recalc_aggregate = sql_func.max

        aggregations: list[ColumnElement[Any]] = [
            sql_func.max(buffer.accessed_at).label('max_accessed_at'),
            sql_func.max(buffer.modified_at).label('max_modified_at'),
            sql_func.sum(buffer.delta_handled_count).label('delta_handled_count'),
            sql_func.sum(buffer.delta_pending_count).label('delta_pending_count'),
            sql_func.sum(buffer.delta_total_count).label('delta_total_count'),
        ]

        # Distinct clients only need to be counted while the multi-client flag is not known to be set.
        if not self._had_multiple_clients:
            distinct_clients = sql_func.count(sql_func.distinct(buffer.client_id))
            aggregations.append(distinct_clients.label('unique_clients_count'))

        aggregations.append(recalc_aggregate(buffer.need_recalc).label('need_recalc'))

        aggregation_stmt = select(*aggregations).where(
            buffer.storage_id == self._id,
            buffer.id <= max_buffer_id,
        )
        row = (await session.execute(aggregation_stmt)).first()

        if not row:
            return

        update_params = _QueueMetadataUpdateParams(
            accessed_at=row.max_accessed_at,
            modified_at=row.max_modified_at,
            # `unique_clients_count` is read only when it was part of the aggregation above.
            update_had_multiple_clients=not self._had_multiple_clients and row.unique_clients_count > 1,
            delta_handled_request_count=row.delta_handled_count,
            delta_pending_request_count=row.delta_pending_count,
            delta_total_request_count=row.delta_total_count,
            recalculate=bool(row.need_recalc),
        )
        await self._update_metadata(session, **update_params)


================================================
FILE: src/crawlee/storage_clients/_sql/_storage_client.py
================================================
from __future__ import annotations

import warnings
from logging import getLogger
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar

from sqlalchemy.exc import IntegrityError, OperationalError
from sqlalchemy.ext.asyncio import AsyncEngine, async_sessionmaker, create_async_engine
from sqlalchemy.sql import insert, select, text
from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee.configuration import Configuration
from crawlee.storage_clients._base import StorageClient

from ._dataset_client import SqlDatasetClient
from ._db_models import Base, VersionDb
from ._key_value_store_client import SqlKeyValueStoreClient
from ._request_queue_client import SqlRequestQueueClient

if TYPE_CHECKING:
    from types import TracebackType

    from sqlalchemy.ext.asyncio import AsyncSession


logger = getLogger(__name__)


@docs_group('Storage clients')
class SqlStorageClient(StorageClient):
    """SQL implementation of the storage client.

    This storage client provides access to datasets, key-value stores, and request queues that persist data
    to a SQL database using SQLAlchemy 2+. Each storage type uses two tables: one for metadata and one for
    records.

    The client accepts either a database connection string or a pre-configured AsyncEngine (never both).
    If neither is provided, it creates a default SQLite database 'crawlee.db' in the storage directory.

    Database schema is automatically created during initialization. The default SQLite database receives
    performance optimizations including WAL mode and increased cache size.

    Warning:
        This is an experimental feature. The behavior and interface may change in future versions.
    """

    _DEFAULT_DB_NAME = 'crawlee.db'
    """Default database name if not specified in connection string."""

    # Dialect names accepted by this client. Connection strings must start with one of them.
    _SUPPORTED_DIALECTS: ClassVar[set[str]] = {'sqlite', 'postgresql', 'mysql', 'mariadb'}

    def __init__(
        self,
        *,
        connection_string: str | None = None,
        engine: AsyncEngine | None = None,
    ) -> None:
        """Initialize the SQL storage client.

        Args:
            connection_string: Database connection string (e.g., "sqlite+aiosqlite:///crawlee.db").
                If not provided, defaults to SQLite database in the storage directory.
            engine: Pre-configured AsyncEngine instance. Mutually exclusive with `connection_string`.

        Raises:
            ValueError: If both `connection_string` and `engine` are provided.
        """
        if engine is not None and connection_string is not None:
            raise ValueError('Either connection_string or engine must be provided, not both.')

        self._connection_string = connection_string
        self._engine = engine
        self._initialized = False
        self.session_maker: None | async_sessionmaker[AsyncSession] = None

        # Flag needed to apply optimizations only for default database
        self._default_flag = self._engine is None and self._connection_string is None
        # Filled in by `initialize()` from the engine's dialect.
        self._dialect_name: str | None = None

        # Tell the user (attributed to the caller via `stacklevel=2`) that this client is experimental.
        warnings.warn(
            'The SqlStorageClient is experimental and may change or be removed in future releases.',
            category=UserWarning,
            stacklevel=2,
        )

    async def __aenter__(self) -> SqlStorageClient:
        """Async context manager entry."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Async context manager exit. Disposes the engine via `close()`."""
        await self.close()

    @property
    def engine(self) -> AsyncEngine:
        """Get the SQLAlchemy AsyncEngine instance.

        Raises:
            ValueError: If no engine has been provided or created yet.
        """
        if self._engine is None:
            raise ValueError('Engine is not initialized. Call initialize() before accessing the engine.')
        return self._engine

    def get_dialect_name(self) -> str | None:
        """Get the database dialect name, or `None` if `initialize()` has not run yet."""
        return self._dialect_name

    async def initialize(self, configuration: Configuration) -> None:
        """Initialize the database schema.

        This method creates all necessary tables if they don't exist.
        Should be called before using the storage client. Repeated calls are no-ops once the first
        one has completed.

        Args:
            configuration: Configuration used to locate the storage directory of the default database.

        Raises:
            ValueError: If the engine's dialect is not one of the supported dialects.
        """
        if not self._initialized:
            engine = self._get_or_create_engine(configuration)
            async with engine.begin() as conn:
                self._dialect_name = engine.dialect.name

                if self._dialect_name not in self._SUPPORTED_DIALECTS:
                    raise ValueError(
                        f'Unsupported database dialect: {self._dialect_name}. Supported: '
                        f'{", ".join(self._SUPPORTED_DIALECTS)}. Consider using a different database.',
                    )

                # Create tables if they don't exist.
                # Rollback the transaction when an exception occurs.
                # This is likely an attempt to create a database from several parallel processes.
                try:
                    # Set SQLite pragmas for performance and consistency
                    # NOTE(review): these run on this one connection only; apart from journal_mode, SQLite
                    # PRAGMAs are generally per-connection, so other pooled connections may not inherit
                    # them - confirm.
                    if self._default_flag:
                        await conn.execute(text('PRAGMA journal_mode=WAL'))  # Better concurrency
                        await conn.execute(text('PRAGMA synchronous=NORMAL'))  # Balanced safety/speed
                        await conn.execute(text('PRAGMA cache_size=100000'))  # Cache up to 100000 pages
                        await conn.execute(text('PRAGMA temp_store=MEMORY'))  # Memory temp storage
                        await conn.execute(text('PRAGMA mmap_size=268435456'))  # 256MB memory mapping
                        await conn.execute(text('PRAGMA foreign_keys=ON'))  # Enforce constraints
                        await conn.execute(text('PRAGMA busy_timeout=30000'))  # 30s busy timeout
                    await conn.run_sync(Base.metadata.create_all, checkfirst=True)
                    from crawlee import __version__  # Noqa: PLC0415

                    db_version = (await conn.execute(select(VersionDb))).scalar_one_or_none()

                    # Only warn (do not fail) when the stored version differs from the library version,
                    # because the schema may have changed between the two.
                    if db_version and db_version != __version__:
                        warnings.warn(
                            f'Database version {db_version} does not match library version {__version__}. '
                            'This may lead to unexpected behavior. Drop the db if you want to make sure that '
                            'everything will work fine.',
                            category=UserWarning,
                            stacklevel=2,
                        )
                    elif not db_version:
                        # Fresh database: record the library version that created it.
                        await conn.execute(insert(VersionDb).values(version=__version__))
                except (IntegrityError, OperationalError):
                    await conn.rollback()

            self._initialized = True

    async def close(self) -> None:
        """Close the database connection pool."""
        if self._engine is not None:
            await self._engine.dispose()
        self._engine = None

    def create_session(self) -> AsyncSession:
        """Create a new database session.

        The session factory is created lazily on first use and then reused.

        Returns:
            A new AsyncSession instance.

        Raises:
            ValueError: If the session factory has to be created but no engine is available yet.
        """
        if self.session_maker is None:
            # Go through the `engine` property so that a missing engine fails right here with a clear
            # message, instead of caching a session factory that is bound to nothing.
            self.session_maker = async_sessionmaker(self.engine, expire_on_commit=False, autoflush=False)
        return self.session_maker()

    @override
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlDatasetClient:
        configuration = configuration or Configuration.get_global_configuration()
        # Make sure the schema exists before the client touches the database.
        await self.initialize(configuration)

        client = await SqlDatasetClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlKeyValueStoreClient:
        configuration = configuration or Configuration.get_global_configuration()
        # Make sure the schema exists before the client touches the database.
        await self.initialize(configuration)

        client = await SqlKeyValueStoreClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlRequestQueueClient:
        configuration = configuration or Configuration.get_global_configuration()
        # Make sure the schema exists before the client touches the database.
        await self.initialize(configuration)

        client = await SqlRequestQueueClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    def _get_or_create_engine(self, configuration: Configuration) -> AsyncEngine:
        """Get or create the database engine based on configuration.

        An existing engine is returned unchanged. Otherwise an engine is created from the connection
        string given to the constructor, or from a default SQLite file inside
        `configuration.storage_dir`.

        Args:
            configuration: Configuration providing the storage directory for the default database.

        Returns:
            The engine stored on this client.

        Raises:
            ValueError: If the connection string does not start with a supported dialect name.
        """
        if self._engine is not None:
            return self._engine

        if self._connection_string is not None:
            connection_string = self._connection_string
        else:
            # Create SQLite database in the storage directory
            storage_dir = Path(configuration.storage_dir)
            if not storage_dir.exists():
                storage_dir.mkdir(parents=True, exist_ok=True)

            db_path = storage_dir / self._DEFAULT_DB_NAME

            # Create connection string with path to default database
            connection_string = f'sqlite+aiosqlite:///{db_path}'

        if not any(connection_string.startswith(dialect) for dialect in self._SUPPORTED_DIALECTS):
            raise ValueError(
                f'Unsupported database. Supported: {", ".join(self._SUPPORTED_DIALECTS)}. Consider using a different '
                'database.'
            )

        kwargs: dict[str, Any] = {}
        # Detect the dialect from the URL scheme prefix only. A substring search would also match paths,
        # hosts or credentials (e.g. 'sqlite+aiosqlite:///data/mysql_dump.db') and hand the MySQL-specific
        # `connect_timeout` argument and isolation level to a different driver.
        if connection_string.startswith(('mysql', 'mariadb')):
            connect_args: dict[str, Any] = {'connect_timeout': 30}
            # MySQL/MariaDB require READ COMMITTED isolation level for correct behavior in concurrent environments
            # without deadlocks.
            kwargs['isolation_level'] = 'READ COMMITTED'
        else:
            connect_args = {'timeout': 30}

        self._engine = create_async_engine(
            connection_string,
            future=True,
            pool_size=5,
            max_overflow=10,
            pool_timeout=60,
            pool_recycle=600,
            pool_pre_ping=True,
            echo=False,
            connect_args=connect_args,
            **kwargs,
        )
        return self._engine


================================================
FILE: src/crawlee/storage_clients/_sql/py.typed
================================================


================================================
FILE: src/crawlee/storage_clients/models.py
================================================
from __future__ import annotations

from datetime import datetime
from typing import TYPE_CHECKING, Annotated, Any, Generic

from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
from typing_extensions import TypeVar

from crawlee._types import HttpMethod
from crawlee._utils.docs import docs_group
from crawlee._utils.urls import validate_http_url

# Type variable for the `value` field of `KeyValueStoreRecord`; falls back to `Any` when not parametrized.
KvsValueType = TypeVar('KvsValueType', default=Any)


@docs_group('Storage data')
class StorageMetadata(BaseModel):
    """Represents the base model for storage metadata.

    It contains common fields shared across all specific storage types (identifier, optional name and
    the accessed/created/modified timestamps).
    """

    # Accept input keyed by field name or by alias, keep unknown extra fields, and allow validation
    # from objects exposing the values as attributes.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, extra='allow', from_attributes=True)

    id: Annotated[str, Field(alias='id')]
    """The unique identifier of the storage."""

    name: Annotated[str | None, Field(alias='name', default=None)]
    """The name of the storage."""

    accessed_at: Annotated[datetime, Field(alias='accessedAt')]
    """The timestamp when the storage was last accessed."""

    created_at: Annotated[datetime, Field(alias='createdAt')]
    """The timestamp when the storage was created."""

    modified_at: Annotated[datetime, Field(alias='modifiedAt')]
    """The timestamp when the storage was last modified."""


@docs_group('Storage data')
class DatasetMetadata(StorageMetadata):
    """Model for a dataset metadata.

    Extends the common storage metadata with the number of items stored in the dataset.
    """

    # Accept input keyed by field name or by alias; allow validation from attribute-bearing objects.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    item_count: Annotated[int, Field(alias='itemCount')]
    """The number of items in the dataset."""


@docs_group('Storage data')
class KeyValueStoreMetadata(StorageMetadata):
    """Model for a key-value store metadata.

    Declares no fields beyond those of the common storage metadata.
    """

    # Accept input keyed by field name or by alias; allow validation from attribute-bearing objects.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)


@docs_group('Storage data')
class RequestQueueMetadata(StorageMetadata):
    """Model for a request queue metadata.

    Extends the common storage metadata with the multi-client flag and the handled, pending and total
    request counters.
    """

    # Accept input keyed by field name or by alias; allow validation from attribute-bearing objects.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')]
    """Indicates whether the queue has been accessed by multiple clients (consumers)."""

    handled_request_count: Annotated[int, Field(alias='handledRequestCount')]
    """The number of requests that have been handled from the queue."""

    pending_request_count: Annotated[int, Field(alias='pendingRequestCount')]
    """The number of requests that are still pending in the queue."""

    total_request_count: Annotated[int, Field(alias='totalRequestCount')]
    """The total number of requests that have been added to the queue."""


@docs_group('Storage data')
class KeyValueStoreRecordMetadata(BaseModel):
    """Model for a key-value store record metadata.

    Describes a record (key, content type and optional size) without carrying its value.
    """

    # Accept input keyed by field name or by alias; allow validation from attribute-bearing objects.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    key: Annotated[str, Field(alias='key')]
    """The key of the record.

    A unique identifier for the record in the key-value store.
    """

    content_type: Annotated[str, Field(alias='contentType')]
    """The MIME type of the record.

    Describe the format and type of data stored in the record, following the MIME specification.
    """

    # Optional: the default (`None`) is given both in `Field` and as the assigned value.
    size: Annotated[int | None, Field(alias='size', default=None)] = None
    """The size of the record in bytes."""


@docs_group('Storage data')
class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
    """Model for a key-value store record.

    Combines the record metadata with the record value; generic over the value type.
    """

    # Accept input keyed by field name or by alias; allow validation from attribute-bearing objects.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    value: Annotated[KvsValueType, Field(alias='value')]
    """The value of the record."""


@docs_group('Storage data')
class DatasetItemsListPage(BaseModel):
    """Model for a single page of dataset items returned from a collection list method.

    Every field has a default, so an empty page can be created without arguments.
    """

    # Accept input keyed by field name or by alias; allow validation from attribute-bearing objects.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    count: Annotated[int, Field(default=0)]
    """The number of objects returned on this page."""

    offset: Annotated[int, Field(default=0)]
    """The starting position of the first object returned, as specified in the API call."""

    limit: Annotated[int, Field(default=0)]
    """The maximum number of objects to return, as specified in the API call."""

    total: Annotated[int, Field(default=0)]
    """The total number of objects that match the criteria of the API call."""

    desc: Annotated[bool, Field(default=False)]
    """Indicates if the returned list is in descending order."""

    # Workaround for Pydantic and type checkers when using Annotated with default_factory
    # Type checkers see a plain list default; at runtime Pydantic gets a `default_factory`.
    if TYPE_CHECKING:
        items: list[dict] = []
        """The list of dataset items returned on this page."""
    else:
        items: Annotated[list[dict], Field(default_factory=list)]
        """The list of dataset items returned on this page."""


@docs_group('Storage data')
class ProcessedRequest(BaseModel):
    """Represents a processed request."""

    # Accept input keyed by field name or by alias; allow validation from attribute-bearing objects.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    id: Annotated[str | None, Field(alias='requestId', default=None)] = None
    """Internal representation of the request by the storage client. Only some clients use id."""

    # Serialized under the alias 'uniqueKey'; presumably the deduplication key of the request - TODO confirm.
    unique_key: Annotated[str, Field(alias='uniqueKey')]
    # Alias 'wasAlreadyPresent'; presumably True when the request was already in the queue - TODO confirm.
    was_already_present: Annotated[bool, Field(alias='wasAlreadyPresent')]
    # Alias 'wasAlreadyHandled'; presumably True when the request had been handled before - TODO confirm.
    was_already_handled: Annotated[bool, Field(alias='wasAlreadyHandled')]


@docs_group('Storage data')
class UnprocessedRequest(BaseModel):
    """Represents an unprocessed request."""

    # Accept input keyed by field name or by alias; allow validation from attribute-bearing objects.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    # Serialized under the alias 'uniqueKey'.
    unique_key: Annotated[str, Field(alias='uniqueKey')]
    # The raw value is passed through `validate_http_url` before Pydantic's own string validation.
    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
    # Optional HTTP method; `None` when not provided.
    method: Annotated[HttpMethod | None, Field()] = None


@docs_group('Storage data')
class AddRequestsResponse(BaseModel):
    """Model for a response to add requests to a queue.

    Contains detailed information about the processing results when adding multiple requests
    to a queue. This includes which requests were successfully processed and which ones
    encountered issues during processing.
    """

    # Accept input keyed by field name or by alias; allow validation from attribute-bearing objects.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')]
    """Successfully processed requests, including information about whether they were
    already present in the queue and whether they had been handled previously."""

    unprocessed_requests: Annotated[list[UnprocessedRequest], Field(alias='unprocessedRequests')]
    """Requests that could not be processed, typically due to validation errors or other issues."""


================================================
FILE: src/crawlee/storage_clients/py.typed
================================================


================================================
FILE: src/crawlee/storages/__init__.py
================================================
from ._dataset import Dataset
from ._key_value_store import KeyValueStore
from ._request_queue import RequestQueue

# Names re-exported as the public API of the `crawlee.storages` package.
__all__ = [
    'Dataset',
    'KeyValueStore',
    'RequestQueue',
]


================================================
FILE: src/crawlee/storages/_base.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from crawlee.configuration import Configuration
    from crawlee.storage_clients._base import StorageClient
    from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata


@docs_group('Storages')
class Storage(ABC):
    """Base class for storages.

    Declares the abstract interface every storage exposes: its identity (`id`, `name`), metadata
    access, the `open` constructor, and the `drop` and `purge` operations.
    """

    @property
    @abstractmethod
    def id(self) -> str:
        """Get the storage ID."""

    @property
    @abstractmethod
    def name(self) -> str | None:
        """Get the storage name, or `None` when the storage has no name."""

    @abstractmethod
    async def get_metadata(self) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:
        """Get the storage metadata."""

    @classmethod
    @abstractmethod
    async def open(
        cls,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
        storage_client: StorageClient | None = None,
    ) -> Storage:
        """Open a storage, either restore existing or create a new one.

        Args:
            id: The storage ID.
            name: The storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
                the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
                (e.g. "my-value-1").
            alias: The storage alias (run scope, creates unnamed storage).
            configuration: Configuration object used during the storage creation or restoration process.
            storage_client: Underlying storage client to use. If not provided, the default global storage client
                from the service locator will be used.

        Returns:
            The opened storage instance.
        """

    @abstractmethod
    async def drop(self) -> None:
        """Drop the storage, removing it from the underlying storage client and clearing the cache."""

    @abstractmethod
    async def purge(self) -> None:
        """Purge the storage, removing all items from the underlying storage client.

        This method does not remove the storage itself, e.g. don't remove the metadata,
        but clears all items within it.
        """


================================================
FILE: src/crawlee/storages/_dataset.py
================================================
from __future__ import annotations

import logging
from io import StringIO
from typing import TYPE_CHECKING, overload

from typing_extensions import override

from crawlee import service_locator
from crawlee._utils.docs import docs_group
from crawlee._utils.file import export_csv_to_stream, export_json_to_stream

from ._base import Storage
from ._key_value_store import KeyValueStore
from ._utils import validate_storage_name

if TYPE_CHECKING:
    from collections.abc import AsyncIterator
    from typing import Any, Literal

    from typing_extensions import Unpack

    from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs
    from crawlee.configuration import Configuration
    from crawlee.storage_clients import StorageClient
    from crawlee.storage_clients._base import DatasetClient
    from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

logger = logging.getLogger(__name__)


@docs_group('Storages')
class Dataset(Storage):
    """Dataset is a storage for managing structured tabular data.

    The dataset class provides a high-level interface for storing and retrieving structured data
    with consistent schema, similar to database tables or spreadsheets. It abstracts the underlying
    storage implementation details, offering a consistent API regardless of where the data is
    physically stored.

    Dataset operates in an append-only mode, allowing new records to be added but not modified
    or deleted after creation. This makes it particularly suitable for storing crawling results
    and other data that should be immutable once collected.

    The class provides methods for adding data, retrieving data with various filtering options,
    and exporting data to different formats. You can create a dataset using the `open` class method,
    specifying either a name or ID. The underlying storage implementation is determined by
    the configured storage client.

    ### Usage

    ```python
    from crawlee.storages import Dataset

    # Open a dataset
    dataset = await Dataset.open(name='my-dataset')

    # Add data
    await dataset.push_data({'title': 'Example Product', 'price': 99.99})

    # Retrieve filtered data
    results = await dataset.get_data(limit=10, desc=True)

    # Export data
    await dataset.export_to('results.json', content_type='json')
    ```
    """

    def __init__(self, client: DatasetClient, id: str, name: str | None) -> None:
        """Wrap an already opened dataset client.

        Prefer the `Dataset.open` constructor over calling this directly.

        Args:
            client: An instance of a storage client.
            id: The unique identifier of the storage.
            name: The name of the storage, if available.
        """
        # The name is validated before any attribute is assigned.
        validate_storage_name(name)

        self._id, self._name = id, name
        self._client = client

    @property
    @override
    def id(self) -> str:
        """Get the storage ID given to the constructor."""
        return self._id

    @property
    @override
    def name(self) -> str | None:
        """Get the storage name given to the constructor, or `None` if there is none."""
        return self._name

    @override
    async def get_metadata(self) -> DatasetMetadata:
        """Get the dataset metadata as reported by the underlying dataset client."""
        return await self._client.get_metadata()

    @override
    @classmethod
    async def open(
        cls,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
        storage_client: StorageClient | None = None,
    ) -> Dataset:
        # Fall back to the globally registered services for anything the caller did not supply.
        if configuration is None:
            configuration = service_locator.get_configuration()
        if storage_client is None:
            storage_client = service_locator.get_storage_client()

        # The coroutine is only created here; it is handed to the storage instance manager below.
        opener = storage_client.create_dataset_client(
            id=id,
            name=name,
            alias=alias,
            configuration=configuration,
        )
        cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)

        manager = service_locator.storage_instance_manager
        return await manager.open_storage_instance(
            cls,
            id=id,
            name=name,
            alias=alias,
            client_opener_coro=opener,
            storage_client_cache_key=cache_key,
        )

    @override
    async def drop(self) -> None:
        # Remove this instance from the instance cache first, then drop the underlying storage.
        service_locator.storage_instance_manager.remove_from_cache(self)
        await self._client.drop()

    @override
    async def purge(self) -> None:
        """Purge the dataset by delegating to the underlying dataset client."""
        await self._client.purge()

    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
        """Store an object or an array of objects to the dataset.

        The size of the data is limited by the receiving API and therefore `push_data()` will only
        allow objects whose JSON representation is smaller than 9MB. When an array is passed,
        none of the included objects may be larger than 9MB, but the array itself may be of any size.

        Args:
            data: A JSON serializable data structure to be stored in the dataset. The JSON representation
                of each item must be smaller than 9MB.
        """
        # The data is handed to the underlying dataset client unchanged.
        await self._client.push_data(data=data)

    async def get_data(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> DatasetItemsListPage:
        """Fetch one page of dataset items, optionally filtered, sorted and reshaped.

        All arguments are forwarded unchanged to the underlying dataset client. Pagination is
        controlled by `offset` and `limit`; the remaining parameters adjust which items and fields
        are returned and how they look.

        Args:
            offset: Skips the specified number of items at the start.
            limit: The maximum number of items to retrieve. Unlimited if None.
            clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.
            desc: Set to True to sort results in descending order.
            fields: Fields to include in each item. Sorts fields as specified if provided.
            omit: Fields to exclude from each item.
            unwind: Unwinds items by a specified array field, turning each element into a separate item.
            skip_empty: Excludes empty items from the results if True.
            skip_hidden: Excludes fields starting with '#' if True.
            flatten: Fields to be flattened in returned items.
            view: Specifies the dataset view to be used.

        Returns:
            An object with filtered, sorted, and paginated dataset items plus pagination details.
        """
        page = await self._client.get_data(
            offset=offset,
            limit=limit,
            clean=clean,
            desc=desc,
            fields=fields,
            omit=omit,
            unwind=unwind,
            skip_empty=skip_empty,
            skip_hidden=skip_hidden,
            flatten=flatten,
            view=view,
        )
        return page

    async def iterate_items(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
    ) -> AsyncIterator[dict[str, Any]]:
        """Asynchronously iterate over dataset items with optional filtering and sorting.

        Items are produced one at a time by the underlying dataset client's iterator; all arguments
        are forwarded to it unchanged. Pagination is controlled by `offset` and `limit`, while
        `fields`, `omit`, `unwind`, `skip_empty` and `skip_hidden` adjust the shape of each item.

        Args:
            offset: Skips the specified number of items at the start.
            limit: The maximum number of items to retrieve. Unlimited if None.
            clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.
            desc: Set to True to sort results in descending order.
            fields: Fields to include in each item. Sorts fields as specified if provided.
            omit: Fields to exclude from each item.
            unwind: Unwinds items by a specified array field, turning each element into a separate item.
            skip_empty: Excludes empty items from the results if True.
            skip_hidden: Excludes fields starting with '#' if True.

        Yields:
            An asynchronous iterator of dictionary objects, each representing a dataset item after applying
            the specified filters and transformations.
        """
        client_items = self._client.iterate_items(
            offset=offset,
            limit=limit,
            clean=clean,
            desc=desc,
            fields=fields,
            omit=omit,
            unwind=unwind,
            skip_empty=skip_empty,
            skip_hidden=skip_hidden,
        )
        async for record in client_items:
            yield record

    async def list_items(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
    ) -> list[dict[str, Any]]:
        """Collect dataset items into a list, with optional filtering and sorting.

        This drains `iterate_items` called with the same arguments and gathers everything it yields.
        Pagination is controlled by `offset` and `limit`, while `fields`, `omit`, `unwind`,
        `skip_empty` and `skip_hidden` adjust the shape of each item.

        Args:
            offset: Skips the specified number of items at the start.
            limit: The maximum number of items to retrieve. Unlimited if None.
            clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.
            desc: Set to True to sort results in descending order.
            fields: Fields to include in each item. Sorts fields as specified if provided.
            omit: Fields to exclude from each item.
            unwind: Unwinds items by a specified array field, turning each element into a separate item.
            skip_empty: Excludes empty items from the results if True.
            skip_hidden: Excludes fields starting with '#' if True.

        Returns:
            A list of dictionary objects, each representing a dataset item after applying
            the specified filters and transformations.
        """
        collected: list[dict[str, Any]] = []
        async for record in self.iterate_items(
            offset=offset,
            limit=limit,
            clean=clean,
            desc=desc,
            fields=fields,
            omit=omit,
            unwind=unwind,
            skip_empty=skip_empty,
            skip_hidden=skip_hidden,
        ):
            collected.append(record)
        return collected

    @overload
    async def export_to(
        self,
        key: str,
        content_type: Literal['json'],
        to_kvs_id: str | None = None,
        to_kvs_name: str | None = None,
        to_kvs_storage_client: StorageClient | None = None,
        to_kvs_configuration: Configuration | None = None,
        **kwargs: Unpack[ExportDataJsonKwargs],
    ) -> None: ...

    @overload
    async def export_to(
        self,
        key: str,
        content_type: Literal['csv'],
        to_kvs_id: str | None = None,
        to_kvs_name: str | None = None,
        to_kvs_storage_client: StorageClient | None = None,
        to_kvs_configuration: Configuration | None = None,
        **kwargs: Unpack[ExportDataCsvKwargs],
    ) -> None: ...

    async def export_to(
        self,
        key: str,
        content_type: Literal['json', 'csv'] = 'json',
        to_kvs_id: str | None = None,
        to_kvs_name: str | None = None,
        to_kvs_storage_client: StorageClient | None = None,
        to_kvs_configuration: Configuration | None = None,
        **kwargs: Any,
    ) -> None:
        """Export the entire dataset into a single record stored under a key in a key-value store.

        All items of this dataset are serialized into one text document, which is then saved under the
        given key in the target key-value store. The format of the exported document is determined by
        the `content_type` parameter. The target key-value store is identified by either its ID or its name.

        Args:
            key: The key under which to save the data in the key-value store.
            content_type: The format in which to export the data.
            to_kvs_id: ID of the key-value store to save the exported file.
                Specify only one of ID or name.
            to_kvs_name: Name of the key-value store to save the exported file.
                Specify only one of ID or name.
            to_kvs_storage_client: Storage client to use for the key-value store.
            to_kvs_configuration: Configuration for the key-value store.
            kwargs: Additional parameters for the export operation, specific to the chosen content type.

        Raises:
            ValueError: If `content_type` is neither `'json'` nor `'csv'`.
        """
        # Validate first so an unsupported format fails before a key-value store is opened.
        if content_type not in ('csv', 'json'):
            raise ValueError('Unsupported content type, expecting CSV or JSON')

        kvs = await KeyValueStore.open(
            id=to_kvs_id,
            name=to_kvs_name,
            configuration=to_kvs_configuration,
            storage_client=to_kvs_storage_client,
        )
        dst = StringIO()

        if content_type == 'csv':
            await export_csv_to_stream(self.iterate_items(), dst, **kwargs)
            await kvs.set_value(key, dst.getvalue(), 'text/csv')
        else:
            await export_json_to_stream(self.iterate_items(), dst, **kwargs)
            await kvs.set_value(key, dst.getvalue(), 'application/json')


================================================
FILE: src/crawlee/storages/_key_value_store.py
================================================
from __future__ import annotations

import asyncio
from collections.abc import AsyncIterator
from logging import getLogger
from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, overload

from pydantic import RootModel
from typing_extensions import override

from crawlee import service_locator
from crawlee._types import JsonSerializable  # noqa: TC001
from crawlee._utils.docs import docs_group
from crawlee._utils.recoverable_state import RecoverableState
from crawlee.storage_clients.models import KeyValueStoreMetadata

from ._base import Storage
from ._utils import validate_storage_name

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from crawlee.configuration import Configuration
    from crawlee.storage_clients import StorageClient
    from crawlee.storage_clients._base import KeyValueStoreClient
    from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata
else:
    from crawlee._utils.recoverable_state import RecoverableState

T = TypeVar('T')

logger = getLogger(__name__)


class AutosavedValue(RootModel):
    """Pydantic root model whose root value is a dictionary of JSON-serializable values.

    It serves as the state model for values handed out by `KeyValueStore.get_auto_saved_value`.
    """

    root: dict[str, JsonSerializable]


@docs_group('Storages')
class KeyValueStore(Storage):
    """Key-value store is a storage for reading and writing data records with unique key identifiers.

    The key-value store class acts as a high-level interface for storing, retrieving, and managing data records
    identified by unique string keys. It abstracts away the underlying storage implementation details,
    allowing you to work with the same API regardless of whether data is stored in memory, on disk,
    or in the cloud.

    Each data record is associated with a specific MIME content type, allowing storage of various
    data formats such as JSON, text, images, HTML snapshots or any binary data. This class is
    commonly used to store inputs, outputs, and other artifacts of crawler operations.

    You can instantiate a key-value store using the `open` class method, which will create a store
    with the specified name or id. The underlying storage implementation is determined by the configured
    storage client.

    ### Usage

    ```python
    from crawlee.storages import KeyValueStore

    # Open a named key-value store
    kvs = await KeyValueStore.open(name='my-store')

    # Store and retrieve data
    await kvs.set_value('product-1234', [{'name': 'Smartphone', 'price': 799.99}])
    product = await kvs.get_value('product-1234')
    ```
    """

    _autosaved_values: ClassVar[
        dict[
            str,
            dict[str, RecoverableState[AutosavedValue]],
        ]
    ] = {}
    """Cache for recoverable (auto-saved) values."""

    def __init__(self, client: KeyValueStoreClient, id: str, name: str | None) -> None:
        """Create a key-value store wrapper around an already opened storage client.

        Prefer the `KeyValueStore.open` class method over calling this constructor directly.

        Args:
            client: An instance of a storage client.
            id: The unique identifier of the storage.
            name: The name of the storage, if available.
        """
        # The name is validated before any attribute is set.
        validate_storage_name(name)

        # Lock for autosaving values to prevent concurrent modifications.
        self._autosave_lock = asyncio.Lock()

        self._client, self._id, self._name = client, id, name

    @property
    @override
    def id(self) -> str:
        """Return the storage ID this instance was constructed with."""
        return self._id

    @property
    @override
    def name(self) -> str | None:
        """Return the storage name this instance was constructed with, or `None` if it has no name."""
        return self._name

    @override
    async def get_metadata(self) -> KeyValueStoreMetadata:
        """Fetch the store metadata from the underlying storage client."""
        metadata = await self._client.get_metadata()
        return metadata

    @override
    @classmethod
    async def open(
        cls,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
        storage_client: StorageClient | None = None,
    ) -> KeyValueStore:
        # Fall back to the globally registered services for anything the caller did not supply.
        if configuration is None:
            configuration = service_locator.get_configuration()
        if storage_client is None:
            storage_client = service_locator.get_storage_client()

        # The coroutine is only created here; the storage instance manager decides whether to run it.
        opener = storage_client.create_kvs_client(
            id=id,
            name=name,
            alias=alias,
            configuration=configuration,
        )
        cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)

        manager = service_locator.storage_instance_manager
        return await manager.open_storage_instance(
            cls,
            id=id,
            name=name,
            alias=alias,
            client_opener_coro=opener,
            storage_client_cache_key=cache_key,
        )

    @override
    async def drop(self) -> None:
        # Evict this instance from the shared storage cache first.
        service_locator.storage_instance_manager.remove_from_cache(self)

        # Tear down auto-saved values, then drop the underlying storage.
        await self._clear_cache()
        await self._client.drop()

    @override
    async def purge(self) -> None:
        """Purge the store by delegating to the underlying storage client."""
        await self._client.purge()

    @overload
    async def get_value(self, key: str) -> Any: ...

    @overload
    async def get_value(self, key: str, default_value: T) -> T: ...

    @overload
    async def get_value(self, key: str, default_value: T | None = None) -> T | None: ...

    async def get_value(self, key: str, default_value: T | None = None) -> T | None:
        """Read the value stored under `key`.

        Args:
            key: Key of the record to retrieve.
            default_value: Value to return when the storage client finds no record for `key`.

        Returns:
            The `value` of the record returned by the storage client, or `default_value` when
            no record was returned.
        """
        record = await self._client.get_value(key=key)
        if not record:
            return default_value
        return record.value

    async def set_value(
        self,
        key: str,
        value: Any,
        content_type: str | None = None,
    ) -> None:
        """Set a value in the KVS.

        The call is forwarded unchanged to the underlying storage client.

        Args:
            key: Key of the record to set.
            value: Value to set.
            content_type: The MIME content type string. `None` is passed through to the storage client as is.
        """
        await self._client.set_value(key=key, value=value, content_type=content_type)

    async def delete_value(self, key: str) -> None:
        """Delete a value from the KVS.

        The call is forwarded unchanged to the underlying storage client.

        Args:
            key: Key of the record to delete.
        """
        await self._client.delete_value(key=key)

    async def iterate_keys(
        self,
        exclusive_start_key: str | None = None,
        limit: int | None = None,
    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
        """Lazily iterate over the keys present in the KVS.

        Args:
            exclusive_start_key: Key to start the iteration from.
            limit: Maximum number of keys to return. None means no limit.

        Yields:
            Metadata describing each key, as produced by the storage client.
        """
        key_stream = self._client.iterate_keys(exclusive_start_key=exclusive_start_key, limit=limit)
        async for record_metadata in key_stream:
            yield record_metadata

    async def list_keys(
        self,
        exclusive_start_key: str | None = None,
        limit: int = 1000,
    ) -> list[KeyValueStoreRecordMetadata]:
        """Collect the existing keys in the KVS into a list.

        The keys are obtained from the storage client's `iterate_keys` method.

        Args:
            exclusive_start_key: Key to start the iteration from.
            limit: Maximum number of keys to return.

        Returns:
            A list of key metadata records, in the order produced by the storage client.
        """
        found_keys: list[KeyValueStoreRecordMetadata] = []

        key_stream = self._client.iterate_keys(exclusive_start_key=exclusive_start_key, limit=limit)
        async for record_metadata in key_stream:
            found_keys.append(record_metadata)

        return found_keys

    async def record_exists(self, key: str) -> bool:
        """Ask the storage client whether a record is stored under `key`.

        Args:
            key: Key of the record to check for existence.

        Returns:
            The storage client's answer: True if the record exists, False otherwise.
        """
        exists = await self._client.record_exists(key=key)
        return exists

    async def get_public_url(self, key: str) -> str:
        """Obtain the public URL of the record stored under `key` from the storage client.

        Args:
            key: Key of the record for which URL is required.

        Returns:
            The public URL for the given key.
        """
        public_url = await self._client.get_public_url(key=key)
        return public_url

    async def get_auto_saved_value(
        self,
        key: str,
        default_value: dict[str, JsonSerializable] | None = None,
    ) -> dict[str, JsonSerializable]:
        """Get a value from KVS that will be automatically saved on changes.

        The value is backed by a `RecoverableState` that is cached per store ID and key, so repeated calls
        with the same key return the same dictionary object.

        Args:
            key: Key of the record, to store the value.
            default_value: Value to be used if the record does not exist yet. Should be a dictionary.

        Returns:
            Return the value of the key.
        """
        default_value = {} if default_value is None else default_value

        async with self._autosave_lock:
            cache = self._autosaved_values.setdefault(self.id, {})

            if key in cache:
                return cache[key].current_value.root

            async def kvs_factory() -> KeyValueStore:
                return self

            recoverable_state = RecoverableState(
                default_state=AutosavedValue(default_value),
                persist_state_key=key,
                persistence_enabled=True,
                persist_state_kvs_factory=kvs_factory,
                logger=logger,
            )

            # Initialize before caching: if initialization raises, no half-initialized state is left
            # behind in the cache for later calls to return.
            await recoverable_state.initialize()
            cache[key] = recoverable_state

        return recoverable_state.current_value.root

    async def persist_autosaved_values(self) -> None:
        """Force autosaved values to be saved without waiting for an event in Event Manager.

        Only the values cached for this store's ID are persisted.
        """
        cache = self._autosaved_values.get(self.id)
        if not cache:
            return

        # Iterate over a snapshot: each `persist_state` call awaits, and another coroutine may register
        # a new auto-saved value meanwhile, which would break iteration over the live dictionary.
        for value in list(cache.values()):
            await value.persist_state()

    async def _clear_cache(self) -> None:
        """Clear cache with autosaved values."""
        if self.id in self._autosaved_values:
            cache = self._autosaved_values[self.id]
            for value in cache.values():
                await value.teardown()
            cache.clear()


================================================
FILE: src/crawlee/storages/_request_queue.py
================================================
from __future__ import annotations

import asyncio
from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING, TypeVar

from typing_extensions import override

from crawlee import Request, service_locator
from crawlee._utils.docs import docs_group
from crawlee._utils.wait import wait_for_all_tasks_for_finish
from crawlee.request_loaders import RequestManager

from ._base import Storage
from ._utils import validate_storage_name

if TYPE_CHECKING:
    from collections.abc import Sequence

    from crawlee import Request
    from crawlee.configuration import Configuration
    from crawlee.storage_clients import StorageClient
    from crawlee.storage_clients._base import RequestQueueClient
    from crawlee.storage_clients.models import ProcessedRequest, RequestQueueMetadata

logger = getLogger(__name__)

T = TypeVar('T')


@docs_group('Storages')
class RequestQueue(Storage, RequestManager):
    """Request queue is a storage for managing HTTP requests.

    The request queue class serves as a high-level interface for organizing and managing HTTP requests
    during web crawling. It provides methods for adding, retrieving, and manipulating requests throughout
    the crawling lifecycle, abstracting away the underlying storage implementation details.

    Request queue maintains the state of each URL to be crawled, tracking whether it has been processed,
    is currently being handled, or is waiting in the queue. Each URL in the queue is uniquely identified
    by a `unique_key` property, which prevents duplicate processing unless explicitly configured otherwise.

    The class supports both breadth-first and depth-first crawling strategies through its `forefront` parameter
    when adding requests. It also provides mechanisms for error handling and request reclamation when
    processing fails.

    You can open a request queue using the `open` class method, specifying either a name or ID to identify
    the queue. The underlying storage implementation is determined by the configured storage client.

    ### Usage

    ```python
    from crawlee.storages import RequestQueue

    # Open a request queue
    rq = await RequestQueue.open(name='my-queue')

    # Add a request
    await rq.add_request('https://example.com')

    # Process requests
    request = await rq.fetch_next_request()
    if request:
        try:
            # Process the request
            # ...
            await rq.mark_request_as_handled(request)
        except Exception:
            await rq.reclaim_request(request)
    ```
    """

    def __init__(self, client: RequestQueueClient, id: str, name: str | None) -> None:
        """Create a request queue wrapper around an already opened storage client.

        Prefer the `RequestQueue.open` class method over calling this constructor directly.

        Args:
            client: An instance of a storage client.
            id: The unique identifier of the storage.
            name: The name of the storage, if available.
        """
        # The name is validated before any attribute is set.
        validate_storage_name(name)

        # A list of tasks for adding requests to the queue.
        self._add_requests_tasks: list[asyncio.Task] = []

        self._client, self._id, self._name = client, id, name

    @property
    @override
    def id(self) -> str:
        """Return the storage ID this instance was constructed with."""
        return self._id

    @property
    @override
    def name(self) -> str | None:
        """Return the storage name this instance was constructed with, or `None` if it has no name."""
        return self._name

    @override
    async def get_metadata(self) -> RequestQueueMetadata:
        """Fetch the queue metadata from the underlying storage client."""
        metadata = await self._client.get_metadata()
        return metadata

    @override
    async def get_handled_count(self) -> int:
        """Return `handled_request_count` from freshly fetched queue metadata."""
        return (await self._client.get_metadata()).handled_request_count

    @override
    async def get_total_count(self) -> int:
        """Return `total_request_count` from freshly fetched queue metadata."""
        return (await self._client.get_metadata()).total_request_count

    @override
    @classmethod
    async def open(
        cls,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
        storage_client: StorageClient | None = None,
    ) -> RequestQueue:
        # Fall back to the globally registered services for anything the caller did not supply.
        if configuration is None:
            configuration = service_locator.get_configuration()
        if storage_client is None:
            storage_client = service_locator.get_storage_client()

        # The coroutine is only created here; the storage instance manager decides whether to run it.
        opener = storage_client.create_rq_client(
            id=id,
            name=name,
            alias=alias,
            configuration=configuration,
        )
        cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)

        manager = service_locator.storage_instance_manager
        return await manager.open_storage_instance(
            cls,
            id=id,
            name=name,
            alias=alias,
            client_opener_coro=opener,
            storage_client_cache_key=cache_key,
        )

    @override
    async def drop(self) -> None:
        # Evict this instance from the shared storage cache before the storage itself is dropped.
        service_locator.storage_instance_manager.remove_from_cache(self)

        await self._client.drop()

    @override
    async def purge(self) -> None:
        """Purge the queue by delegating to the underlying storage client."""
        await self._client.purge()

    @override
    async def add_request(
        self,
        request: str | Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        # A single request is sent to the storage client as a one-element batch.
        request = self._transform_request(request)
        response = await self._client.add_batch_of_requests([request], forefront=forefront)

        processed = response.processed_requests
        if processed:
            return processed[0]

        # Nothing was processed: distinguish an explicit rejection from an empty response.
        if response.unprocessed_requests:
            message = f'Request {request.url} was not processed by storage client "{self._client.__class__.__name__}".'
        else:
            message = (
                f'Request {request.url} was not processed by storage client "{self._client.__class__.__name__}" '
                'received empty response.'
            )

        logger.warning(message)
        return None

    @override
    async def add_requests(
        self,
        requests: Sequence[str | Request],
        *,
        forefront: bool = False,
        batch_size: int = 1000,
        wait_time_between_batches: timedelta = timedelta(seconds=1),
        wait_for_all_requests_to_be_added: bool = False,
        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,
    ) -> None:
        """Add requests to the queue in batches.

        The first batch is added before this method returns. Any remaining batches are added by a background
        task that is tracked in `_add_requests_tasks` until it finishes.

        Args:
            requests: Requests (or URLs) to add.
            forefront: Passed to the storage client with every batch.
            batch_size: Number of requests per batch. Must be at least 1.
            wait_time_between_batches: Pause between consecutive background batches. Also used as the base
                wait time for batch retries.
            wait_for_all_requests_to_be_added: If True, wait for the background task before returning.
            wait_for_all_requests_to_be_added_timeout: Timeout used when waiting for the background task.

        Raises:
            ValueError: If `batch_size` is lower than 1.
        """
        # A non-positive batch size can never make progress; fail loudly instead of losing requests
        # inside the background task.
        if batch_size < 1:
            raise ValueError(f'batch_size must be a positive integer, got {batch_size}.')

        transformed_requests = self._transform_requests(requests)
        wait_time_secs = wait_time_between_batches.total_seconds()

        # Wait for the first batch to be added
        first_batch = transformed_requests[:batch_size]
        if first_batch:
            await self._process_batch(
                first_batch,
                base_retry_wait=wait_time_between_batches,
                forefront=forefront,
            )

        # Everything fit into the first batch: do not spawn a background task that would only make
        # `is_finished` report pending work for no reason.
        if len(transformed_requests) <= batch_size:
            return

        async def _process_remaining_batches() -> None:
            for i in range(batch_size, len(transformed_requests), batch_size):
                batch = transformed_requests[i : i + batch_size]
                await self._process_batch(
                    batch,
                    base_retry_wait=wait_time_between_batches,
                    forefront=forefront,
                )
                if i + batch_size < len(transformed_requests):
                    await asyncio.sleep(wait_time_secs)

        # Create and start the task to process remaining batches in the background
        remaining_batches_task = asyncio.create_task(
            _process_remaining_batches(),
            name='request_queue_process_remaining_batches_task',
        )

        # Track the task so `is_finished` knows more requests may still arrive; untrack it when done.
        self._add_requests_tasks.append(remaining_batches_task)
        remaining_batches_task.add_done_callback(lambda _: self._add_requests_tasks.remove(remaining_batches_task))

        # Wait for all tasks to finish if requested
        if wait_for_all_requests_to_be_added:
            await wait_for_all_tasks_for_finish(
                (remaining_batches_task,),
                logger=logger,
                timeout=wait_for_all_requests_to_be_added_timeout,
            )

    async def fetch_next_request(self) -> Request | None:
        """Fetch the next request to be processed from the storage client.

        After the request has been processed successfully, call `RequestQueue.mark_request_as_handled` to mark
        it as handled. If processing failed, call `RequestQueue.reclaim_request` instead, so that the request
        is handed out again by a later call to `fetch_next_request`.

        A `None` result only means that no request is pending right now; it does not mean that the queue
        processing finished. Use `RequestQueue.is_finished` to check whether all requests were finished.

        Returns:
            The next request to process, or `None` if there are no more pending requests.
        """
        next_request = await self._client.fetch_next_request()
        return next_request

    async def get_request(self, unique_key: str) -> Request | None:
        """Look up a specific request in the queue by its unique key.

        Args:
            unique_key: Unique key of the request to retrieve.

        Returns:
            The request with the specified unique key, or `None` if no such request exists.
        """
        found = await self._client.get_request(unique_key)
        return found

    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Report to the storage client that a request was processed successfully.

        Call this once a request has been processed. A handled request is removed from the queue and is
        not returned by subsequent calls to `fetch_next_request`.

        Args:
            request: The request to mark as handled.

        Returns:
            Information about the queue operation.
        """
        outcome = await self._client.mark_request_as_handled(request)
        return outcome

    async def reclaim_request(
        self,
        request: Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Hand a failed request back to the queue so that it can be processed again.

        The reclaimed request is returned again by a subsequent call to `RequestQueue.fetch_next_request`.

        Args:
            request: The request to return to the queue.
            forefront: If true, the request will be added to the beginning of the queue.
                Otherwise, it will be added to the end.

        Returns:
            Information about the queue operation.
        """
        outcome = await self._client.reclaim_request(request, forefront=forefront)
        return outcome

    async def is_empty(self) -> bool:
        """Ask the storage client whether the request queue is empty.

        An empty queue has no requests in it, neither pending nor being processed. This alone does not mean
        that the crawl is over, because tasks that add further requests to the queue may still be running.

        Returns:
            True if the request queue is empty, False otherwise.
        """
        queue_is_empty = await self._client.is_empty()
        return queue_is_empty

    async def is_finished(self) -> bool:
        """Tell whether the request queue is finished.

        The queue is finished when no background task is still adding requests and the queue itself is empty.
        This is the definitive way to check if a crawling operation is complete.

        Returns:
            True if the request queue is finished (empty and no pending add operations), False otherwise.
        """
        # Pending background additions mean more requests may still show up.
        if self._add_requests_tasks:
            logger.debug('Background add requests tasks are still in progress.')
            return False

        queue_is_empty = await self.is_empty()
        if not queue_is_empty:
            return False

        logger.debug('The request queue is empty.')
        return True

    async def _process_batch(
        self,
        batch: Sequence[Request],
        *,
        base_retry_wait: timedelta,
        attempt: int = 1,
        forefront: bool = False,
    ) -> None:
        """Process a batch of requests with automatic retry mechanism."""
        max_attempts = 5
        response = await self._client.add_batch_of_requests(batch, forefront=forefront)

        if response.unprocessed_requests:
            logger.debug(f'Following requests were not processed: {response.unprocessed_requests}.')
            if attempt > max_attempts:
                logger.warning(
                    f'Following requests were not processed even after {max_attempts} attempts:\n'
                    f'{response.unprocessed_requests}'
                )
            else:
                logger.debug('Retry to add requests.')
                unprocessed_requests_unique_keys = {request.unique_key for request in response.unprocessed_requests}
                retry_batch = [request for request in batch if request.unique_key in unprocessed_requests_unique_keys]
                await asyncio.sleep((base_retry_wait * attempt).total_seconds())
                await self._process_batch(retry_batch, base_retry_wait=base_retry_wait, attempt=attempt + 1)

        request_count = len(batch) - len(response.unprocessed_requests)

        if request_count:
            logger.debug(
                f'Added {request_count} requests to the queue. Processed requests: {response.processed_requests}'
            )


================================================
FILE: src/crawlee/storages/_storage_instance_manager.py
================================================
from __future__ import annotations

from asyncio import Lock
from collections import defaultdict
from collections.abc import Coroutine, Hashable
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, TypeVar
from weakref import WeakValueDictionary

from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient

from ._utils import validate_storage_name

if TYPE_CHECKING:
    from ._base import Storage

T = TypeVar('T', bound='Storage')


@dataclass
class _StorageCache:
    """Cache for storage instances."""

    by_id: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict))
    )
    """Cache for storage instances by ID. Example: by_id[Dataset]['some_id']['some_additional_cache_key']."""

    by_name: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict))
    )
    """Cache for storage instances by name. Example: by_name[Dataset]['some_name']['some_additional_cache_key']"""

    by_alias: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict))
    )
    """Cache for storage instances by alias. Example: by_alias[Dataset]['some_alias']['some_additional_cache_key']"""

    def remove_from_cache(self, storage_instance: Storage) -> None:
        """Remove a storage instance from the cache.

        Args:
            storage_instance: The storage instance to remove.
        """
        storage_type = type(storage_instance)

        # Remove from ID cache
        for additional_key in self.by_id[storage_type][storage_instance.id]:
            del self.by_id[storage_type][storage_instance.id][additional_key]
            break

        # Remove from name cache or alias cache. It can never be in both.
        if storage_instance.name is not None:
            for additional_key in self.by_name[storage_type][storage_instance.name]:
                del self.by_name[storage_type][storage_instance.name][additional_key]
                break
        else:
            for alias_key in self.by_alias[storage_type]:
                for additional_key in self.by_alias[storage_type][alias_key]:
                    del self.by_alias[storage_type][alias_key][additional_key]
                    break


ClientOpenerCoro = Coroutine[None, None, DatasetClient | KeyValueStoreClient | RequestQueueClient]
"""Type alias for the client opener function."""


class StorageInstanceManager:
    """Manager for caching and managing storage instances.

    This class centralizes the caching logic for all storage types (Dataset, KeyValueStore, RequestQueue)
    and provides a unified interface for opening and managing storage instances.

    Instances are cached by ID and, when applicable, by name and by alias. Each cache entry is further keyed by
    an additional cache key supplied by the storage client.
    """

    _DEFAULT_STORAGE_ALIAS = '__default__'
    """Reserved alias for default unnamed storage."""

    def __init__(self) -> None:
        """Create a manager with an empty cache and no opener locks."""
        self._cache: _StorageCache = _StorageCache()
        # Locks are referenced only weakly here, so an entry disappears once nothing else holds its lock.
        self._opener_locks: WeakValueDictionary[tuple, Lock] = WeakValueDictionary()

    async def open_storage_instance(
        self,
        cls: type[T],
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        client_opener_coro: ClientOpenerCoro,
        storage_client_cache_key: Hashable = '',
    ) -> T:
        """Open a storage instance with caching support.

        If none of `id`, `name` and `alias` is given, the reserved default alias is used. The coroutine passed as
        `client_opener_coro` is always closed before this method returns or raises, whether or not it was awaited.

        Args:
            cls: The storage class to instantiate.
            id: Storage ID.
            name: Storage name. (global scope, persists across runs). Name can only contain letters "a" through "z",
                the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
                (e.g. "my-value-1").
            alias: Storage alias (run scope, creates unnamed storage).
            client_opener_coro: Coroutine to open the storage client when storage instance not found in cache.
            storage_client_cache_key: Additional optional key from storage client to differentiate cache entries.

        Returns:
            The storage instance.

        Raises:
            ValueError: If multiple parameters out of `id`, `name`, and `alias` are specified, if `name` equals
                the reserved default alias, if `name` fails validation, or if the requested name/alias collides
                with an already cached alias/name.
            RuntimeError: If a cached instance is not an instance of `cls`.
        """
        try:
            # The reserved default alias must not be usable as a regular storage name.
            if name == self._DEFAULT_STORAGE_ALIAS:
                raise ValueError(
                    f'Storage name cannot be "{self._DEFAULT_STORAGE_ALIAS}" as it is reserved for default alias.'
                )

            # Validate input parameters.
            raise_if_too_many_kwargs(id=id, name=name, alias=alias)

            # Auto-set alias='__default__' when no parameters are specified.
            if not any([name, alias, id]):
                alias = self._DEFAULT_STORAGE_ALIAS

            # Check cache without lock first for performance.
            if cached_instance := self._get_from_cache(
                cls,
                id=id,
                name=name,
                alias=alias,
                storage_client_cache_key=storage_client_cache_key,
            ):
                return cached_instance

            # Validate storage name
            if name is not None:
                validate_storage_name(name)

            # Acquire lock for this opener
            # The lock is specific to the storage class, the identifier used and the storage client cache key.
            opener_lock_key = (cls, str(id or name or alias), storage_client_cache_key)
            if not (lock := self._opener_locks.get(opener_lock_key)):
                lock = Lock()
                self._opener_locks[opener_lock_key] = lock

            async with lock:
                # Another task could have created the storage while we were waiting for the lock - check if that
                # happened
                if cached_instance := self._get_from_cache(
                    cls,
                    id=id,
                    name=name,
                    alias=alias,
                    storage_client_cache_key=storage_client_cache_key,
                ):
                    return cached_instance

                # Check for conflicts between named and alias storages
                self._check_name_alias_conflict(
                    cls,
                    name=name,
                    alias=alias,
                    storage_client_cache_key=storage_client_cache_key,
                )

                # Create new instance
                client: KeyValueStoreClient | DatasetClient | RequestQueueClient
                client = await client_opener_coro

                metadata = await client.get_metadata()

                # The ID and name of the instance come from the client metadata, not from the arguments.
                instance = cls(client, metadata.id, metadata.name)  # type: ignore[call-arg]
                instance_name = getattr(instance, 'name', None)

                # Cache the instance.
                # Note: No awaits in this section. All cache entries must be written
                # atomically to ensure pre-checks outside the lock see consistent state.

                # Always cache by id.
                self._cache.by_id[cls][instance.id][storage_client_cache_key] = instance

                # Cache named storage.
                if instance_name is not None:
                    self._cache.by_name[cls][instance_name][storage_client_cache_key] = instance

                # Cache unnamed storage.
                if alias is not None:
                    self._cache.by_alias[cls][alias][storage_client_cache_key] = instance

                return instance

        finally:
            # Make sure the client opener is closed.
            # If it was awaited, then closing is no operation, if it was not awaited, this is the cleanup.
            client_opener_coro.close()

    def remove_from_cache(self, storage_instance: Storage) -> None:
        """Remove a storage instance from the cache.

        Args:
            storage_instance: The storage instance to remove.
        """
        self._cache.remove_from_cache(storage_instance)

    def clear_cache(self) -> None:
        """Clear all cached storage instances."""
        # The cache object is replaced by a fresh one; the opener locks are left as they are.
        self._cache = _StorageCache()

    def _get_from_cache(
        self,
        cls: type[T],
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        storage_client_cache_key: Hashable = '',
    ) -> T | None:
        """Get a storage instance from the cache.

        The caches are consulted in the order ID, name, alias; the first hit is returned.

        Args:
            cls: The expected storage class; also the first-level cache key.
            id: Storage ID to look up, if any.
            name: Storage name to look up, if any.
            alias: Storage alias to look up, if any.
            storage_client_cache_key: Additional cache key of the storage client.

        Returns:
            The cached instance, or `None` when no cache holds a matching entry.

        Raises:
            RuntimeError: If a matching entry exists but is not an instance of `cls`.
        """
        if id is not None and (cached_instance := self._cache.by_id[cls][id].get(storage_client_cache_key)):
            if isinstance(cached_instance, cls):
                return cached_instance
            raise RuntimeError('Cached instance type mismatch.')

        if name is not None and (cached_instance := self._cache.by_name[cls][name].get(storage_client_cache_key)):
            if isinstance(cached_instance, cls):
                return cached_instance
            raise RuntimeError('Cached instance type mismatch.')

        if alias is not None and (cached_instance := self._cache.by_alias[cls][alias].get(storage_client_cache_key)):
            if isinstance(cached_instance, cls):
                return cached_instance
            raise RuntimeError('Cached instance type mismatch.')

        return None

    def _check_name_alias_conflict(
        self,
        cls: type[T],
        *,
        name: str | None = None,
        alias: str | None = None,
        storage_client_cache_key: Hashable = '',
    ) -> None:
        """Check for conflicts between named and alias storages.

        Args:
            cls: The storage class whose caches are checked.
            name: Name of the storage about to be created, if any.
            alias: Alias of the storage about to be created, if any.
            storage_client_cache_key: Additional cache key of the storage client.

        Raises:
            ValueError: If `alias` equals the name of a cached named storage, or `name` equals the alias of
                a cached alias storage, for the same storage class and storage client cache key.
        """
        # An alias must not shadow an already cached named storage.
        if alias and (self._cache.by_name[cls][alias].get(storage_client_cache_key)):
            raise ValueError(
                f'Cannot create alias storage "{alias}" because a named storage with the same name already exists. '
                f'Use a different alias or drop the existing named storage first.'
            )

        # A name must not shadow an already cached alias storage.
        if name and (self._cache.by_alias[cls][name].get(storage_client_cache_key)):
            raise ValueError(
                f'Cannot create named storage "{name}" because an alias storage with the same name already exists. '
                f'Use a different name or drop the existing alias storage first.'
            )


================================================
FILE: src/crawlee/storages/_utils.py
================================================
import re

NAME_REGEX = re.compile(r'^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])$')


def validate_storage_name(name: str | None) -> None:
    if name and not NAME_REGEX.match(name):
        raise ValueError(
            f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through'
            '"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")'
        )


================================================
FILE: src/crawlee/storages/py.typed
================================================


================================================
FILE: tests/__init__.py
================================================


================================================
FILE: tests/e2e/__init__.py
================================================


================================================
FILE: tests/e2e/conftest.py
================================================
import subprocess
from pathlib import Path

import pytest
from _pytest.config import Config
from filelock import FileLock

_CRAWLEE_ROOT_PATH = Path(__file__).parent.parent.parent.resolve()


def pytest_configure(config: Config) -> None:
    """Register the markers used to parametrize the e2e tests.

    Args:
        config: The pytest configuration object the marker lines are added to.
    """
    marker_names = (
        'httpx',
        'curl_impersonate',
        'impit',
        'playwright',
        'playwright_camoufox',
        'playwright_chrome',
        'playwright_firefox',
        'playwright_webkit',
        'parsel',
        'beautifulsoup',
        'uv',
        'poetry',
        'pip',
    )

    # Every marker shares the same description; only the marker name differs.
    marker_lines = [f'{marker_name}: Integration test parameter marker.' for marker_name in marker_names]
    for marker_line in marker_lines:
        config.addinivalue_line('markers', marker_line)


@pytest.fixture(scope='session')
def crawlee_wheel_path(tmp_path_factory: pytest.TempPathFactory, testrun_uid: str) -> Path:
    """Build the package wheel if it hasn't been built yet, and return the path to the wheel.

    Args:
        tmp_path_factory: Pytest factory of temporary directories.
        testrun_uid: Identifier of the current test run, used to name the "already built" indicator file.

    Returns:
        Path to the built wheel inside the `dist` directory of the repository.

    Raises:
        RuntimeError: If the package version cannot be found in `pyproject.toml`.
        subprocess.CalledProcessError: If building the wheel fails.
    """
    # Both the lock file and the indicator file have to live in a directory that all the runners share. As in
    # the pytest-xdist recipe for session fixtures, that is the parent of the base temp directory; the base temp
    # directory itself is expected to differ between pytest-xdist workers.
    shared_tmp_dir = tmp_path_factory.getbasetemp().parent

    # Make sure the wheel is not being built concurrently across all the pytest-xdist runners,
    # through locking the building process with a temp file.
    with FileLock(shared_tmp_dir / 'crawlee_wheel_build.lock'):
        # Make sure the wheel is built exactly once across all the pytest-xdist runners,
        # through an indicator file saying that the wheel was already built.
        was_wheel_built_this_test_run_file = shared_tmp_dir / f'wheel_was_built_in_run_{testrun_uid}'
        if not was_wheel_built_this_test_run_file.exists():
            subprocess.run(
                args='python -m build',
                cwd=_CRAWLEE_ROOT_PATH,
                shell=True,
                check=True,
                capture_output=True,
            )
            was_wheel_built_this_test_run_file.touch()

        # Read the current package version, necessary for getting the right wheel filename.
        pyproject_toml_file = (_CRAWLEE_ROOT_PATH / 'pyproject.toml').read_text(encoding='utf-8')
        for line in pyproject_toml_file.splitlines():
            if line.startswith('version = '):
                # The version may be quoted with either double or single quotes.
                delim = '"' if '"' in line else "'"
                crawlee_version = line.split(delim)[1]
                break
        else:
            raise RuntimeError('Unable to find version string.')

        wheel_path = _CRAWLEE_ROOT_PATH / 'dist' / f'crawlee-{crawlee_version}-py3-none-any.whl'

        # Just to be sure.
        assert wheel_path.exists()

        return wheel_path


================================================
FILE: tests/e2e/project_template/test_static_crawlers_templates.py
================================================
import os
import re
import subprocess
from pathlib import Path
from typing import Literal

import pytest
from apify_client import ApifyClientAsync
from cookiecutter.main import cookiecutter

from crawlee._cli import default_start_url, template_directory
from crawlee._utils.crypto import crypto_random_object_id
from tests.e2e.project_template.utils import patch_crawlee_version_in_project

# To run these tests locally, make sure you have apify-cli installed and available in the path.
# https://docs.apify.com/cli/docs/installation


@pytest.mark.parametrize(
    'crawler_type',
    [
        pytest.param('playwright-camoufox', marks=pytest.mark.playwright_camoufox),
        pytest.param('playwright-chrome', marks=pytest.mark.playwright_chrome),
        pytest.param('playwright-firefox', marks=pytest.mark.playwright_firefox),
        pytest.param('playwright-webkit', marks=pytest.mark.playwright_webkit),
        pytest.param('playwright', marks=pytest.mark.playwright),
        pytest.param('parsel', marks=pytest.mark.parsel),
        pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup),
    ],
)
@pytest.mark.parametrize(
    'http_client',
    [
        pytest.param('httpx', marks=pytest.mark.httpx),
        pytest.param('curl-impersonate', marks=pytest.mark.curl_impersonate),
        pytest.param('impit', marks=pytest.mark.impit),
    ],
)
@pytest.mark.parametrize(
    'package_manager',
    [
        pytest.param('pip', marks=pytest.mark.pip),
        pytest.param('uv', marks=pytest.mark.uv),
        pytest.param('poetry', marks=pytest.mark.poetry),
    ],
)
async def test_static_crawler_actor_at_apify(
    tmp_path: Path,
    crawlee_wheel_path: Path,
    package_manager: Literal['pip', 'uv', 'poetry'],
    crawler_type: str,
    http_client: str,
) -> None:
    """Generate a project from the template, push it to Apify as an actor, run it and check the run log.

    The test is parametrized over crawler type, HTTP client and package manager; each parameter carries
    a pytest mark of the same name. It requires the `apify` CLI on the path and the
    `APIFY_TEST_USER_API_TOKEN` environment variable.
    """
    # Generate new actor name
    actor_name = f'crawlee-python-template-e2e-test-{crypto_random_object_id(8).lower()}'

    # Create project from template
    cookiecutter(
        template=str(template_directory),
        no_input=True,
        extra_context={
            'project_name': actor_name,
            'package_manager': package_manager,
            'crawler_type': crawler_type,
            'http_client': http_client,
            'enable_apify_integration': True,
            'start_url': default_start_url,
            'install_project': False,
        },
        output_dir=str(tmp_path),
    )

    # Make the generated project use crawlee built from the current sources.
    patch_crawlee_version_in_project(
        project_path=tmp_path / actor_name, wheel_path=crawlee_wheel_path, package_manager=package_manager
    )

    # Print apify version for debugging purposes in rare cases of CLI failures
    subprocess.run(['apify', '--version'], check=True)  # noqa: ASYNC221, S607

    # Build actor using sequence of cli commands as the user would
    subprocess.run(  # noqa: ASYNC221, S603
        ['apify', 'login', '-t', os.environ['APIFY_TEST_USER_API_TOKEN']],  # noqa: S607
        capture_output=True,
        check=True,
        cwd=tmp_path / actor_name,
    )
    subprocess.run(['apify', 'init', '-y', actor_name], capture_output=True, check=True, cwd=tmp_path / actor_name)  # noqa: ASYNC221, S603, S607

    # `check=False`: a failed push must not raise here, so that the actor ID can still be extracted from the
    # output and the actor deleted below. The return code is asserted inside the `try` block.
    build_process = subprocess.run(['apify', 'push'], capture_output=True, check=False, cwd=tmp_path / actor_name)  # noqa: ASYNC221, S607
    # Get actor ID from build log
    actor_id_regexp = re.compile(r'https:\/\/console\.apify\.com\/actors\/(.*)#\/builds\/\d*\.\d*\.\d*')

    # The build log is read from stderr of the `apify push` command.
    if match := re.findall(actor_id_regexp, build_process.stderr.decode()):
        actor_id = match[0]
    else:
        raise AssertionError(f'Failed to find actor id in build log: {build_process.stderr.decode()}')

    client = ApifyClientAsync(token=os.getenv('APIFY_TEST_USER_API_TOKEN'))
    actor = client.actor(actor_id)

    # Run actor
    try:
        assert build_process.returncode == 0
        started_run_data = await actor.start(memory_mbytes=8192)
        actor_run = client.run(started_run_data['id'])

        finished_run_data = await actor_run.wait_for_finish()
        actor_run_log = await actor_run.log().get()
    finally:
        # Delete the actor once it is no longer needed.
        await actor.delete()

    # Asserts
    additional_run_info = f'Full actor run log: {actor_run_log}'
    assert actor_run_log
    assert finished_run_data
    assert finished_run_data['status'] == 'SUCCEEDED', additional_run_info
    assert (
        'Crawler.stop() was called with following reason: The crawler has reached its limit of 10 requests per crawl.'
    ) in actor_run_log, additional_run_info
    # The last reported `requests_finished` value in the log has to be at least 10.
    assert int(re.findall(r'requests_finished\s*│\s*(\d*)', actor_run_log)[-1]) >= 10, additional_run_info


================================================
FILE: tests/e2e/project_template/utils.py
================================================
import re
import shutil
import subprocess
from pathlib import Path
from typing import Literal


def patch_crawlee_version_in_project(
    project_path: Path, wheel_path: Path, package_manager: Literal['pip', 'uv', 'poetry']
) -> None:
    """Make the generated project depend on crawlee built from the current sources instead of the PyPI release.

    Args:
        project_path: Root directory of the generated project.
        wheel_path: Path to the prepared crawlee wheel file.
        package_manager: Package manager the project was generated for.
    """
    # The wheel has to be part of the project directory.
    shutil.copy(wheel_path, project_path)

    # Poetry and uv projects are described by pyproject.toml, pip projects by requirements.txt.
    uses_pyproject_toml = package_manager in {'poetry', 'uv'}
    patch_project = (
        _patch_crawlee_version_in_pyproject_toml_based_project
        if uses_pyproject_toml
        else _patch_crawlee_version_in_requirements_txt_based_project
    )
    patch_project(project_path, wheel_path)


def _patch_crawlee_version_in_requirements_txt_based_project(project_path: Path, wheel_path: Path) -> None:
    """Make a requirements.txt based project install crawlee from the wheel file instead of from PyPI.

    Every line of `requirements.txt` that mentions crawlee is replaced by a reference to the wheel file (keeping
    the extras, if any), and the Dockerfile is extended to copy the wheel into the image and force-reinstall it.

    Args:
        project_path: Root directory of the project; has to contain `requirements.txt` and `Dockerfile`.
        wheel_path: Path to the crawlee wheel file; only its file name is used.
    """
    requirements_path = project_path / 'requirements.txt'
    with requirements_path.open() as f:
        requirement_lines = f.readlines()

    # Get any extras. The dependency may be declared without extras, in which case nothing is appended.
    extras_match = re.search(r'crawlee(\[.*\])', ''.join(requirement_lines))
    crawlee_extras = extras_match.group(1) if extras_match else ''

    # Modify requirements.txt to use crawlee from wheel file instead of from Pypi
    modified_lines = [
        f'./{wheel_path.name}{crawlee_extras}\n' if 'crawlee' in line else line for line in requirement_lines
    ]
    with requirements_path.open('w') as f:
        f.write(''.join(modified_lines))

    # Patch the dockerfile to have wheel file available
    dockerfile_path = project_path / 'Dockerfile'
    with dockerfile_path.open() as f:
        modified_lines = []
        for line in f:
            modified_lines.append(line)
            if line.startswith('COPY requirements.txt ./'):
                modified_lines.extend(
                    [
                        f'COPY {wheel_path.name} ./\n',
                        # If no crawlee version bump, pip might be lazy and take existing pre-installed crawlee version,
                        # make sure that one is patched as well.
                        f'RUN pip install ./{wheel_path.name}{crawlee_extras} --force-reinstall\n',
                    ]
                )
    with dockerfile_path.open('w') as f:
        f.write(''.join(modified_lines))


def _patch_crawlee_version_in_pyproject_toml_based_project(project_path: Path, wheel_path: Path) -> None:
    """Ensure that the test is using current version of the crawlee from the source and not from Pypi.

    The Dockerfile is extended, right after the line copying `pyproject.toml`, with commands that copy the wheel
    into the image, force-reinstall it and re-add it as a project dependency. The package manager (uv or poetry)
    is detected from the lock file mentioned on that line, and its `lock` command is run in the project directory.

    Args:
        project_path: Root directory of the project; has to contain `pyproject.toml` and `Dockerfile`.
        wheel_path: Path to the crawlee wheel file; only its file name is used.

    Raises:
        RuntimeError: If the line copying `pyproject.toml` mentions neither `uv.lock` nor `poetry.lock`.
        subprocess.CalledProcessError: If creating the lock file fails.
    """
    pyproject_path = project_path / 'pyproject.toml'
    with pyproject_path.open() as f:
        pyproject = f.read()

    # Get any extras. The dependency may be declared without extras, in which case nothing is appended.
    extras_match = re.search(r'crawlee(\[.*\])', pyproject)
    crawlee_extras = extras_match.group(1) if extras_match else ''

    # Inject crawlee wheel file to the docker image and update project to depend on it.
    dockerfile_path = project_path / 'Dockerfile'
    with dockerfile_path.open() as f:
        modified_lines = []
        for line in f:
            modified_lines.append(line)
            if line.startswith('COPY pyproject.toml'):
                if 'uv.lock' in line:
                    package_manager = 'uv'
                elif 'poetry.lock' in line:
                    package_manager = 'poetry'
                else:
                    raise RuntimeError('This does not look like a uv or poetry based project.')

                # Create lock file that is expected by the docker to exist (even though it will be patched
                # in the docker).
                subprocess.run(
                    args=[package_manager, 'lock'],
                    cwd=str(project_path),
                    check=True,
                    capture_output=True,
                )

                # Add command to copy .whl to the docker image and update project with it.
                # Patching in docker file due to the poetry not properly supporting relative paths for wheel packages
                # and so the absolute path (in the container) is generated when running `add` command in the container.
                modified_lines.extend(
                    [
                        f'COPY {wheel_path.name} ./\n',
                        # If no crawlee version bump, poetry might be lazy and take existing pre-installed crawlee
                        # version, make sure that one is patched as well.
                        f'RUN pip install ./{wheel_path.name}{crawlee_extras} --force-reinstall\n',
                        f'RUN {package_manager} add ./{wheel_path.name}{crawlee_extras}\n',
                        f'RUN {package_manager} lock\n',
                    ]
                )
    with dockerfile_path.open('w') as f:
        f.write(''.join(modified_lines))


================================================
FILE: tests/unit/README.md
================================================
# Unit tests

Some tests may exhibit flaky behavior in CI. The reason for flaky behavior should be understood as it can indicate a bug in the code or a design flaw in the test. There are other reasons related to test execution, such as some tests that are not (or can not be) properly isolated, or limited resource constraints of the test executor.

Here are some suggested approaches to mitigate flakiness, sorted in the order of preference:
  - Investigate the root cause and fix the code or test.
  - Apply one of the pytest marks to mitigate the flakiness:
    - `@run_alone_on_mac` - Test with such mark will run alone on macOS executor in CI (normally several tests run in parallel, which can cause resource-sensitive tests to fail.) Use for resource sensitive tests that are known to be flaky only on macOS.
    - `@run_alone` - Test with such mark will run alone on any executor. Use for resource sensitive tests that are known to be flaky on all platforms or for tests that can not be run in parallel with other test due to their design (This should be extremely rare).
    - `@pytest.mark.flaky` - Test with such mark will be retried several times if it fails. Use for tests that are known to be flaky, but the reason for flakiness is not understood or can not be easily mitigated.
    - `@pytest.mark.skip` - Test with such mark will be skipped. Use when none of the above approaches mitigate the test flakiness. Marking test as skipped should be a last resort, as it can hide potential bugs and give false sense of security. Skipped tests should be tracked in GitHub issue.


================================================
FILE: tests/unit/__init__.py
================================================


================================================
FILE: tests/unit/_autoscaling/test_autoscaled_pool.py
================================================
# ruff: noqa: FBT003 # Boolean positional value in function call

from __future__ import annotations

import asyncio
from contextlib import suppress
from datetime import datetime, timedelta, timezone
from itertools import chain, repeat
from typing import TYPE_CHECKING, TypeVar, cast
from unittest.mock import Mock

import pytest

from crawlee._autoscaling import AutoscaledPool, SystemStatus
from crawlee._autoscaling._types import LoadRatioInfo, SystemInfo
from crawlee._types import ConcurrencySettings
from crawlee._utils.time import measure_time

if TYPE_CHECKING:
    from collections.abc import Awaitable


@pytest.fixture
def system_status() -> SystemStatus | Mock:
    """Provide a mock restricted to the `SystemStatus` interface."""
    status_mock = Mock(spec=SystemStatus)
    return status_mock


T = TypeVar('T')


def future(value: T, /) -> Awaitable[T]:
    """Wrap a value in an awaitable that is already resolved to it.

    Args:
        value: The result the returned awaitable resolves to.

    Returns:
        An `asyncio.Future` whose result is already set to `value`.
    """
    resolved: asyncio.Future[T] = asyncio.Future()
    resolved.set_result(value)
    return resolved


@pytest.mark.run_alone
async def test_runs_concurrently(system_status: SystemStatus | Mock) -> None:
    """Check that the pool runs tasks in parallel: ten 0.1s tasks have to finish in under 0.3s."""
    done_count = 0

    async def run() -> None:
        await asyncio.sleep(0.1)
        nonlocal done_count
        done_count += 1

    pool = AutoscaledPool(
        system_status=system_status,
        run_task_function=run,
        # A task is always ready; the pool is finished once ten tasks completed.
        is_task_ready_function=lambda: future(True),
        is_finished_function=lambda: future(done_count >= 10),
        concurrency_settings=ConcurrencySettings(
            min_concurrency=10,
            max_concurrency=10,
        ),
    )

    with measure_time() as elapsed:
        await pool.run()

    # Ten tasks run one after another would take about a second.
    assert elapsed.wall is not None
    assert elapsed.wall < 0.3

    assert done_count >= 10


async def test_abort_works(system_status: SystemStatus | Mock) -> None:
    """Check that `abort()` stops a pool whose tasks would otherwise run for a minute."""

    async def run() -> None:
        await asyncio.sleep(60)

    pool = AutoscaledPool(
        system_status=system_status,
        run_task_function=run,
        # A task is always ready and the pool never reports being finished.
        is_task_ready_function=lambda: future(True),
        is_finished_function=lambda: future(False),
        concurrency_settings=ConcurrencySettings(
            min_concurrency=10,
            max_concurrency=10,
        ),
    )

    with measure_time() as elapsed:
        run_task = asyncio.create_task(pool.run(), name='pool run task')
        # Give the pool time to start all ten tasks before aborting it.
        await asyncio.sleep(0.1)
        assert pool.current_concurrency == 10
        await pool.abort()
        assert pool.current_concurrency == 0
        await run_task

    # Far less than the 60 seconds a single task sleeps for.
    assert elapsed.wall is not None
    assert elapsed.wall < 5


async def test_propagates_exceptions(system_status: SystemStatus | Mock) -> None:
    """Check that an exception raised by a task is re-raised from `pool.run()` before the pool finishes."""
    done_count = 0

    async def run() -> None:
        await asyncio.sleep(0.1)
        nonlocal done_count
        done_count += 1

        # Every task completing after the fifth one crashes.
        if done_count > 5:
            raise RuntimeError('Scheduled crash')

    pool = AutoscaledPool(
        system_status=system_status,
        run_task_function=run,
        is_task_ready_function=lambda: future(True),
        is_finished_function=lambda: future(done_count >= 20),
        concurrency_settings=ConcurrencySettings(
            min_concurrency=10,
            max_concurrency=10,
        ),
    )

    with pytest.raises(RuntimeError, match=r'Scheduled crash'):
        await pool.run()

    # The crash has to stop the pool before the regular finish condition is reached.
    assert done_count < 20


async def test_propagates_exceptions_after_finished(system_status: SystemStatus | Mock) -> None:
    """Check that a task failure is propagated even when the finish condition became true before it failed."""
    started_count = 0

    async def run() -> None:
        nonlocal started_count
        started_count += 1

        # The task fails a second after it started, i.e. after the finish condition below is already true.
        await asyncio.sleep(1)

        raise RuntimeError('Scheduled crash')

    pool = AutoscaledPool(
        system_status=system_status,
        run_task_function=run,
        is_task_ready_function=lambda: future(True),
        # The pool reports being finished as soon as the first task has started.
        is_finished_function=lambda: future(started_count > 0),
        concurrency_settings=ConcurrencySettings(
            min_concurrency=1,
            desired_concurrency=1,
            max_concurrency=1,
        ),
    )

    with pytest.raises(RuntimeError, match=r'Scheduled crash'):
        await pool.run()


# The rerun count has to be passed as `reruns`; an unknown keyword such as `rerun` is not picked up by the plugin
# and the test would fall back to the default number of reruns.
@pytest.mark.flaky(
    reruns=3,
    reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1655.',
)
async def test_autoscales(
    monkeypatch: pytest.MonkeyPatch,
    system_status: SystemStatus | Mock,
) -> None:
    """Check that the desired concurrency scales up while the system is idle and down once it is overloaded.

    The mocked system info reports an idle system for the first 0.5 seconds of the test and an overloaded CPU
    afterwards. The autoscale interval is shortened to 0.1 seconds so that the whole cycle fits into about a second.
    """
    done_count = 0

    async def run() -> None:
        await asyncio.sleep(0.1)
        nonlocal done_count
        done_count += 1

    start = datetime.now(timezone.utc)

    def get_historical_system_info() -> SystemInfo:
        result = SystemInfo(
            cpu_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
            memory_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
            event_loop_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
            client_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
        )

        # 0.5 seconds after the start of the test, pretend the CPU became overloaded
        if result.created_at - start >= timedelta(seconds=0.5):
            result.cpu_info = LoadRatioInfo(limit_ratio=0.9, actual_ratio=1.0)

        return result

    cast('Mock', system_status.get_historical_system_info).side_effect = get_historical_system_info

    # Override AP class attributes using monkeypatch.
    monkeypatch.setattr(AutoscaledPool, '_AUTOSCALE_INTERVAL', timedelta(seconds=0.1))

    pool = AutoscaledPool(
        system_status=system_status,
        run_task_function=run,
        is_task_ready_function=lambda: future(True),
        is_finished_function=lambda: future(False),
        concurrency_settings=ConcurrencySettings(
            min_concurrency=1,
            desired_concurrency=1,
            max_concurrency=4,
        ),
    )

    pool_run_task = asyncio.create_task(pool.run(), name='pool run task')

    try:
        # After 0.2s, there should be an increase in concurrency
        await asyncio.sleep(0.2)
        assert pool.desired_concurrency > 1

        # After 0.5s, the concurrency should reach max concurrency
        await asyncio.sleep(0.3)
        assert pool.desired_concurrency == 4

        # The concurrency should guarantee completion of more than 10 tasks (a single worker would complete ~5)
        assert done_count > 10

        # After 0.7s, the pretend overload should have kicked in and there should be a drop in desired concurrency
        await asyncio.sleep(0.2)
        assert pool.desired_concurrency < 4

        # After a full second, the pool should scale down all the way to 1
        await asyncio.sleep(0.3)
        assert pool.desired_concurrency == 1
    finally:
        # The pool never finishes on its own, so it has to be cancelled explicitly.
        pool_run_task.cancel()
        with suppress(asyncio.CancelledError):
            await pool_run_task


async def test_autoscales_uses_desired_concurrency_ratio(
    monkeypatch: pytest.MonkeyPatch,
    system_status: SystemStatus | Mock,
) -> None:
    """Test that desired concurrency ratio can limit desired concurrency.

    This test creates situation where only one task is ready and then no other task is ever ready.
    This creates situation where the system could scale up desired concurrency, but it will not do so because
    desired_concurrency_ratio=1 means that first the system would have to increase current concurrency to same number as
    desired concurrency and due to no other task ever being ready, it will never happen. Thus desired concurrency will
    stay 2 as was the initial setup, even though other conditions would allow the increase. (max_concurrency=4,
    system being idle).
    """

    async def run() -> None:
        await asyncio.sleep(0.1)

    # The first call reports a ready task; every following call returns the same already resolved `False` future.
    is_task_ready_iterator = chain([future(True)], repeat(future(False)))

    def is_task_ready_function() -> Awaitable[bool]:
        return next(is_task_ready_iterator)

    # All the load ratios stay below their limits for the whole test.
    def get_historical_system_info() -> SystemInfo:
        return SystemInfo(
            cpu_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
            memory_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
            event_loop_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
            client_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
        )

    cast('Mock', system_status.get_historical_system_info).side_effect = get_historical_system_info

    # Override AP class attributes using monkeypatch.
    monkeypatch.setattr(AutoscaledPool, '_AUTOSCALE_INTERVAL', timedelta(seconds=0.1))
    monkeypatch.setattr(AutoscaledPool, '_DESIRED_CONCURRENCY_RATIO', 1)

    pool = AutoscaledPool(
        system_status=system_status,
        run_task_function=run,
        is_task_ready_function=is_task_ready_function,
        is_finished_function=lambda: future(False),
        concurrency_settings=ConcurrencySettings(
            min_concurrency=2,
            desired_concurrency=2,
            max_concurrency=4,
        ),
    )

    pool_run_task = asyncio.create_task(pool.run(), name='pool run task')
    try:
        # Sample the desired concurrency over roughly five autoscale intervals.
        for _ in range(5):
            assert pool.desired_concurrency == 2
            await asyncio.sleep(0.1)

    finally:
        # The pool never finishes on its own, so it has to be cancelled explicitly.
        pool_run_task.cancel()
        with suppress(asyncio.CancelledError):
            await pool_run_task


async def test_max_tasks_per_minute_works(system_status: SystemStatus | Mock) -> None:
    """Check that the `max_tasks_per_minute` setting throttles task execution.

    The pool always reports a ready task and never reports being finished. With a limit of 120 tasks per minute
    the test expects at most one completed task after half a second.
    """
    completed_tasks = 0

    async def run() -> None:
        nonlocal completed_tasks
        await asyncio.sleep(0.1)
        completed_tasks += 1

    rate_limited_settings = ConcurrencySettings(
        min_concurrency=1,
        desired_concurrency=1,
        max_concurrency=1,
        max_tasks_per_minute=120,
    )
    pool = AutoscaledPool(
        system_status=system_status,
        run_task_function=run,
        is_task_ready_function=lambda: future(True),
        is_finished_function=lambda: future(False),
        concurrency_settings=rate_limited_settings,
    )

    background_run = asyncio.create_task(pool.run(), name='pool run task')
    try:
        await asyncio.sleep(0.5)
        assert completed_tasks <= 1
    finally:
        # The pool never reports being finished, so it has to be cancelled explicitly.
        background_run.cancel()
        with suppress(asyncio.CancelledError):
            await background_run


async def test_allows_multiple_run_calls(system_status: SystemStatus | Mock) -> None:
    """Check that the same pool instance can be run to completion more than once."""
    started_tasks = 0

    async def run() -> None:
        nonlocal started_tasks
        started_tasks += 1
        await asyncio.sleep(0.1)

    def is_task_ready() -> Awaitable[bool]:
        return future(started_tasks < 4)

    def is_finished() -> Awaitable[bool]:
        return future(started_tasks >= 4)

    pool = AutoscaledPool(
        system_status=system_status,
        run_task_function=run,
        is_task_ready_function=is_task_ready,
        is_finished_function=is_finished,
        concurrency_settings=ConcurrencySettings(
            min_concurrency=4,
            desired_concurrency=4,
            max_concurrency=4,
        ),
    )

    # Two consecutive runs of the very same instance; the counter starts from zero for each of them.
    for _ in range(2):
        started_tasks = 0
        await pool.run()
        assert started_tasks == 4


================================================
FILE: tests/unit/_autoscaling/test_snapshotter.py
================================================
from __future__ import annotations

import asyncio
import time
from bisect import insort
from datetime import datetime, timedelta, timezone
from logging import getLogger
from math import floor
from typing import TYPE_CHECKING, Any, cast
from unittest import mock
from unittest.mock import MagicMock

import pytest

from crawlee import service_locator
from crawlee._autoscaling import Snapshotter
from crawlee._autoscaling._types import (
    SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD,
    ClientSnapshot,
    CpuSnapshot,
    MemorySnapshot,
)
from crawlee._autoscaling.snapshotter import SortedSnapshotList
from crawlee._utils.byte_size import ByteSize
from crawlee._utils.system import CpuInfo, MemoryInfo, get_memory_info
from crawlee.configuration import Configuration
from crawlee.events import LocalEventManager
from crawlee.events._types import Event, EventSystemInfoData

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator


@pytest.fixture
async def event_manager() -> AsyncGenerator[LocalEventManager, None]:
    """Yield an active `LocalEventManager` after its first automatic system info event has been consumed.

    The system info interval is set to a very long period so that periodic system info events do not interfere
    with the tests.
    """
    manager = LocalEventManager(system_info_interval=timedelta(hours=9999))
    first_event_received = asyncio.Event()

    async def mark_first_event_received(_: EventSystemInfoData) -> None:
        first_event_received.set()

    # The listener is only needed until the first event arrives; it is removed before the manager is handed over.
    manager.on(event=Event.SYSTEM_INFO, listener=mark_first_event_received)

    async with manager:
        await first_event_received.wait()
        manager.off(event=Event.SYSTEM_INFO, listener=mark_first_event_received)

        yield manager


@pytest.fixture
async def snapshotter(event_manager: LocalEventManager) -> AsyncGenerator[Snapshotter, None]:
    """Yield an active `Snapshotter`; the `event_manager` fixture is registered in the service locator first."""
    quarter_memory_config = Configuration(available_memory_ratio=0.25)
    service_locator.set_event_manager(event_manager)

    async with Snapshotter.from_config(quarter_memory_config) as active_snapshotter:
        yield active_snapshotter


@pytest.fixture
def default_cpu_info() -> CpuInfo:
    """Provide a `CpuInfo` with a used ratio of 0.5."""
    half_loaded_cpu = CpuInfo(used_ratio=0.5)
    return half_loaded_cpu


@pytest.fixture
def default_memory_info() -> MemoryInfo:
    """Provide a `MemoryInfo` with 8 GB total, 4 GB current and 5 GB system-wide used size."""
    total = ByteSize.from_gb(8)
    current = ByteSize.from_gb(4)
    system_wide_used = ByteSize.from_gb(5)
    return MemoryInfo(total_size=total, current_size=current, system_wide_used_size=system_wide_used)


@pytest.fixture
def event_system_data_info(default_cpu_info: CpuInfo, default_memory_info: MemoryInfo) -> EventSystemInfoData:
    """Combine the default CPU and memory info fixtures into a single system info event payload."""
    payload = EventSystemInfoData(cpu_info=default_cpu_info, memory_info=default_memory_info)
    return payload


async def test_start_stop_lifecycle() -> None:
    """Check that a `Snapshotter` can be entered and exited as an async context manager."""
    snapshotter = Snapshotter.from_config(Configuration(available_memory_ratio=0.25))

    async with snapshotter:
        pass


async def test_snapshot_cpu(
    snapshotter: Snapshotter, event_system_data_info: EventSystemInfoData, event_manager: LocalEventManager
) -> None:
    """Check that one emitted system info event results in one CPU snapshot with the emitted used ratio."""
    expected_ratio = event_system_data_info.cpu_info.used_ratio

    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_data_info)
    await event_manager.wait_for_all_listeners_to_complete()

    recorded = cast('list[CpuSnapshot]', snapshotter.get_cpu_sample())
    assert len(recorded) == 1
    assert recorded[0].used_ratio == expected_ratio


async def test_snapshot_memory(
    snapshotter: Snapshotter, event_system_data_info: EventSystemInfoData, event_manager: LocalEventManager
) -> None:
    """Check that one emitted system info event results in one memory snapshot with the emitted current size."""
    expected_size = event_system_data_info.memory_info.current_size

    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_data_info)
    await event_manager.wait_for_all_listeners_to_complete()

    recorded = cast('list[MemorySnapshot]', snapshotter.get_memory_sample())
    assert len(recorded) == 1
    assert recorded[0].current_size == expected_size


async def test_snapshot_memory_with_memory_info_sets_system_wide_fields(
    snapshotter: Snapshotter, event_manager: LocalEventManager
) -> None:
    """Check that the system-wide fields of a memory snapshot are taken over from the emitted `MemoryInfo`."""
    emitted_memory_info = MemoryInfo(
        total_size=ByteSize.from_gb(16),
        current_size=ByteSize.from_gb(4),
        system_wide_used_size=ByteSize.from_gb(12),
    )
    payload = EventSystemInfoData(cpu_info=CpuInfo(used_ratio=0.5), memory_info=emitted_memory_info)

    event_manager.emit(event=Event.SYSTEM_INFO, event_data=payload)
    await event_manager.wait_for_all_listeners_to_complete()

    recorded = cast('list[MemorySnapshot]', snapshotter.get_memory_sample())
    assert len(recorded) == 1

    # The snapshot's system-wide used size and system-wide memory size mirror the emitted used and total sizes.
    only_snapshot = recorded[0]
    assert only_snapshot.system_wide_used_size == emitted_memory_info.system_wide_used_size
    assert only_snapshot.system_wide_memory_size == emitted_memory_info.total_size


def test_snapshot_event_loop(snapshotter: Snapshotter) -> None:
    """Check that an active snapshotter already holds exactly one event loop snapshot."""
    # A first event loop snapshot is expected to exist without the test triggering anything.
    sample_size = len(snapshotter.get_event_loop_sample())
    assert sample_size == 1


def test_snapshot_client(snapshotter: Snapshotter) -> None:
    """Check that an active snapshotter already holds exactly one client snapshot."""
    # A first client snapshot is expected to exist without the test triggering anything.
    sample_size = len(snapshotter.get_client_sample())
    assert sample_size == 1


def test_snapshot_client_overloaded() -> None:
    """Check `ClientSnapshot.is_overloaded` for several error counts, always with `max_error_count=2`."""
    # Each case is (error_count, new_error_count, expected overload flag).
    cases = [
        (1, 1, False),
        (2, 1, False),
        (4, 2, False),
        (7, 3, True),
    ]

    for error_count, new_error_count, expected in cases:
        snapshot = ClientSnapshot(error_count=error_count, new_error_count=new_error_count, max_error_count=2)
        assert bool(snapshot.is_overloaded) is expected


@pytest.mark.run_alone
async def test_get_cpu_sample(
    snapshotter: Snapshotter, event_manager: LocalEventManager, default_memory_info: MemoryInfo
) -> None:
    """Check how many CPU snapshots `get_cpu_sample` returns for different sample durations."""
    reference_time = datetime.now(timezone.utc)
    snapshotter._SNAPSHOT_HISTORY = timedelta(hours=10)  # Extend history for testing

    # One event per hour, created five hours ago up to one hour ago.
    events_data: list[EventSystemInfoData] = []
    for hours_ago in (5, 4, 3, 2, 1):
        cpu_info = CpuInfo(used_ratio=0.5, created_at=reference_time - timedelta(hours=hours_ago))
        events_data.append(EventSystemInfoData(cpu_info=cpu_info, memory_info=default_memory_info))

    for event_data in events_data:
        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data)
    await event_manager.wait_for_all_listeners_to_complete()

    # When no sample duration is provided it should return all snapshots
    assert len(snapshotter.get_cpu_sample()) == len(events_data)

    # Pairs of (sample duration, expected number of returned snapshots).
    expectations = [
        (timedelta(hours=0.5), 1),
        (timedelta(hours=2.5), 3),
        (timedelta(hours=10), len(events_data)),
    ]
    for duration, expected_count in expectations:
        assert len(snapshotter.get_cpu_sample(duration)) == expected_count


async def test_methods_raise_error_when_not_active() -> None:
    """Check that sample getters require an active snapshotter and that it cannot be entered twice."""
    snapshotter = Snapshotter.from_config(Configuration(available_memory_ratio=0.25))
    assert snapshotter.active is False

    sample_getters = (
        snapshotter.get_cpu_sample,
        snapshotter.get_memory_sample,
        snapshotter.get_event_loop_sample,
        snapshotter.get_client_sample,
    )

    # Outside of the context manager every getter is expected to raise.
    for get_sample in sample_getters:
        with pytest.raises(RuntimeError, match=r'Snapshotter is not active.'):
            get_sample()

    # Entering the same instance a second time while it is active is expected to raise as well.
    with pytest.raises(RuntimeError, match=r'Snapshotter is already active.'):
        async with snapshotter, snapshotter:
            pass

    async with snapshotter:
        for get_sample in sample_getters:
            get_sample()

        assert snapshotter.active is True


async def test_snapshot_pruning_removes_outdated_records(
    snapshotter: Snapshotter, event_manager: LocalEventManager, default_memory_info: MemoryInfo
) -> None:
    """Check that CPU snapshots older than the snapshot history are dropped, even with out-of-order events."""
    # Only snapshots from the last 2 hours are expected to be kept.
    snapshotter._SNAPSHOT_HISTORY = timedelta(hours=2)

    reference_time = datetime.now(timezone.utc)

    def slow_insort(*args: Any, **kwargs: Any) -> None:
        """Call `insort` after a short blocking delay to provoke an otherwise hard to reproduce race condition."""
        time.sleep(0.05)
        return insort(*args, **kwargs)

    # Out of order timestamps. Snapshotter can not rely on natural ordering.
    hours_ago_in_emit_order = [0, 3, 2, 5]

    with mock.patch('crawlee._autoscaling.snapshotter.insort', side_effect=slow_insort):
        events_data = [
            EventSystemInfoData(
                cpu_info=CpuInfo(used_ratio=0.5, created_at=reference_time - timedelta(hours=hours_ago)),
                memory_info=default_memory_info,
            )
            for hours_ago in hours_ago_in_emit_order
        ]

        for event_data in events_data:
            event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data)
        await event_manager.wait_for_all_listeners_to_complete()

    remaining = cast('list[CpuSnapshot]', snapshotter.get_cpu_sample())

    # Only the two snapshots within the 2 hour history are expected, oldest first.
    assert len(remaining) == 2
    assert remaining[0].created_at == reference_time - timedelta(hours=2)
    assert remaining[1].created_at == reference_time


async def test_memory_load_evaluation_logs_warning_on_high_usage(
    caplog: pytest.LogCaptureFixture,
    event_manager: LocalEventManager,
    default_cpu_info: CpuInfo,
) -> None:
    """Check that high memory usage logs a single warning, which is not repeated for the next identical event."""
    service_locator.set_event_manager(event_manager)
    snapshotter = Snapshotter.from_config(Configuration(memory_mbytes=8192))

    overloaded_memory_info = MemoryInfo(
        total_size=ByteSize.from_gb(8),
        current_size=ByteSize.from_gb(8) * 0.95,  # 95% of 8 GB
        system_wide_used_size=ByteSize.from_gb(7),
    )
    event_data = EventSystemInfoData(cpu_info=default_cpu_info, memory_info=overloaded_memory_info)

    async def emit_and_collect_snapshotter_records() -> list[Any]:
        """Emit the event, wait for the listeners and return the captured records coming from the snapshotter."""
        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data)
        await event_manager.wait_for_all_listeners_to_complete()
        return [record for record in caplog.records if 'snapshotter' in record.pathname.lower()]

    async with snapshotter:
        records = await emit_and_collect_snapshotter_records()

        assert len(records) == 1
        assert records[0].levelname.lower() == 'warning'
        assert 'Memory is critically overloaded' in records[0].msg

        # A second identical event must not add another snapshotter log record.
        records = await emit_and_collect_snapshotter_records()

        assert len(records) == 1


async def test_memory_load_evaluation_silent_on_acceptable_usage(
    monkeypatch: pytest.MonkeyPatch,
    event_manager: LocalEventManager,
    default_cpu_info: CpuInfo,
) -> None:
    """Check that no warning is logged by the snapshotter logger when memory usage is at 80% of 8 GB."""
    warning_spy = MagicMock()
    snapshotter_logger = getLogger('crawlee.autoscaling.snapshotter')
    monkeypatch.setattr(snapshotter_logger, 'warning', warning_spy)

    service_locator.set_event_manager(event_manager)
    snapshotter = Snapshotter.from_config(Configuration(memory_mbytes=8192))

    acceptable_memory_info = MemoryInfo(
        total_size=ByteSize.from_gb(8),
        current_size=ByteSize.from_gb(8) * 0.8,  # 80% of 8 GB
        system_wide_used_size=ByteSize.from_gb(7),
    )
    event_data = EventSystemInfoData(cpu_info=default_cpu_info, memory_info=acceptable_memory_info)

    async with snapshotter:
        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data)
        await event_manager.wait_for_all_listeners_to_complete()

        assert warning_spy.call_count == 0


async def test_snapshots_time_ordered(snapshotter: Snapshotter, event_manager: LocalEventManager) -> None:
    """Check that snapshots are ordered by creation time even when an older event is emitted after a newer one.

    The snapshotter should not trust the order in which events arrive and should look at the events' times instead.
    """
    time_new = datetime.now(tz=timezone.utc)
    time_old = datetime.now(tz=timezone.utc) - timedelta(milliseconds=50)

    def event_data_created_at(creation_time: datetime) -> EventSystemInfoData:
        cpu_info = CpuInfo(used_ratio=0.5, created_at=creation_time)
        memory_info = MemoryInfo(
            current_size=ByteSize(bytes=1),
            created_at=creation_time,
            total_size=ByteSize(bytes=2),
            system_wide_used_size=ByteSize.from_gb(5),
        )
        return EventSystemInfoData(cpu_info=cpu_info, memory_info=memory_info)

    # The newer event is emitted first on purpose.
    for creation_time in (time_new, time_old):
        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data_created_at(creation_time))
    await event_manager.wait_for_all_listeners_to_complete()

    memory_samples = snapshotter.get_memory_sample()
    cpu_samples = snapshotter.get_cpu_sample()

    for samples in (memory_samples, cpu_samples):
        assert samples[0].created_at == time_old
        assert samples[1].created_at == time_new


def test_sorted_snapshot_list_add_maintains_order() -> None:
    """Check that `SortedSnapshotList.add` keeps the items ordered by `created_at` when added out of order."""
    sorted_list = SortedSnapshotList[CpuSnapshot]()

    # Seven snapshots listed from the oldest to the newest; the last one is created exactly at `now`.
    now = datetime.now(timezone.utc)
    used_ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
    seconds_ago = [50, 40, 30, 20, 10, 5, 0]
    snapshots = [
        CpuSnapshot(used_ratio=used_ratio, max_used_ratio=0.95, created_at=now - timedelta(seconds=age))
        for used_ratio, age in zip(used_ratios, seconds_ago)
    ]

    # Insert the snapshots in a shuffled order so that items land at different positions of the list.
    for index in (3, 0, 5, 1, 6, 2, 4):
        sorted_list.add(snapshots[index])

    # The resulting list must match the chronological order in which `snapshots` was defined.
    assert len(sorted_list) == 7
    for i, snapshot in enumerate(sorted_list):
        assert snapshot == snapshots[i], f'Item at index {i} is not correctly sorted'
        if i == 0:
            continue
        prev_time = sorted_list[i - 1].created_at
        curr_time = snapshot.created_at
        assert prev_time <= curr_time, f'Items at indices {i - 1} and {i} are not in chronological order'


@pytest.mark.parametrize('dynamic_memory', [True, False])
async def test_dynamic_memory(
    *,
    default_cpu_info: CpuInfo,
    event_manager: LocalEventManager,
    dynamic_memory: bool,
) -> None:
    """Test the scenario where the total system memory changes between two memory snapshots.

    Two memory events with identical usage are emitted. The first one reports the initial total memory and is
    expected to be overloaded. The second one reports twice as much total memory.

    With `dynamic_memory` the snapshotter is configured with `memory_mbytes=0` and the second sample is expected not
    to be overloaded. Otherwise `memory_mbytes` is set to the initial total memory and the second sample is expected
    to stay overloaded.
    """
    initial_memory_info = get_memory_info()
    initial_total = initial_memory_info.total_size
    usage_ratio = 0.99 * SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD

    if dynamic_memory:
        memory_mbytes = 0
    else:
        memory_mbytes = floor(initial_total.to_mb())

    service_locator.set_event_manager(event_manager)

    config = Configuration(memory_mbytes=memory_mbytes, available_memory_ratio=usage_ratio)
    async with Snapshotter.from_config(config) as snapshotter:
        # Both samples use `initial_total * usage_ratio` as the current and the system-wide used size.
        memory_infos = [
            # Sample with the initial total memory.
            MemoryInfo(
                total_size=initial_total,
                current_size=initial_total * usage_ratio,
                system_wide_used_size=initial_total * usage_ratio,
            ),
            # Same usage as the first sample, but the system reports twice as much total memory.
            MemoryInfo(
                total_size=initial_total * 2,
                current_size=initial_total * usage_ratio,
                system_wide_used_size=initial_total * usage_ratio,
            ),
        ]

        for memory_info in memory_infos:
            payload = EventSystemInfoData(cpu_info=default_cpu_info, memory_info=memory_info)
            event_manager.emit(event=Event.SYSTEM_INFO, event_data=payload)

        await event_manager.wait_for_all_listeners_to_complete()

        memory_samples = snapshotter.get_memory_sample()
        assert len(memory_samples) == 2
        # First sample will be overloaded.
        assert memory_samples[0].is_overloaded
        # Whether the second sample benefits from the increased total memory depends on the configuration.
        assert memory_samples[1].is_overloaded == (not dynamic_memory)


================================================
FILE: tests/unit/_autoscaling/test_system_status.py
================================================
from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING

import pytest

from crawlee._autoscaling import Snapshotter, SystemStatus
from crawlee._autoscaling._types import (
    ClientSnapshot,
    CpuSnapshot,
    EventLoopSnapshot,
    LoadRatioInfo,
    MemorySnapshot,
    SystemInfo,
)
from crawlee._utils.byte_size import ByteSize
from crawlee.configuration import Configuration

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator


@pytest.fixture
async def snapshotter() -> AsyncGenerator[Snapshotter, None]:
    """Yield an active `Snapshotter` created from a configuration with `available_memory_ratio=0.25`."""
    quarter_memory_config = Configuration(available_memory_ratio=0.25)

    async with Snapshotter.from_config(quarter_memory_config) as active_snapshotter:
        yield active_snapshotter


@pytest.fixture
def now() -> datetime:
    """Provide the current timezone-aware UTC time, shared by all snapshots created within a single test."""
    current_utc_time = datetime.now(tz=timezone.utc)
    return current_utc_time


async def test_start_stop_lifecycle() -> None:
    """Check that both system info getters can be called on a `SystemStatus` backed by an active snapshotter."""
    snapshotter_context = Snapshotter.from_config(Configuration(available_memory_ratio=0.25))

    async with snapshotter_context as snapshotter:
        status = SystemStatus(snapshotter)
        status.get_current_system_info()
        status.get_historical_system_info()


def test_cpu_is_overloaded(snapshotter: Snapshotter, now: datetime) -> None:
    """Check the CPU load ratio for injected snapshots that are expected to exceed a threshold of 0.5."""
    system_status = SystemStatus(snapshotter, cpu_overload_threshold=0.5)

    # Pairs of (used ratio, snapshot age in minutes), ordered from the oldest to the newest snapshot.
    cpu_history = [(0.6, 3), (0.7, 2), (0.8, 1), (0.9, 0)]
    system_status._snapshotter._cpu_snapshots = Snapshotter._get_sorted_list_by_created_at(
        [
            CpuSnapshot(used_ratio=used_ratio, max_used_ratio=0.75, created_at=now - timedelta(minutes=minutes_ago))
            for used_ratio, minutes_ago in cpu_history
        ]
    )

    load_info = system_status._is_cpu_overloaded()

    assert load_info == LoadRatioInfo(limit_ratio=0.5, actual_ratio=0.667)
    assert load_info.is_overloaded is True


def test_cpu_is_not_overloaded(snapshotter: Snapshotter, now: datetime) -> None:
    """Check the CPU load ratio for injected snapshots that are expected to stay below a threshold of 0.5."""
    system_status = SystemStatus(snapshotter, cpu_overload_threshold=0.5)

    # Pairs of (used ratio, snapshot age in minutes), ordered from the oldest to the newest snapshot.
    cpu_history = [(0.7, 3), (0.8, 2), (0.6, 1), (0.5, 0)]
    system_status._snapshotter._cpu_snapshots = Snapshotter._get_sorted_list_by_created_at(
        [
            CpuSnapshot(used_ratio=used_ratio, max_used_ratio=0.75, created_at=now - timedelta(minutes=minutes_ago))
            for used_ratio, minutes_ago in cpu_history
        ]
    )

    load_info = system_status._is_cpu_overloaded()

    assert load_info == LoadRatioInfo(limit_ratio=0.5, actual_ratio=0.333)
    assert load_info.is_overloaded is False


def test_get_system_info(snapshotter: Snapshotter, now: datetime) -> None:
    """Check current and historical system info computed from manually injected snapshots.

    CPU, memory, event loop and client snapshots are written directly into the snapshotter. The test then compares
    both `get_current_system_info` and `get_historical_system_info` with the expected load ratios.
    """
    overload_threshold = 0.5
    system_status = SystemStatus(
        snapshotter,
        max_snapshot_age=timedelta(minutes=1),
        cpu_overload_threshold=overload_threshold,
        memory_overload_threshold=overload_threshold,
        event_loop_overload_threshold=overload_threshold,
        client_overload_threshold=overload_threshold,
    )

    def memory_snapshot(current_gb: int, max_gb: int, seconds_ago: int) -> MemorySnapshot:
        """Build a memory snapshot without any system-wide information."""
        return MemorySnapshot(
            current_size=ByteSize.from_gb(current_gb),
            max_memory_size=ByteSize.from_gb(max_gb),
            max_used_memory_ratio=0.8,
            created_at=now - timedelta(seconds=seconds_ago),
            system_wide_used_size=None,
            system_wide_memory_size=None,
        )

    # CPU snapshots as pairs of (used ratio, age in minutes).
    system_status._snapshotter._cpu_snapshots = Snapshotter._get_sorted_list_by_created_at(
        [
            CpuSnapshot(used_ratio=used_ratio, max_used_ratio=0.75, created_at=now - timedelta(minutes=minutes_ago))
            for used_ratio, minutes_ago in [(0.6, 3), (0.7, 2), (0.8, 1), (0.9, 0)]
        ]
    )

    # Memory snapshots as triples of (current size in GB, max memory size in GB, age in seconds).
    system_status._snapshotter._memory_snapshots = Snapshotter._get_sorted_list_by_created_at(
        [
            memory_snapshot(current_gb, max_gb, seconds_ago)
            for current_gb, max_gb, seconds_ago in [(4, 12, 90), (7, 8, 60), (28, 30, 30), (48, 60, 0)]
        ]
    )

    # Event loop snapshots as pairs of (delay in milliseconds, age in minutes).
    system_status._snapshotter._event_loop_snapshots = Snapshotter._get_sorted_list_by_created_at(
        [
            EventLoopSnapshot(
                delay=timedelta(milliseconds=delay_ms),
                max_delay=timedelta(milliseconds=500),
                created_at=now - timedelta(minutes=minutes_ago),
            )
            for delay_ms, minutes_ago in [(700, 3), (600, 2), (200, 1), (100, 0)]
        ]
    )

    # Client snapshots as triples of (error count, new error count, age in minutes).
    system_status._snapshotter._client_snapshots = Snapshotter._get_sorted_list_by_created_at(
        [
            ClientSnapshot(
                error_count=error_count,
                new_error_count=new_error_count,
                max_error_count=2,
                created_at=now - timedelta(minutes=minutes_ago),
            )
            for error_count, new_error_count, minutes_ago in [(1, 1, 3), (2, 1, 2), (4, 2, 1), (4, 0, 0)]
        ]
    )

    # Current system info. `created_at` is copied from the result so that it does not affect the comparison.
    current_system_info = system_status.get_current_system_info()
    expected_current = SystemInfo(
        cpu_info=LoadRatioInfo(limit_ratio=system_status._cpu_overload_threshold, actual_ratio=1.0),
        memory_info=LoadRatioInfo(limit_ratio=system_status._memory_overload_threshold, actual_ratio=0.5),
        event_loop_info=LoadRatioInfo(limit_ratio=system_status._event_loop_overload_threshold, actual_ratio=0),
        client_info=LoadRatioInfo(limit_ratio=system_status._client_overload_threshold, actual_ratio=0),
        created_at=current_system_info.created_at,
    )
    assert current_system_info == expected_current
    assert current_system_info.is_system_idle is False

    # Historical system info, compared the same way.
    historical_system_info = system_status.get_historical_system_info()
    expected_historical = SystemInfo(
        cpu_info=LoadRatioInfo(limit_ratio=system_status._cpu_overload_threshold, actual_ratio=0.667),
        memory_info=LoadRatioInfo(limit_ratio=system_status._memory_overload_threshold, actual_ratio=0.667),
        event_loop_info=LoadRatioInfo(limit_ratio=system_status._event_loop_overload_threshold, actual_ratio=0.333),
        client_info=LoadRatioInfo(limit_ratio=system_status._client_overload_threshold, actual_ratio=0),
        created_at=historical_system_info.created_at,
    )
    assert historical_system_info == expected_historical
    assert historical_system_info.is_system_idle is False


@pytest.mark.parametrize(('client_overload_threshold', 'is_overloaded'), [(0.66, True), (0.67, False)])
def test_client_overloaded(
    *, snapshotter: Snapshotter, now: datetime, client_overload_threshold: float, is_overloaded: bool
) -> None:
    """Check the client overload verdict for thresholds just below and just above the expected ratio of 2/3."""
    system_status = SystemStatus(
        snapshotter,
        max_snapshot_age=timedelta(minutes=1),
        client_overload_threshold=client_overload_threshold,
    )

    # Triples of (error count, new error count, age in minutes); every snapshot uses `max_error_count=0`.
    error_history = [(1, 1, 3), (2, 1, 2), (3, 1, 1), (3, 0, 0)]
    system_status._snapshotter._client_snapshots = Snapshotter._get_sorted_list_by_created_at(
        [
            ClientSnapshot(
                error_count=error_count,
                new_error_count=new_error_count,
                max_error_count=0,
                created_at=now - timedelta(minutes=minutes_ago),
            )
            for error_count, new_error_count, minutes_ago in error_history
        ]
    )

    # Ratio of overloaded snapshots is 2/3 (2 minutes out of 3)
    client_info = system_status._is_client_overloaded()
    assert client_info.is_overloaded == is_overloaded


def test_memory_overloaded_system_wide(snapshotter: Snapshotter, now: datetime) -> None:
    """Test that memory is reported as overloaded based on system-wide usage while process memory stays low."""
    system_status = SystemStatus(
        snapshotter,
        max_snapshot_age=timedelta(minutes=1),
        memory_overload_threshold=0.5,
    )

    def low_process_memory_snapshot(system_wide_used_gb: float, created_at: datetime) -> MemorySnapshot:
        """Build a snapshot with 1 GB used out of 8 GB by the process and 32 GB of system-wide memory."""
        return MemorySnapshot(
            current_size=ByteSize.from_gb(1),
            max_memory_size=ByteSize.from_gb(8),
            max_used_memory_ratio=0.8,
            created_at=created_at,
            system_wide_used_size=ByteSize.from_gb(system_wide_used_gb),
            system_wide_memory_size=ByteSize.from_gb(32),
        )

    # NOTE(review): the system-wide threshold is presumably 97%; its value is not defined in this file.
    system_status._snapshotter._memory_snapshots = Snapshotter._get_sorted_list_by_created_at(
        [
            # 31/32 = 96.875% of the system-wide memory is used.
            low_process_memory_snapshot(31, now - timedelta(minutes=1)),
            # 31.5/32 = 98.4% of the system-wide memory is used.
            low_process_memory_snapshot(31.5, now),
        ]
    )

    memory_info = system_status._is_memory_overloaded()

    # Overload is expected because of the system-wide usage, not because of the process memory.
    assert memory_info.is_overloaded is True
    # The whole period between the two snapshots is expected to count as overloaded.
    assert memory_info.actual_ratio == 1.0
    assert memory_info.limit_ratio == 0.5


================================================
FILE: tests/unit/_statistics/test_error_tracker.py
================================================
import traceback

import pytest

from crawlee.statistics._error_tracker import ErrorTracker


# Each case pairs an `ErrorTracker` configuration with the number of unique errors expected for that configuration.
@pytest.mark.parametrize(
    ('error_tracker', 'expected_unique_errors'),
    [
        (ErrorTracker(), 5),
        (ErrorTracker(show_file_and_line_number=False), 4),
        (ErrorTracker(show_error_name=False), 4),
        (ErrorTracker(show_error_message=False), 3),
        (ErrorTracker(show_error_name=False, show_file_and_line_number=False), 3),
        (ErrorTracker(show_file_and_line_number=False, show_error_message=False), 2),
        (ErrorTracker(show_error_name=False, show_file_and_line_number=False, show_error_message=False), 1),
    ],
)
async def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_errors: int) -> None:
    """Use different settings of `error_tracker` and test unique errors count.

    Six exceptions are added to the tracker. Five of them are raised from one and the same line inside the loop,
    the sixth one is raised from a separate line. The total count must be 6 for every configuration, while the
    unique error count must match `expected_unique_errors`.
    """

    # All errors of this loop share a single `raise` line, so they differ only in their type and message.
    for error in [
        Exception('Some value error abc'),
        ValueError('Some value error abc'),  # Different type, different error
        ValueError('Some value error cde'),  # Same type and similar message to previous, considered the same.
        ValueError(
            'Another value error efg'
        ),  # Same type, but too different message to previous, considered different.
        ValueError(),  # Same type but don't have message, considered different.
    ]:
        try:
            raise error  # Errors raised on same line
        except Exception as e:  # noqa:PERF203
            await error_tracker.add(e)

    # This `raise` deliberately sits on its own line, apart from the `raise` inside the loop above.
    try:
        raise ValueError('Some value error abc')  # Same as one previous error, but different line.
    except Exception as e:
        await error_tracker.add(e)

    # Every added error is counted in the total, no matter how the errors are grouped.
    assert error_tracker.total == 6
    assert error_tracker.unique_error_count == expected_unique_errors


@pytest.mark.parametrize(
    ('message_1', 'message_2', 'expected_generic_message'),
    [
        ('Some error number 123', 'Some error number 456', 'Some error number ***'),
        ('Some error number 123 456', 'Some error number 123 456 789', 'Some error number 123 456 ***'),
        ('Some error number 0 0 0', 'Some error number 1 0 1', 'Some error number *** 0 ***'),
    ],
)
async def test_error_tracker_similar_messages_full_stack(
    message_1: str, message_2: str, expected_generic_message: str
) -> None:
    """Check that similar messages of one error type merge into a single group named with `***` wildcards."""
    tracker = ErrorTracker()
    raised_errors = [
        KeyError(message_1),
        KeyError(message_1),
        KeyError(message_1),
        ValueError(message_1),
        ValueError(message_2),
        RuntimeError(message_2),
    ]

    for raised_error in raised_errors:
        try:
            raise raised_error  # One raise site shared by every error.
        except Exception as caught:  # noqa:PERF203
            await tracker.add(caught)
            # Remember the line of the raise statement above; it is part of the expected group names.
            line = traceback.extract_tb(caught.__traceback__)[0].lineno

    file_name = __file__.rsplit('/', 1)[-1]
    expected_groups = [
        (f'{file_name}:{line}:KeyError:{message_1}', 3),
        (f'{file_name}:{line}:ValueError:{expected_generic_message}', 2),
        (f'{file_name}:{line}:RuntimeError:{message_2}', 1),
    ]

    most_common = tracker.get_most_common_errors()
    for position, (expected_name, expected_count) in enumerate(expected_groups):
        assert most_common[position][0] == expected_name
        assert most_common[position][1] == expected_count


@pytest.mark.parametrize(
    ('show_full_message', 'expected_message'),
    [
        (True, 'Error line 1\n Error line 2'),
        (False, 'Error line 1'),
    ],
)
async def test_show_full_message(*, show_full_message: bool, expected_message: str) -> None:
    """Check the reported name of a two-line error message for both values of `show_full_message`."""
    tracker = ErrorTracker(
        show_error_name=False,
        show_file_and_line_number=False,
        show_full_message=show_full_message,
    )

    try:
        raise RuntimeError('Error line 1\n Error line 2')
    except Exception as caught:
        await tracker.add(caught)

    top_group = tracker.get_most_common_errors()[0]
    assert top_group[0] == expected_message


async def test_error_tracker_with_errors_chain() -> None:
    """Check that a message-less error chained from another error is reported under the cause's message."""
    tracker = ErrorTracker(show_error_name=False, show_file_and_line_number=False, show_full_message=True)

    try:
        raise ZeroDivisionError('Zero division error')
    except Exception as root_cause:
        try:
            # The `ValueError` itself carries no message; only its cause does.
            raise ValueError from root_cause
        except Exception as chained:
            await tracker.add(chained)

    top_group = tracker.get_most_common_errors()[0]
    assert top_group[0] == 'Zero division error'


================================================
FILE: tests/unit/_statistics/test_periodic_logging.py
================================================
from __future__ import annotations

import asyncio
import logging
from datetime import timedelta
from typing import TYPE_CHECKING

from crawlee.statistics import Statistics

if TYPE_CHECKING:
    import pytest


async def test_periodic_logging(caplog: pytest.LogCaptureFixture) -> None:
    """Check that an active `Statistics` emits at least one log record starting with the configured message."""
    caplog.set_level(logging.INFO)

    prefix = 'Periodic statistics XYZ'
    interval = timedelta(milliseconds=50)

    # Stay inside the context for twice the logging interval.
    async with Statistics.with_default_state(log_interval=interval, log_message=prefix):
        await asyncio.sleep(0.1)

    hits = sum(1 for record in caplog.records if record.message.startswith(prefix))
    assert hits >= 1


================================================
FILE: tests/unit/_statistics/test_persistence.py
================================================
from __future__ import annotations

from crawlee.statistics import Statistics


async def test_basic_persistence() -> None:
    """Check that a counter changed by one `Statistics` instance is seen by a later one with the same key."""
    persist_key = 'statistics_foo'

    # First instance: change a counter while its context is active.
    async with Statistics.with_default_state(persistence_enabled=True, persist_state_key=persist_key) as writer:
        writer.state.requests_failed = 42

    # Second instance with the same key: it only enters and leaves its context, nothing is modified.
    async with Statistics.with_default_state(persistence_enabled=True, persist_state_key=persist_key) as reader:
        pass

    assert reader.state.requests_failed == 42


================================================
FILE: tests/unit/_statistics/test_request_max_duration.py
================================================
from __future__ import annotations

import asyncio

from crawlee.statistics import Statistics


async def test_request_max_duration_tracks_maximum() -> None:
    """Check that `request_max_duration` follows the longest request seen so far and never shrinks."""

    # `asyncio.sleep()` may return marginally early, see https://bugs.python.org/issue31539#msg302699
    tolerance = 0.015
    pause = 0.05
    shortest_acceptable = pause - tolerance

    async with Statistics.with_default_state() as stats:
        # 1) A request that finishes right after it starts.
        stats.record_request_processing_start('request_1')
        stats.record_request_processing_finish('request_1')
        max_after_short = stats.state.request_max_duration

        # 2) A request that spans a 50 ms pause.
        stats.record_request_processing_start('request_2')
        await asyncio.sleep(pause)
        stats.record_request_processing_finish('request_2')
        max_after_long = stats.state.request_max_duration

        # The maximum must have moved up to the duration of the longer request.
        assert max_after_long is not None
        assert max_after_short is not None
        assert max_after_long >= max_after_short
        assert max_after_long.total_seconds() >= shortest_acceptable

        # 3) One more immediate request; it must not pull the maximum down.
        stats.record_request_processing_start('request_3')
        stats.record_request_processing_finish('request_3')
        max_after_second_short = stats.state.request_max_duration

        assert max_after_second_short == max_after_long


================================================
FILE: tests/unit/_statistics/test_request_processing_record.py
================================================
from datetime import timedelta

from crawlee.statistics._statistics import RequestProcessingRecord


def test_tracking_time_resolution() -> None:
    """Check that `RequestProcessingRecord` measures a non-zero duration for back-to-back `run` and `finish`.

    On Linux this is usually unproblematic. On Windows, some packages in older Python versions may rely on system
    timers with coarse resolution (some sources estimate 15ms), so two successive measurements could end up using
    the same timing sample. Should an unsuitable time source be selected, this test starts failing on Windows.
    """
    no_time = timedelta(seconds=0)

    processing_record = RequestProcessingRecord()
    processing_record.run()
    processing_record.finish()

    assert processing_record.duration
    assert processing_record.duration > no_time


================================================
FILE: tests/unit/_utils/test_byte_size.py
================================================
from __future__ import annotations

import pytest

from crawlee._utils.byte_size import ByteSize


def test_initializations() -> None:
    """Check the constructor, the `from_*` factories and the rejection of negative sizes."""
    assert ByteSize(1024).bytes == 1024

    # Going from one unit factory to the next multiplies the size by 1024.
    unit_factories = [ByteSize.from_kb, ByteSize.from_mb, ByteSize.from_gb, ByteSize.from_tb]
    for exponent, factory in enumerate(unit_factories, start=1):
        assert factory(1).bytes == 1024**exponent

    with pytest.raises(ValueError, match=r'ByteSize cannot be negative'):
        ByteSize(-1)


def test_conversions() -> None:
    """Check the `to_*` conversions on a size of two megabytes."""
    two_megabytes = ByteSize.from_mb(2)

    expected_kb = 2 * 1024
    expected_gb = 2 / 1024
    expected_tb = 2 / (1024**2)

    assert two_megabytes.to_kb() == expected_kb
    assert two_megabytes.to_mb() == 2.0
    assert two_megabytes.to_gb() == expected_gb
    assert two_megabytes.to_tb() == expected_tb


def test_string_representation() -> None:
    """Check `str()` of sizes in each unit from bytes up to terabytes."""
    cases = [
        (512, '512 B'),
        (2 * 1024, '2.00 KB'),
        (3 * 1024**2, '3.00 MB'),
        (4 * 1024**3, '4.00 GB'),
        (5 * 1024**4, '5.00 TB'),
    ]

    for byte_count, rendered in cases:
        assert str(ByteSize(byte_count)) == rendered


def test_comparisons() -> None:
    """Check ordering and (in)equality operators between `ByteSize` instances."""
    larger = ByteSize(1024)
    smaller = ByteSize(512)

    # Ordering, viewed from both operands.
    assert larger > smaller
    assert larger >= smaller
    assert smaller < larger
    assert smaller <= larger

    # Equality is based on the value, not on identity.
    assert larger == ByteSize(1024)
    assert larger != smaller


def test_additions() -> None:
    """Check that `ByteSize` adds to another `ByteSize` and refuses plain numbers."""
    one_kb = ByteSize(1024)
    two_kb = ByteSize(2048)

    total = one_kb + two_kb
    assert total.bytes == 3072

    # An int on the right-hand side is rejected.
    with pytest.raises(TypeError):
        _ = one_kb + 1024

    # So is a float.
    with pytest.raises(TypeError):
        _ = two_kb + 123.45


def test_subtractions() -> None:
    """Check `ByteSize` subtraction, including the negative-result and wrong-operand-type errors."""
    two_kb = ByteSize(2048)
    one_kb = ByteSize(1024)

    difference = two_kb - one_kb
    assert difference.bytes == 1024

    # Subtracting the larger size from the smaller one would go below zero.
    with pytest.raises(ValueError, match=r'Resulting ByteSize cannot be negative'):
        _ = one_kb - two_kb

    # An int on the right-hand side is rejected.
    with pytest.raises(TypeError):
        _ = two_kb - 1024

    # So is a float.
    with pytest.raises(TypeError):
        _ = one_kb - 123.45


def test_multiplication() -> None:
    """Check multiplying a `ByteSize` by an int, by a float, and with the number on the left-hand side."""
    kilobyte = 1024

    # ByteSize * int
    assert (ByteSize(kilobyte) * 2).bytes == 2048

    # ByteSize * float
    assert (ByteSize(kilobyte) * 1.5).bytes == 1536

    # int * ByteSize (reflected operand order)
    assert (3 * ByteSize(kilobyte)).bytes == 3072


def test_divisions() -> None:
    """Check dividing one `ByteSize` by another, division by a zero size, and halving via multiplication."""
    dividend = ByteSize(2048)
    divisor = ByteSize(1024)

    ratio = dividend / divisor
    assert ratio == 2

    # A zero-byte divisor must raise.
    with pytest.raises(ZeroDivisionError):
        _ = dividend / ByteSize(0)

    # "Dividing" by a number is expressed as multiplying by a fraction.
    halved = dividend * 0.5
    assert halved.bytes == 1024


================================================
FILE: tests/unit/_utils/test_console.py
================================================
from __future__ import annotations

from crawlee._utils.console import make_table


def test_empty_input() -> None:
    """Check that no rows at all produce an empty string."""
    rendered = make_table([])
    assert rendered == ''


def test_empty_row() -> None:
    """Check that a single row without any cells produces an empty string."""
    rendered = make_table([()])
    assert rendered == ''


def test_single_column() -> None:
    """Check that a one-cell table renders as three lines with the cell in the middle one."""
    rendered_lines = make_table([('test',)]).split('\n')

    assert len(rendered_lines) == 3
    assert rendered_lines[1] == '│ test │'


def test_two_columns() -> None:
    """Render three rows of two columns and compare the complete table line by line."""
    rows = [('Name', 'Age'), ('Alice', '30'), ('Bob', '25')]
    expected_lines = [
        '┌───────┬─────┐',
        '│ Name  │ Age │',
        '│ Alice │ 30  │',
        '│ Bob   │ 25  │',
        '└───────┴─────┘',
    ]

    assert make_table(rows).split('\n') == expected_lines


def test_long_content_truncation() -> None:
    """Check that a cell too long for a 25-character-wide table is cut and suffixed with `...`."""
    rows = [('Short', 'VeryVeryVeryLongContent')]
    expected_lines = [
        '┌───────────┬───────────┐',
        '│ Short     │ VeryVe... │',
        '└───────────┴───────────┘',
    ]

    assert make_table(rows, width=25).split('\n') == expected_lines


================================================
FILE: tests/unit/_utils/test_crypto.py
================================================
from __future__ import annotations

from crawlee._utils.crypto import compute_short_hash, crypto_random_object_id


def test_crypto_random_object_id_default_length() -> None:
    """Check that an ID generated without arguments is 17 characters long."""
    expected_length = 17
    generated = crypto_random_object_id()
    assert len(generated) == expected_length, 'Default generated object ID should have a length of 17 characters.'


def test_crypto_random_object_id_custom_length() -> None:
    """Check that the requested length is honoured for several different lengths."""
    for length in (5, 10, 20, 100):
        generated = crypto_random_object_id(length)
        assert len(generated) == length, f'Generated object ID should have a length of {length} characters.'


def test_crypto_random_object_id_character_set() -> None:
    """Check that a 1000-character ID consists only of ASCII letters and digits."""
    alphabet = frozenset('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')

    for symbol in crypto_random_object_id(1000):
        assert symbol in alphabet, f"Character '{symbol}' is not in the expected alphanumeric range."


def test_compute_short_hash_with_known_input() -> None:
    """Check the short hash of a fixed input against its known value."""
    actual_hash = compute_short_hash(b'Hello world!')
    assert actual_hash == 'c0535e4b', 'The hash does not match the expected output.'


def test_compute_short_hash_with_empty_input() -> None:
    """Check the short hash of empty bytes against its known value."""
    actual_hash = compute_short_hash(b'')
    assert actual_hash == 'e3b0c442', 'The hash for an empty input should follow the expected pattern.'


def test_compute_short_hash_output_length() -> None:
    """Check that the short hash is eight characters long."""
    short_hash = compute_short_hash(b'some random data')
    assert len(short_hash) == 8, 'The output hash should be 8 characters long.'


def test_compute_short_hash_differentiates_input() -> None:
    """Check that two different inputs do not share a short hash."""
    first_hash = compute_short_hash(b'input 1')
    second_hash = compute_short_hash(b'input 2')
    assert first_hash != second_hash, 'Different inputs should produce different hashes.'


================================================
FILE: tests/unit/_utils/test_file.py
================================================
from __future__ import annotations

from datetime import datetime, timezone

from crawlee._utils.file import json_dumps


async def test_json_dumps() -> None:
    """Check the serialized output of `json_dumps` for a dict, a list, scalars and a timezone-aware datetime."""
    cases = [
        ({'key': 'value'}, '{\n  "key": "value"\n}'),
        (['one', 2, 3.0], '[\n  "one",\n  2,\n  3.0\n]'),
        ('string', '"string"'),
        (123, '123'),
        (datetime(2022, 1, 1, tzinfo=timezone.utc), '"2022-01-01 00:00:00+00:00"'),
    ]

    for value, expected_json in cases:
        assert await json_dumps(value) == expected_json


================================================
FILE: tests/unit/_utils/test_globs.py
================================================
from __future__ import annotations

from crawlee._utils.globs import Glob


def test_asterisk() -> None:
    """Check that `*` matches within a single path segment only."""
    pattern = Glob('foo/*').regexp

    assert pattern.match('bar/') is None
    assert pattern.match('foo/bar') is not None
    # A further `/` takes the path out of reach of a single asterisk.
    assert pattern.match('foo/bar/baz') is None


def test_double_asteritsk() -> None:
    """Check that `**` also matches across nested path segments."""
    pattern = Glob('foo/**').regexp

    assert pattern.match('bar/') is None
    assert pattern.match('foo/bar') is not None
    # Unlike a single asterisk, the double one reaches into deeper segments.
    assert pattern.match('foo/bar/baz') is not None


================================================
FILE: tests/unit/_utils/test_html_to_text.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest
from bs4 import BeautifulSoup
from parsel import Selector

from crawlee.crawlers._beautifulsoup._utils import html_to_text as html_to_text_beautifulsoup
from crawlee.crawlers._parsel._utils import html_to_text as html_to_text_parsel

if TYPE_CHECKING:
    from collections.abc import Callable

# Plain text that the tests in this module expect the `html_to_text` implementations to produce for
# `_EXAMPLE_HTML`. Table cells are separated by tabs and the `<pre>` block keeps its original spacing.
_EXPECTED_TEXT = (
    "Let's start with a simple text. \n"
    "The ships hung in the sky, much the way that bricks don't. \n"
    "These aren't the Droids you're looking for\n"
    "I'm sorry, Dave. I'm afraid I can't do that.\n"
    "I'm sorry, Dave. I'm afraid I can't do that.\n"
    'A1\tA2\tA3\t\n'
    'B1\tB2\tB3\tB 4\t\n'
    'This is some text with inline elements and HTML entities (>bla<) \n'
    'Test\n'
    'a\n'
    'few\n'
    'line\n'
    'breaks\n'
    'Spaces in an inline text should be completely ignored. \n'
    'But,\n'
    '    a pre-formatted\n'
    '                block  should  be  kept\n'
    '                                       pre-formatted.\n'
    'The Greatest Science Fiction Quotes Of All Time \n'
    "Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You "
    'Nexus, huh? I design your eyes.'
)

# HTML document used as conversion input by the tests in this module; `_EXPECTED_TEXT` is its expected plain-text
# form. It mixes content that is expected in the output (paragraphs, list items, table cells, inline elements,
# entities, `<br>` line breaks, a `<pre>` block) with content that is not (title, comments, `<img>`, `<svg>`,
# `<script>`, `<style>`, `<canvas>`). The whitespace inside the literal is significant for the tests.
_EXAMPLE_HTML = """
<html>
<head>
    <title>Title SHOULD NOT be converted</title>

    <!-- Comments SHOULD NOT be converted -->
</head>
<body with='some attributes'>
Let's start with a        simple text.
<p>
    The ships hung in the sky, much the <a class="click" href="https://example.com/a/b/first">way that</a> bricks don't.
</p>
<ul>
    <li>These aren't the Droids you're looking for</li>
    <li some="attribute"><a href="https://example.com/a/second">I'm sorry, Dave. I'm afraid I can't do that.</a></li>
    <li><a class="click" href="https://example.com/a/b/third">I'm sorry, Dave. I'm afraid I can't do that.</a></li>
</ul>

<img src="something" alt="This should be ignored" />

<!-- Comments SHOULD NOT be converted -->

<table>
    <tr class="something">
        <td>A1</td>
        <td attributes="are ignored">A2</td>
        <td>A3</td>
    </tr>
    <tr class="something">
        <td>B1</td>
        <td attributes="are ignored" even="second attribute">B2</td>
        <td>B3</td>
        <td>B     4</td>
    </tr>
</table>

<p>
    This is <b>some<i> text <b>with</b></i></b> inline <span>elements</span> and HTML&nbsp;entities (&gt;bla&lt;)
</p>

<div>
    Test<br>
    a<br />
    few<br>
    line<br>
    breaks<br>
</div>


    Spaces


    in


    an inline text                                should be


    completely ignored.


<pre>
But,
    a pre-formatted
                block  should  be  kept
                                       pre-formatted.
</pre>

<svg>
    These special elements SHOULD NOT BE CONVERTED.
</svg>

<script>
    // These special elements should be completely skipped.
    skipThis();
</script>

<style>
    /* These special elements should be completely skipped. */
    .skip_this {}
</style>

<canvas>
    This should be skipped too.
</canvas>

<a class="click" href="https://another.com/a/fifth">The Greatest Science Fiction Quotes Of All Time</a>
<p>
    Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design,
    just eyes. You Nexus, huh? I design your <a class="click" href="http://cool.com/">eyes</a>.
</p>
</body>
</html>
"""


# Every (source, expected_text) case below runs once per `html_to_text` implementation.
@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
@pytest.mark.parametrize(
    ('source', 'expected_text'),
    [
        pytest.param(_EXAMPLE_HTML, _EXPECTED_TEXT, id='Complex html'),
        # Plain text: surrounding whitespace is dropped and inner runs are collapsed.
        ('   Plain    text     node    ', 'Plain text node'),
        ('   \nPlain    text     node  \n  ', 'Plain text node'),
        # Headings combined with `<br>` at various positions.
        ('<h1>Header 1</h1> <h2>Header 2</h2>', 'Header 1\nHeader 2'),
        ('<h1>Header 1</h1> <h2>Header 2</h2><br>', 'Header 1\nHeader 2'),
        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br>', 'Header 1\nHeader 2'),
        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br><br>', 'Header 1\nHeader 2'),
        ('<h1>Header 1</h1><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
        ('<h1>Header 1</h1> <br> <h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
        ('<h1>Header 1</h1>  \n <br>\n<h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
        ('<h1>Header 1</h1>  \n <br>\n<br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\nHeader 2'),
        ('<h1>Header 1</h1>  \n <br>\n<br><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\n\nHeader 2'),
        # Block elements, comments and elements whose content is left out.
        ('<div><div>Div</div><p>Paragraph</p></div>', 'Div\nParagraph'),
        ('<div>Div1</div><!-- Some comments --><div>Div2</div>', 'Div1\nDiv2'),
        ('<div>Div1</div><style>Skip styles</style>', 'Div1'),
        ('<script>Skip_scripts();</script><div>Div1</div>', 'Div1'),
        ('<SCRIPT>Skip_scripts();</SCRIPT><div>Div1</div>', 'Div1'),
        ('<svg>Skip svg</svg><div>Div1</div>', 'Div1'),
        ('<canvas>Skip canvas</canvas><div>Div1</div>', 'Div1'),
        # Whitespace in an inline element is collapsed, whitespace in `<pre>` is kept.
        ('<b>A  B  C  D  E\n\nF  G</b>', 'A B C D E F G'),
        ('<pre>A  B  C  D  E\n\nF  G</pre>', 'A  B  C  D  E\n\nF  G'),
        (
            '<h1>Heading 1</h1><div><div><div><div>Deep  Div</div></div></div></div><h2>Heading       2</h2>',
            'Heading 1\nDeep Div\nHeading 2',
        ),
        # Inline elements add no separators; attributes do not show up in the text.
        ('<a>this_word</a>_should_<b></b>be_<span>one</span>', 'this_word_should_be_one'),
        ('<span attributes="should" be="ignored">some <span>text</span></span>', 'some text'),
        # Table: cells are joined by tabs, rows by newlines.
        pytest.param(
            (
                """<table>
    <tr>
        <td>Cell    A1</td><td>Cell A2</td>
        <td>    Cell A3    </td>
    </tr>
    <tr>
        <td>Cell    B1</td><td>Cell B2</td>
    </tr>
</table>"""
            ),
            'Cell A1\tCell A2\tCell A3 \t\nCell B1\tCell B2',
            id='Table',
        ),
        # HTML entities are decoded.
        ('<span>&aacute; &eacute;</span>', 'á é'),
    ],
)
def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[str], str]) -> None:
    """Convert `source` with the given `html_to_text` implementation and compare with `expected_text`."""
    assert html_to_text(source) == expected_text


@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None:
    """Check that both implementations raise `TypeError` when given an int instead of HTML."""
    # The argument type is wrong on purpose, hence the type-checker suppression on the call.
    with pytest.raises(TypeError):
        html_to_text(1)  # ty: ignore[invalid-argument-type]


def test_html_to_text_parsel() -> None:
    """Check the Parsel implementation with an already parsed `Selector` instead of a raw string."""
    parsed = Selector(_EXAMPLE_HTML)
    assert html_to_text_parsel(parsed) == _EXPECTED_TEXT


def test_html_to_text_beautifulsoup() -> None:
    """Check the BeautifulSoup implementation with an already parsed document instead of a raw string."""
    parsed = BeautifulSoup(_EXAMPLE_HTML, features='lxml')
    assert html_to_text_beautifulsoup(parsed) == _EXPECTED_TEXT


================================================
FILE: tests/unit/_utils/test_measure_time.py
================================================
from __future__ import annotations

import asyncio
import time

from crawlee._utils.time import measure_time


def test_measure_time_wall_sync() -> None:
    """Check that a blocking 0.1 s sleep shows up in the measured wall time."""
    pause = 0.1
    minimal_wall = 0.09  # Slightly below the pause itself.

    with measure_time() as timing:
        time.sleep(pause)

    assert timing.cpu is not None
    assert timing.wall is not None
    assert timing.wall >= minimal_wall


def test_measure_time_cpu_sync() -> None:
    """Check that `measure_time` reports a positive CPU time for a CPU-bound synchronous block.

    The block spins in a busy loop for roughly 0.1 seconds. The loop is bounded with the monotonic clock, so an
    adjustment of the system clock can neither cut it short nor stretch it. The accumulator is reduced modulo a
    constant on every step, which keeps the cost and the memory of each iteration constant.
    """
    busy_period = 0.1

    with measure_time() as elapsed:
        deadline = time.monotonic() + busy_period
        acc = 0

        while time.monotonic() < deadline:
            # Arbitrary arithmetic to keep the CPU busy; the modulo prevents the integer from growing.
            acc = (acc * acc + 1) % 1_000_003

    assert elapsed.cpu is not None
    assert elapsed.wall is not None
    # Only verify that CPU time was measured and is positive.
    assert elapsed.cpu > 0


async def test_measure_time_wall_async() -> None:
    """Check that an awaited 0.1 s sleep shows up in the measured wall time."""
    pause = 0.1
    minimal_wall = 0.09  # Slightly below the pause itself.

    with measure_time() as timing:
        await asyncio.sleep(pause)

    assert timing.cpu is not None
    assert timing.wall is not None
    assert timing.wall >= minimal_wall


================================================
FILE: tests/unit/_utils/test_raise_if_too_many_kwargs.py
================================================
from contextlib import nullcontext
from typing import Any

import pytest

from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs


@pytest.mark.parametrize(
    ('kwargs', 'should_raise'),
    [
        ({'alias': 'alias', 'name': None, 'id': None}, False),
        ({'alias': None, 'name': 'name', 'id': None}, False),
        ({'alias': None, 'name': None, 'id': 'id'}, False),
        ({'alias': 'alias', 'name': 'name', 'id': None}, True),
        ({'alias': 'alias', 'name': None, 'id': 'id'}, True),
        ({'alias': None, 'name': 'name', 'id': 'id'}, True),
        ({'alias': 'alias', 'name': 'name', 'id': 'id'}, True),
        ({'alias': None, 'name': None, 'id': None}, False),
    ],
)
def test_limit_kwargs_default(kwargs: dict[str, Any], *, should_raise: bool) -> None:
    """Check the default limit: the call raises as soon as more than one keyword has a non-None value."""
    expectation = (
        pytest.raises(ValueError, match=r'^Only one of .*')
        if should_raise
        else nullcontext()  # Nothing is expected to be raised.
    )

    with expectation:
        raise_if_too_many_kwargs(**kwargs)


@pytest.mark.parametrize(
    ('kwargs', 'should_raise'),
    [
        ({'alias': 'alias', 'name': 'name', 'id': 'id'}, True),
        ({'alias': 'alias', 'name': 'name', 'id': None}, False),
    ],
)
def test_limit_kwargs(kwargs: dict[str, Any], *, should_raise: bool) -> None:
    """Check an explicit limit of two: three non-None keywords raise, two do not."""
    expectation = (
        pytest.raises(ValueError, match=r'^Only one of .*')
        if should_raise
        else nullcontext()  # Nothing is expected to be raised.
    )

    with expectation:
        raise_if_too_many_kwargs(max_kwargs=2, **kwargs)


================================================
FILE: tests/unit/_utils/test_recurring_task.py
================================================
from __future__ import annotations

import asyncio
from datetime import timedelta
from unittest.mock import AsyncMock

import pytest

from crawlee._utils.recurring_task import RecurringTask


@pytest.fixture
def function() -> AsyncMock:
    """Provide an `AsyncMock` that stands in for the coroutine function run by `RecurringTask`."""
    mocked = AsyncMock()
    # The mock gets an explicit `__name__` to avoid issues with the function name in `RecurringTask`.
    mocked.__name__ = 'mocked_function'
    return mocked


@pytest.fixture
def delay() -> timedelta:
    """Provide the 30 ms delay passed to `RecurringTask` in these tests."""
    return timedelta(microseconds=30_000)


async def test_init(function: AsyncMock, delay: timedelta) -> None:
    """Check that a new `RecurringTask` stores its arguments and has no task yet."""
    recurring = RecurringTask(function, delay)

    assert recurring.func == function
    assert recurring.delay == delay
    assert recurring.task is None


async def test_start_and_stop(function: AsyncMock, delay: timedelta) -> None:
    """Check that `start` creates a pending asyncio task and that the task is done after `stop`."""
    recurring = RecurringTask(function, delay)
    recurring.start()

    # A zero-length sleep hands control to the event loop so that the task gets a chance to start.
    await asyncio.sleep(0)

    assert isinstance(recurring.task, asyncio.Task)
    assert not recurring.task.done()

    await recurring.stop()

    assert recurring.task.done()


@pytest.mark.run_alone
async def test_execution(function: AsyncMock, delay: timedelta) -> None:
    """Check that the wrapped function is called at least three times within 0.2 s of running."""
    recurring = RecurringTask(function, delay)

    recurring.start()
    # Long enough for several executions with the 30 ms delay.
    await asyncio.sleep(0.2)
    await recurring.stop()

    # The isinstance check tells the type checker that `func` is the mock.
    assert isinstance(recurring.func, AsyncMock)
    assert recurring.func.call_count >= 3

    await recurring.stop()


================================================
FILE: tests/unit/_utils/test_requests.py
================================================
from __future__ import annotations

import pytest

from crawlee._types import HttpHeaders
from crawlee._utils.requests import compute_unique_key, normalize_url


@pytest.mark.parametrize(
    ('url', 'expected_output', 'keep_url_fragment'),
    [
        pytest.param(
            'https://example.com/?utm_source=test&utm_medium=test&key=value',
            'https://example.com/?key=value',
            False,
            id='remove_utm_params',
        ),
        pytest.param(
            'http://example.com/?key=value&another_key=another_value',
            'http://example.com/?another_key=another_value&key=value',
            False,
            id='retain_sort_non_utm_params',
        ),
        pytest.param(
            'HTTPS://EXAMPLE.COM/?KEY=VALUE',
            'https://example.com/?key=value',
            False,
            id='convert_scheme_netloc_to_lowercase',
        ),
        pytest.param('', '', False, id='handle_empty_url'),
        pytest.param('http://example.com/#fragment', 'http://example.com/#fragment', True, id='retain_fragment'),
        pytest.param('http://example.com/#fragment', 'http://example.com', False, id='remove_fragment'),
        pytest.param('  https://example.com/  ', 'https://example.com', False, id='trim_whitespace'),
        pytest.param('http://example.com/?b=2&a=1', 'http://example.com/?a=1&b=2', False, id='sort_query_params'),
    ],
)
def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: bool) -> None:
    """Normalize `url` with the given fragment setting and compare with the expected normalized form."""
    normalized = normalize_url(url, keep_url_fragment=keep_url_fragment)
    assert normalized == expected_output


def test_compute_unique_key_basic() -> None:
    """Check that, without further options, the unique key equals the URL for both GET and POST."""
    url = 'https://crawlee.dev'

    key_for_get = compute_unique_key(url, method='GET')
    key_for_post = compute_unique_key(url, method='POST')

    assert url == key_for_get
    assert key_for_get == key_for_post


def test_compute_unique_key_handles_fragments() -> None:
    """Check that the URL fragment stays in the unique key only when `keep_url_fragment` is set."""
    url = 'https://crawlee.dev/#fragment'

    key_keeping_fragment = compute_unique_key(url, keep_url_fragment=True)
    key_dropping_fragment = compute_unique_key(url, 'GET', keep_url_fragment=False)

    assert key_keeping_fragment == url
    assert key_dropping_fragment == 'https://crawlee.dev'


def test_compute_unique_key_handles_payload() -> None:
    """Check how the payload enters the unique key with and without the extended key format."""
    url = 'https://crawlee.dev'
    body = b'{"key": "value"}'

    # Plain key: the payload has no influence.
    plain_key = compute_unique_key(url, method='POST', payload=body, use_extended_unique_key=False)
    assert plain_key == url

    # Extended key without any payload.
    extended_key_no_payload = compute_unique_key(url, method='POST', payload=None, use_extended_unique_key=True)
    assert extended_key_no_payload == 'POST|e3b0c442|e3b0c442|https://crawlee.dev'

    # Extended key with a bytes payload.
    extended_key_with_payload = compute_unique_key(url, method='POST', payload=body, use_extended_unique_key=True)
    assert extended_key_with_payload == 'POST|e3b0c442|9724c1e2|https://crawlee.dev'


def test_compute_unique_key_handles_headers() -> None:
    """Check how headers enter the unique key, and that `Accept-Encoding` does not change it."""
    url = 'https://crawlee.dev'
    expected_extended_key = 'GET|4e1a2cf6|e3b0c442|https://crawlee.dev'
    base_headers = HttpHeaders({'Accept': '*/*', 'Content-Type': 'application/json'})

    # Plain key: headers have no influence.
    plain_key = compute_unique_key(url, headers=base_headers, use_extended_unique_key=False)
    assert plain_key == url

    extended_key = compute_unique_key(url, headers=base_headers, use_extended_unique_key=True)
    assert extended_key == expected_extended_key

    # Adding `Accept-Encoding` must leave the extended key as it was.
    headers_with_encoding = HttpHeaders(
        {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Content-Type': 'application/json'}
    )
    extended_key = compute_unique_key(url, headers=headers_with_encoding, use_extended_unique_key=True)
    assert extended_key == expected_extended_key


def test_compute_unique_key_complex() -> None:
    """Check the unique key when method, headers, payload and session ID are all given at once."""
    url = 'https://crawlee.dev'
    request_headers = HttpHeaders({'Accept': '*/*', 'Content-Type': 'application/json'})
    body = b'{"key": "value"}'

    # Plain key: still only the URL.
    plain_key = compute_unique_key(
        url,
        method='POST',
        headers=request_headers,
        payload=body,
        session_id='test_session',
        use_extended_unique_key=False,
    )
    assert plain_key == url

    # Extended key: method, headers hash, payload hash, session ID and URL joined by `|`.
    extended_key = compute_unique_key(
        url,
        method='POST',
        headers=request_headers,
        payload=body,
        session_id='test_session',
        use_extended_unique_key=True,
    )
    assert extended_key == 'POST|4e1a2cf6|9724c1e2|test_session|https://crawlee.dev'


def test_compute_unique_key_post_with_none_payload() -> None:
    """Check the extended unique key of a POST request whose payload is `None`."""
    url = 'https://crawlee.dev'
    extended_key = compute_unique_key(url, 'POST', payload=None, use_extended_unique_key=True)
    assert extended_key == 'POST|e3b0c442|e3b0c442|https://crawlee.dev'


def test_compute_unique_key_with_whitespace_in_headers() -> None:
    """Check that whitespace around a header value does not change the extended unique key."""
    url = 'https://crawlee.dev'
    expected_key = 'GET|60d83e70|e3b0c442|https://crawlee.dev'

    clean_headers = HttpHeaders({'Content-Type': 'application/json'})
    padded_headers = HttpHeaders({'Content-Type': ' application/json '})

    key_from_clean = compute_unique_key(url, headers=clean_headers, use_extended_unique_key=True)
    assert key_from_clean == expected_key

    key_from_padded = compute_unique_key(url, headers=padded_headers, use_extended_unique_key=True)
    assert key_from_padded == expected_key


================================================
FILE: tests/unit/_utils/test_robots.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee._utils.robots import RobotsTxtFile

if TYPE_CHECKING:
    from yarl import URL

    from crawlee.http_clients._base import HttpClient


async def test_generation_robots_txt_url(server_url: URL, http_client: HttpClient) -> None:
    """`RobotsTxtFile.find` called with the server URL returns a robots file listing at least one sitemap."""
    found_robots = await RobotsTxtFile.find(str(server_url), http_client)
    listed_sitemaps = found_robots.get_sitemaps()
    assert len(listed_sitemaps) > 0


async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClient) -> None:
    """Check which URLs the test server's robots.txt allows and which it denies."""
    robots_file = await RobotsTxtFile.find(str(server_url), http_client)

    permitted_urls = [
        'https://crawlee.dev',
        str(server_url / 'something/page.html'),
        str(server_url / 'deny_googlebot/page.html'),
    ]
    for permitted_url in permitted_urls:
        assert robots_file.is_allowed(permitted_url)

    denied_url = str(server_url / 'deny_all/page.html')
    assert not robots_file.is_allowed(denied_url)


async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) -> None:
    """The robots file fetched from the test server exposes exactly the two expected sitemap URLs."""
    robots_file = await RobotsTxtFile.find(str(server_url), http_client)
    expected_sitemaps = {'http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml'}

    assert len(robots_file.get_sitemaps()) == 2
    assert set(robots_file.get_sitemaps()) == expected_sitemaps


async def test_parse_from_content() -> None:
    """Build a robots file from an in-memory string and check the per-user-agent allow/deny decisions."""
    robots_txt = """User-agent: *
        Disallow: *deny_all/
        crawl-delay: 10
        User-agent: Googlebot
        Disallow: *deny_googlebot/"""
    robots_file = await RobotsTxtFile.from_content('http://not-exists.com/robots.txt', robots_txt)

    # Without an explicit user agent only the `deny_all` rule applies.
    assert robots_file.is_allowed('http://not-exists.com/something/page.html')
    assert robots_file.is_allowed('http://not-exists.com/deny_googlebot/page.html')

    # The Googlebot-specific rule is applied only when that user agent is passed.
    assert not robots_file.is_allowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot')

    assert not robots_file.is_allowed('http://not-exists.com/deny_all/page.html')


async def test_bind_robots_txt_url() -> None:
    """A disallow-all robots file for one host denies that host's URLs but allows a URL on another host."""
    deny_everything = 'User-agent: *\nDisallow: /'
    robots_file = await RobotsTxtFile.from_content('http://check.com/robots.txt', deny_everything)

    same_host_allowed = robots_file.is_allowed('http://check.com/test.html')
    assert not same_host_allowed

    other_host_allowed = robots_file.is_allowed('http://othercheck.com/robots.txt')
    assert other_host_allowed


================================================
FILE: tests/unit/_utils/test_shared_timeout.py
================================================
import asyncio
from datetime import timedelta

import pytest

from crawlee._utils.time import SharedTimeout, measure_time


async def test_shared_timeout_tracks_elapsed_time() -> None:
    """Time spent inside one `SharedTimeout` block reduces the time reported by the next block."""
    budget = SharedTimeout(timedelta(seconds=1))

    # Spend part of the budget.
    async with budget:
        await asyncio.sleep(0.2)

    # Entering again reports what is left of the budget.
    async with budget as time_left:
        upper_bound = timedelta(seconds=0.85)
        lower_bound = timedelta(seconds=0)
        assert time_left < upper_bound
        assert time_left > lower_bound


async def test_shared_timeout_expires() -> None:
    """A block that outlives the shared budget raises `asyncio.TimeoutError` well before its sleep would end."""
    budget = SharedTimeout(timedelta(seconds=0.1))

    with measure_time() as timing, pytest.raises(asyncio.TimeoutError):
        async with budget:
            # Sleeps longer than the 0.1 s budget, so the timeout has to fire first.
            await asyncio.sleep(0.5)

    wall_seconds = timing.wall
    assert wall_seconds is not None
    assert wall_seconds < 0.3


async def test_shared_timeout_cannot_be_nested() -> None:
    """Re-entering a `SharedTimeout` that is already entered raises `RuntimeError`."""
    budget = SharedTimeout(timedelta(seconds=1))

    async with budget:
        # The second, nested entry is the one expected to fail.
        with pytest.raises(RuntimeError, match='cannot be entered twice'):
            async with budget:
                pass


async def test_shared_timeout_multiple_sequential_uses() -> None:
    """Verify that one `SharedTimeout` instance can be entered several times one after another."""
    budget = SharedTimeout(timedelta(seconds=1))
    rounds = 5

    uses_done = 0
    while uses_done < rounds:
        async with budget:
            await asyncio.sleep(0.05)
        uses_done += 1

    # Five sleeps of 0.05 s each should have used up about 0.25 s of the budget.
    async with budget as time_left:
        assert time_left < timedelta(seconds=0.8)
        assert time_left > timedelta(seconds=0)


================================================
FILE: tests/unit/_utils/test_sitemap.py
================================================
import base64
import gzip
from datetime import datetime
from typing import Any
from unittest.mock import AsyncMock, MagicMock

from yarl import URL

from crawlee._utils.sitemap import Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap
from crawlee.http_clients._base import HttpClient, HttpResponse

# XML sitemap fixture with five <url> entries. The entries differ in which optional
# children (<lastmod>, <changefreq>, <priority>) they carry, and the query strings
# in <loc> use the XML-escaped `&amp;` separator.
BASIC_SITEMAP = """
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://not-exists.com/</loc>
<lastmod>2005-02-03</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>http://not-exists.com/catalog?item=12&amp;desc=vacation_hawaii</loc>
<changefreq>weekly</changefreq>
</url>
<url>
<loc>http://not-exists.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>
<lastmod>2004-12-23</lastmod>
<changefreq>weekly</changefreq>
</url>
<url>
<loc>http://not-exists.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>
<lastmod>2004-12-23T18:00:15+00:00</lastmod>
<priority>0.3</priority>
</url>
<url>
<loc>http://not-exists.com/catalog?item=83&amp;desc=vacation_usa</loc>
<lastmod>2004-11-23</lastmod>
</url>
</urlset>
""".strip()

# The five <loc> values of BASIC_SITEMAP with `&amp;` unescaped to `&`; the tests
# compare the set of loaded sitemap URLs against this set.
BASIC_RESULTS = {
    'http://not-exists.com/',
    'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
    'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
    'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
    'http://not-exists.com/catalog?item=83&desc=vacation_usa',
}


def _make_mock_client(url_map: dict[str, tuple[int, bytes]]) -> AsyncMock:
    """Create a mocked `HttpClient` whose `send_request` answers from `url_map`.

    Args:
        url_map: Maps a URL substring to a `(status_code, body)` pair. The first entry (in dict order) whose
            key occurs in the requested URL determines the response; a URL matching no key gets
            status 404 and an empty body.

    Returns:
        An `AsyncMock` specced to `HttpClient` with `send_request` wired to the lookup above.
    """

    async def send_request(url: str, **_kwargs: Any) -> HttpResponse:
        # First matching pattern wins; fall back to "404, empty body".
        status, body = next(
            (canned for pattern, canned in url_map.items() if pattern in url),
            (404, b''),
        )
        mocked_response = MagicMock(spec=HttpResponse)
        mocked_response.status_code = status
        mocked_response.read = AsyncMock(return_value=body)
        return mocked_response

    mocked_client = AsyncMock(spec=HttpClient)
    mocked_client.send_request.side_effect = send_request
    return mocked_client


def compress_gzip(data: str) -> bytes:
    """Return the gzip-compressed form of `data` after encoding it with the default `str.encode()` codec."""
    raw_bytes = data.encode()
    return gzip.compress(raw_bytes)


def encode_base64(data: bytes) -> str:
    """Return the standard base64 encoding of `data` as a text string."""
    encoded_bytes = base64.b64encode(data)
    return str(encoded_bytes, 'utf-8')


async def test_sitemap(server_url: URL, http_client: HttpClient) -> None:
    """Load the basic XML sitemap from the test server and check the extracted URLs."""
    encoded_sitemap = encode_base64(BASIC_SITEMAP.encode())
    endpoint = server_url / 'sitemap.xml'
    sitemap_url = endpoint.with_query(base64=encoded_sitemap, c_type='application/xml; charset=utf-8')

    loaded = await Sitemap.load(str(sitemap_url), http_client=http_client)

    assert len(loaded.urls) == 5
    assert set(loaded.urls) == BASIC_RESULTS


async def test_extract_metadata_sitemap(server_url: URL, http_client: HttpClient) -> None:
    """Parse the basic sitemap item by item and check the metadata of the first item."""
    encoded_sitemap = encode_base64(BASIC_SITEMAP.encode())
    sitemap_url = (server_url / 'sitemap.xml').with_query(
        base64=encoded_sitemap, c_type='application/xml; charset=utf-8'
    )

    parsed_items = []
    async for parsed_item in parse_sitemap([{'type': 'url', 'url': str(sitemap_url)}], http_client=http_client):
        parsed_items.append(parsed_item)  # noqa: PERF401  # Explicit loop keeps the async iteration visible.

    assert len(parsed_items) == 5

    # The first <url> entry of BASIC_SITEMAP carries loc, lastmod, changefreq and priority.
    expected_first = SitemapUrl(
        loc='http://not-exists.com/',
        priority=0.8,
        changefreq='monthly',
        lastmod=datetime.fromisoformat('2005-02-03'),
        origin_sitemap_url=str(sitemap_url),
    )
    assert parsed_items[0] == expected_first


async def test_gzipped_sitemap(server_url: URL, http_client: HttpClient) -> None:
    """Load a gzip-compressed sitemap served as `application/gzip` from a `.xml.gz` URL."""
    compressed_payload = compress_gzip(BASIC_SITEMAP)
    endpoint = server_url / 'sitemap.xml.gz'
    sitemap_url = endpoint.with_query(base64=encode_base64(compressed_payload), c_type='application/gzip')

    loaded = await Sitemap.load(str(sitemap_url), http_client=http_client)

    assert len(loaded.urls) == 5
    assert set(loaded.urls) == BASIC_RESULTS


async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: HttpClient) -> None:
    """Load an invalid (truncated) gzip payload served as `application/gzip` from a `.xml.gz` URL."""
    # Only the first 30 bytes of the compressed stream are kept, so it cannot be decompressed fully.
    truncated_payload = compress_gzip(BASIC_SITEMAP)[:30]
    endpoint = server_url / 'sitemap.xml.gz'
    sitemap_url = endpoint.with_query(base64=encode_base64(truncated_payload), c_type='application/gzip')

    loaded = await Sitemap.load(str(sitemap_url), http_client=http_client)

    assert len(loaded.urls) == 0
    assert loaded.urls == []


async def test_gz_sitemap_with_non_gzipped(server_url: URL, http_client: HttpClient) -> None:
    """Load plain XML that is served as `application/gzip` from a `.xml.gz` URL."""
    uncompressed_payload = BASIC_SITEMAP.encode()
    endpoint = server_url / 'sitemap.xml.gz'
    sitemap_url = endpoint.with_query(base64=encode_base64(uncompressed_payload), c_type='application/gzip')

    loaded = await Sitemap.load(str(sitemap_url), http_client=http_client)

    assert len(loaded.urls) == 5
    assert set(loaded.urls) == BASIC_RESULTS


async def test_gzipped_sitemap_with_bad_type(server_url: URL, http_client: HttpClient) -> None:
    """Load gzip-compressed data from a `.xml.gz` URL that is served with an XML content type."""
    compressed_payload = compress_gzip(BASIC_SITEMAP)
    endpoint = server_url / 'sitemap.xml.gz'
    sitemap_url = endpoint.with_query(
        base64=encode_base64(compressed_payload), c_type='application/xml; charset=utf-8'
    )

    loaded = await Sitemap.load(str(sitemap_url), http_client=http_client)

    assert len(loaded.urls) == 5
    assert set(loaded.urls) == BASIC_RESULTS


async def test_xml_sitemap_with_gzipped_data(server_url: URL, http_client: HttpClient) -> None:
    """Load gzip-compressed data served as `application/gzip` from a plain `.xml` URL."""
    compressed_payload = compress_gzip(BASIC_SITEMAP)
    endpoint = server_url / 'sitemap.xml'
    sitemap_url = endpoint.with_query(base64=encode_base64(compressed_payload), c_type='application/gzip')

    loaded = await Sitemap.load(str(sitemap_url), http_client=http_client)

    assert len(loaded.urls) == 5
    assert set(loaded.urls) == BASIC_RESULTS


async def test_parent_sitemap(server_url: URL, http_client: HttpClient) -> None:
    """Load a sitemap index whose two entries point at a plain and a gzipped copy of the basic sitemap."""
    index_template = """
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>{child_sitemap}</loc>
<lastmod>2004-12-23</lastmod>
</sitemap>
<sitemap>
<loc>{child_sitemap_2}</loc>
<lastmod>2004-12-23</lastmod>
</sitemap>
</sitemapindex>
""".strip()

    # Two children serving the same content: one uncompressed, one gzipped.
    plain_child_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
    gzipped_child_url = (server_url / 'sitemap.xml.gz').with_query(
        base64=encode_base64(compress_gzip(BASIC_SITEMAP))
    )

    index_xml = index_template.format(child_sitemap=plain_child_url, child_sitemap_2=gzipped_child_url)
    index_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(index_xml.encode()))

    loaded = await Sitemap.load(str(index_url), http_client=http_client)

    # Ten URLs in total, but only the five distinct ones from the basic sitemap.
    assert len(loaded.urls) == 10
    assert set(loaded.urls) == BASIC_RESULTS


async def test_non_sitemap_url(server_url: URL, http_client: HttpClient) -> None:
    """Loading the server root, which is not a sitemap, yields no URLs."""
    loaded = await Sitemap.load(str(server_url), http_client=http_client)
    extracted = loaded.urls

    assert len(extracted) == 0
    assert extracted == []


async def test_cdata_sitemap(server_url: URL, http_client: HttpClient) -> None:
    """Load a sitemap whose `<loc>` value is wrapped in a CDATA section."""
    cdata_xml = """
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc><![CDATA[http://not-exists.com/catalog]]></loc>
</url>
</urlset>
    """.strip()
    encoded_sitemap = encode_base64(cdata_xml.encode())
    endpoint = server_url / 'sitemap.xml'
    sitemap_url = endpoint.with_query(base64=encoded_sitemap, c_type='application/xml; charset=utf-8')

    loaded = await Sitemap.load(str(sitemap_url), http_client=http_client)

    assert len(loaded.urls) == 1
    assert loaded.urls == ['http://not-exists.com/catalog']


async def test_txt_sitemap(server_url: URL, http_client: HttpClient) -> None:
    """Load a plain-text sitemap that lists one URL per line."""
    listed_urls = [
        'http://not-exists.com/catalog?item=78&desc=vacation_crete',
        'http://not-exists.com/catalog?item=79&desc=vacation_somalia',
    ]
    text_body = '\n'.join(listed_urls)
    encoded_body = encode_base64(text_body.encode())

    sitemap_url = (server_url / 'sitemap.txt').with_query(base64=encoded_body)
    loaded = await Sitemap.load(str(sitemap_url), http_client=http_client)

    assert len(loaded.urls) == 2
    assert set(loaded.urls) == set(listed_urls)


async def test_sitemap_pretty(server_url: URL, http_client: HttpClient) -> None:
    """Load a sitemap whose element values are surrounded by newlines and indentation."""
    indented_xml = """
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>
    http://not-exists.com/catalog?item=80&amp;desc=vacation_turkey
</loc>
<lastmod>
    2005-02-03
</lastmod>
<changefreq>

    monthly
</changefreq>
<priority>
    0.8
</priority>
</url>
</urlset>
""".strip()
    encoded_sitemap = encode_base64(indented_xml.encode())
    endpoint = server_url / 'sitemap.xml'
    sitemap_url = endpoint.with_query(base64=encoded_sitemap, c_type='application/xml; charset=utf-8')

    loaded = await Sitemap.load(str(sitemap_url), http_client=http_client)

    # The surrounding whitespace must not end up in the extracted URL.
    assert len(loaded.urls) == 1
    assert loaded.urls == ['http://not-exists.com/catalog?item=80&desc=vacation_turkey']


async def test_sitemap_from_string() -> None:
    """Build a `Sitemap` directly from the basic XML string, without any HTTP request."""
    parsed = await Sitemap.from_xml_string(BASIC_SITEMAP)
    extracted = parsed.urls

    assert len(extracted) == 5
    assert set(extracted) == BASIC_RESULTS


async def test_discover_sitemap_from_robots_txt() -> None:
    """A sitemap URL declared in robots.txt is yielded by the discovery."""
    robots_body = b'User-agent: *\nSitemap: http://example.com/custom-sitemap.xml'
    mocked_client = _make_mock_client({'robots.txt': (200, robots_body)})

    discovered = [
        found async for found in discover_valid_sitemaps(['http://example.com/page'], http_client=mocked_client)
    ]

    assert discovered == ['http://example.com/custom-sitemap.xml']


async def test_discover_sitemap_from_common_paths() -> None:
    """Sitemaps served at the common paths are found when robots.txt declares none."""
    ok_empty = (200, b'')
    mocked_client = _make_mock_client(
        {'/sitemap.xml': ok_empty, '/sitemap.txt': ok_empty, '/sitemap_index.xml': ok_empty}
    )

    discovered = [
        found async for found in discover_valid_sitemaps(['http://example.com/page'], http_client=mocked_client)
    ]

    expected_in_order = [
        'http://example.com/sitemap.xml',
        'http://example.com/sitemap.txt',
        'http://example.com/sitemap_index.xml',
    ]
    assert discovered == expected_in_order


async def test_discover_sitemap_from_input_url() -> None:
    """An input URL that already is a sitemap is yielded as-is, without probing the common paths."""
    # `/sitemap.txt` would answer 200, yet it must not show up in the result.
    mocked_client = _make_mock_client({'/sitemap.txt': (200, b'')})
    input_urls = ['http://example.com/sitemap.xml']

    discovered = [found async for found in discover_valid_sitemaps(input_urls, http_client=mocked_client)]

    assert discovered == ['http://example.com/sitemap.xml']


async def test_discover_sitemap_deduplication() -> None:
    """A sitemap URL already found via robots.txt is not yielded a second time by the common-path check."""
    robots_body = b'User-agent: *\nSitemap: http://example.com/sitemap.xml'
    canned_responses = {
        'robots.txt': (200, robots_body),
        '/sitemap.xml': (200, b''),
    }
    mocked_client = _make_mock_client(canned_responses)

    discovered = [
        found async for found in discover_valid_sitemaps(['http://example.com/page'], http_client=mocked_client)
    ]

    assert discovered == ['http://example.com/sitemap.xml']


async def test_discover_sitemaps_multiple_domains() -> None:
    """Sitemaps are discovered for every domain among the input URLs."""
    mocked_client = _make_mock_client(
        {
            'domain-a.com/sitemap.xml': (200, b''),
            'domain-b.com/sitemap.xml': (200, b''),
        }
    )
    input_urls = ['http://domain-a.com/page', 'http://domain-b.com/page']

    discovered = [found async for found in discover_valid_sitemaps(input_urls, http_client=mocked_client)]

    # Only membership is checked; the order across domains is not asserted.
    expected = {
        'http://domain-a.com/sitemap.xml',
        'http://domain-b.com/sitemap.xml',
    }
    assert set(discovered) == expected


async def test_discover_sitemap_url_without_host_skipped() -> None:
    """An input URL without a host produces no discovered sitemaps."""
    mocked_client = _make_mock_client({})
    input_urls = ['not-a-valid-url']

    discovered = [found async for found in discover_valid_sitemaps(input_urls, http_client=mocked_client)]

    assert discovered == []


================================================
FILE: tests/unit/_utils/test_system.py
================================================
from __future__ import annotations

import sys
from multiprocessing import get_context, synchronize
from multiprocessing.shared_memory import SharedMemory
from typing import TYPE_CHECKING

import pytest

from crawlee._utils.byte_size import ByteSize
from crawlee._utils.system import get_cpu_info, get_memory_info

if TYPE_CHECKING:
    from collections.abc import Callable


def test_get_memory_info_returns_valid_values() -> None:
    """Total memory lies strictly between 0 bytes and 1 TB, and current usage is below the total."""
    info = get_memory_info()

    lower_bound = ByteSize(0)
    upper_bound = ByteSize.from_tb(1)
    assert lower_bound < info.total_size < upper_bound

    assert info.current_size < info.total_size


def test_get_cpu_info_returns_valid_values() -> None:
    """The reported CPU usage ratio lies within the closed interval [0, 1]."""
    used_ratio = get_cpu_info().used_ratio
    assert 0 <= used_ratio <= 1


@pytest.mark.skipif(sys.platform != 'linux', reason='Improved estimation available only on Linux')
def test_memory_estimation_does_not_overestimate_due_to_shared_memory() -> None:
    """Test that memory usage estimation is not overestimating memory usage by counting shared memory multiple times.

    In this test, the parent process is started and its memory usage is measured in situations where it is running
    child processes without additional memory, with shared additional memory and with own unshared additional memory.
    Child process without additional memory are used to estimate baseline memory usage of any child process.
    The following estimation is asserted by the test:
    additional_memory_size_estimate_per_shared_memory_child * number_of_sharing_children_processes is approximately
    equal to additional_memory_size_estimate_per_unshared_memory_child where the additional shared memory is exactly
    the same as the unshared memory.
    """

    # All processes below are created from a 'fork' context. The shared boolean Value is written by
    # `parent_process` and read back by the test after that process has been joined.
    ctx = get_context('fork')
    estimated_memory_expectation = ctx.Value('b', False)  # noqa: FBT003  # Common usage pattern for multiprocessing.Value

    def parent_process() -> None:
        # Runs in its own process; it spawns the children and measures its memory via `get_memory_info()`.
        extra_memory_size = 1024 * 1024 * 100  # 100 MB
        children_count = 4
        # Memory calculation is not exact, so allow for some tolerance.
        test_tolerance = 0.3

        def no_extra_memory_child(ready: synchronize.Barrier, measured: synchronize.Barrier) -> None:
            # Baseline child: allocates nothing, only takes part in both barriers.
            ready.wait()
            measured.wait()

        def extra_memory_child(ready: synchronize.Barrier, measured: synchronize.Barrier) -> None:
            # Child that creates and fills its own `extra_memory_size` block, not shared with its siblings.
            memory = SharedMemory(size=extra_memory_size, create=True)
            assert memory.buf is not None
            memory.buf[:] = bytearray([255 for _ in range(extra_memory_size)])
            print(f'Using the memory... {memory.buf[-1]}')
            ready.wait()
            measured.wait()
            memory.close()
            memory.unlink()

        def shared_extra_memory_child(
            ready: synchronize.Barrier, measured: synchronize.Barrier, memory: SharedMemory
        ) -> None:
            # Child that only reads from the block created by the parent and handed in as `memory`.
            assert memory.buf is not None
            print(f'Using the memory... {memory.buf[-1]}')
            ready.wait()
            measured.wait()

        def get_additional_memory_estimation_while_running_processes(
            *, target: Callable, count: int = 1, use_shared_memory: bool = False
        ) -> float:
            # Start `count` children running `target` and return the growth of `current_size` (in MB)
            # between "before starting" and "all children ready", divided by `count`.
            processes = []
            # Both barriers have `count + 1` parties: every child plus this measuring process.
            ready = ctx.Barrier(parties=count + 1)
            measured = ctx.Barrier(parties=count + 1)
            shared_memory: None | SharedMemory = None
            memory_before = get_memory_info().current_size

            if use_shared_memory:
                # One block is created and filled here, then passed to every child.
                shared_memory = SharedMemory(size=extra_memory_size, create=True)
                assert shared_memory.buf is not None
                shared_memory.buf[:] = bytearray([255 for _ in range(extra_memory_size)])
                extra_args = [shared_memory]
            else:
                extra_args = []

            for _ in range(count):
                p = ctx.Process(target=target, args=[ready, measured, *extra_args])
                p.start()
                processes.append(p)

            # Measure only after every child has reached `ready`, i.e. has finished its setup.
            ready.wait()
            memory_during = get_memory_info().current_size
            # Passing `measured` lets the children continue past their second barrier and exit.
            measured.wait()

            for p in processes:
                p.join()

            if shared_memory:
                shared_memory.close()
                shared_memory.unlink()

            return (memory_during - memory_before).to_mb() / count

        # Baseline per-child cost; it is subtracted from both measurements below.
        additional_memory_simple_child = get_additional_memory_estimation_while_running_processes(
            target=no_extra_memory_child, count=children_count
        )
        additional_memory_extra_memory_child = (
            get_additional_memory_estimation_while_running_processes(target=extra_memory_child, count=children_count)
            - additional_memory_simple_child
        )
        additional_memory_shared_extra_memory_child = (
            get_additional_memory_estimation_while_running_processes(
                target=shared_extra_memory_child, count=children_count, use_shared_memory=True
            )
            - additional_memory_simple_child
        )

        # Relative difference between (per-child shared estimate * children_count) and the
        # per-child unshared estimate.
        memory_estimation_difference_ratio = (
            abs((additional_memory_shared_extra_memory_child * children_count) - additional_memory_extra_memory_child)
            / additional_memory_extra_memory_child
        )

        estimated_memory_expectation.value = memory_estimation_difference_ratio < test_tolerance

        # Print the measured numbers only when the expectation was not met.
        if not estimated_memory_expectation.value:
            print(
                f'{additional_memory_shared_extra_memory_child=}\n'
                f'{children_count=}\n'
                f'{additional_memory_extra_memory_child=}\n'
                f'{memory_estimation_difference_ratio=}'
            )

    process = ctx.Process(target=parent_process)
    process.start()
    process.join()

    assert estimated_memory_expectation.value, (
        'Estimated memory usage for process with shared memory does not meet the expectation.'
    )


================================================
FILE: tests/unit/_utils/test_timedelta_ms.py
================================================
from __future__ import annotations

from datetime import timedelta
from typing import Any

import pytest
from pydantic import BaseModel

from crawlee._utils.models import timedelta_ms


class _ModelWithTimedeltaMs(BaseModel):
    """Minimal pydantic model with one optional `timedelta_ms` field, used to exercise that type in tests."""

    # Optional so that the `None` case can be covered as well.
    time_delta: timedelta_ms | None = None


@pytest.mark.parametrize(
    ('time_delta_input', 'expected_time_delta', 'expected_model_dump_value'),
    [
        (1.0, timedelta(milliseconds=1), 1),
        (1, timedelta(milliseconds=1), 1),
        ('1', timedelta(milliseconds=1), 1),
        (timedelta(milliseconds=1), timedelta(milliseconds=1), 1),
        (3.01, timedelta(microseconds=3010), 3),
        (3.5, timedelta(microseconds=3500), 4),
        (3.99, timedelta(microseconds=3990), 4),
        (None, None, None),
        (float('inf'), timedelta(days=999999999, seconds=3600 * 24 - 1, microseconds=999999), float('inf')),
    ],
)
def test_model_with_timedelta_ms_input_types(
    time_delta_input: float | timedelta | Any | None,
    expected_time_delta: timedelta | None,
    expected_model_dump_value: float | None,
) -> None:
    """Check how a `timedelta_ms` field validates different inputs and what `model_dump` emits for them.

    Args:
        time_delta_input: Raw value handed to the model (number, numeric string, `timedelta` or `None`).
        expected_time_delta: Value the field must hold after validation; `None` for the `None` case.
        expected_model_dump_value: Value expected under `time_delta` in `model_dump()`. It is a whole
            number for the finite cases, `float('inf')` for the infinite case and `None` for the `None` case.
    """
    model = _ModelWithTimedeltaMs(time_delta=time_delta_input)  # ty: ignore[invalid-argument-type]
    assert model.time_delta == expected_time_delta
    assert model.model_dump() == {'time_delta': expected_model_dump_value}


================================================
FILE: tests/unit/_utils/test_urls.py
================================================
from __future__ import annotations

import pytest
from pydantic import ValidationError

from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute, validate_http_url


def test_is_url_absolute() -> None:
    """`is_url_absolute` returns `True` only for URLs that carry a scheme."""
    expectations = {
        'http://example.com/path': True,
        'https://example.com/path': True,
        'ftp://example.com/path': True,
        '//example.com/path': False,
        '/path/to/resource': False,
        'relative/path/to/resource': False,
        'example.com/path': False,
    }
    for candidate, expected in expectations.items():
        # Identity comparison keeps the strict `is True` / `is False` check.
        assert is_url_absolute(candidate) is expected


def test_convert_to_absolute_url() -> None:
    """Root-relative, scheme-relative and parent-relative URLs all resolve to the same absolute URL."""
    expected_url = 'http://example.com/path/to/resource'
    # (base URL, relative URL) pairs.
    cases = [
        ('http://example.com', '/path/to/resource'),
        ('http://example.com', '//example.com/path/to/resource'),
        ('http://example.com/base/', '../path/to/resource'),
    ]

    for base, relative in cases:
        resolved = convert_to_absolute_url(base, relative)
        assert resolved == expected_url


def test_validate_http_url() -> None:
    """`validate_http_url` passes `None` and a valid URL through and rejects a URL with a bad scheme."""
    # `None` is returned unchanged.
    assert validate_http_url(None) is None

    # A well-formed HTTPS URL is returned unchanged.
    good_url = 'https://example.com'
    assert validate_http_url(good_url) == good_url

    # A misspelled scheme raises pydantic's `ValidationError`.
    bad_url = 'htp://invalid-url'
    with pytest.raises(ValidationError):
        validate_http_url(bad_url)


================================================
FILE: tests/unit/browsers/test_browser_pool.py
================================================
from __future__ import annotations

from datetime import timedelta
from typing import TYPE_CHECKING
from unittest.mock import AsyncMock

import pytest

from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
from crawlee.browsers._browser_controller import BrowserController
from crawlee.browsers._types import CrawleePage
from tests.unit.utils import run_alone_on_mac

if TYPE_CHECKING:
    from collections.abc import Mapping
    from typing import Any

    from yarl import URL

    from crawlee.proxy_configuration import ProxyInfo


async def test_default_plugin_new_page_creation(server_url: URL) -> None:
    """Open two pages through a `BrowserPool` with default plugins and verify each page and the page counter."""
    async with BrowserPool() as browser_pool:
        page_1 = await browser_pool.new_page()
        await page_1.page.goto(str(server_url))
        assert page_1.browser_type == 'chromium'
        assert page_1.page.url == str(server_url)
        assert '<html' in await page_1.page.content()  # there is some HTML content
        assert browser_pool.total_pages_count == 1

        page_2 = await browser_pool.new_page()
        await page_2.page.goto(str(server_url / 'status/200'))
        assert page_2.browser_type == 'chromium'
        assert page_2.page.url == str(server_url / 'status/200')
        # Check the second page's own content; checking `page_1` again here would leave `page_2` unverified.
        assert '<html' in await page_2.page.content()  # there is some HTML content
        assert browser_pool.total_pages_count == 2

        await page_1.page.close()
        await page_2.page.close()


async def test_multiple_plugins_new_page_creation(server_url: URL) -> None:
    """With a chromium and a firefox plugin, consecutive `new_page` calls alternate between the two browsers."""
    chromium_plugin = PlaywrightBrowserPlugin(browser_type='chromium')
    firefox_plugin = PlaywrightBrowserPlugin(browser_type='firefox')

    # (URL to visit, browser type expected for that page), in the order the pages are requested.
    visits = [
        (str(server_url), 'chromium'),
        (str(server_url / 'headers'), 'firefox'),
        (str(server_url / 'user-agent'), 'chromium'),
    ]

    async with BrowserPool([chromium_plugin, firefox_plugin]) as browser_pool:
        assert browser_pool.plugins == [chromium_plugin, firefox_plugin]

        opened_pages: list[CrawleePage] = []
        for target_url, expected_browser in visits:
            crawlee_page = await browser_pool.new_page()
            await crawlee_page.page.goto(target_url)
            assert crawlee_page.browser_type == expected_browser
            assert crawlee_page.page.url == target_url
            assert '<html' in await crawlee_page.page.content()  # there is some HTML content
            opened_pages.append(crawlee_page)

        # Close the pages in the order they were opened.
        for crawlee_page in opened_pages:
            await crawlee_page.page.close()

        assert browser_pool.total_pages_count == 3


# The rerun count must be passed as `reruns`; pytest-rerunfailures does not read a `rerun` keyword and
# would fall back to a single rerun.
@pytest.mark.flaky(
    reruns=3,
    reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1660.',
)
async def test_new_page_with_each_plugin(server_url: URL) -> None:
    """`new_page_with_each_plugin` returns one usable page per plugin, in plugin order."""
    plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium')
    plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox')

    async with BrowserPool([plugin_chromium, plugin_firefox]) as browser_pool:
        pages = await browser_pool.new_page_with_each_plugin()

        assert len(pages) == 2

        assert pages[0].browser_type == 'chromium'
        assert pages[1].browser_type == 'firefox'

        await pages[0].page.goto(str(server_url))
        assert pages[0].page.url == str(server_url)
        assert '<html' in await pages[0].page.content()  # there is some HTML content

        await pages[1].page.goto(str(server_url / 'headers'))
        assert pages[1].page.url == str(server_url / 'headers')
        assert '<html' in await pages[1].page.content()

        for page in pages:
            await page.page.close()

        # The counter is asserted after the pages were closed and still reports both of them.
        assert browser_pool.total_pages_count == 2


@run_alone_on_mac
async def test_with_default_plugin_constructor(server_url: URL) -> None:
    """`BrowserPool.with_default_plugin` creates a pool with a single Playwright plugin of the requested type."""
    # The operation timeout is generous so that Firefox has enough time to launch on slow Windows CI.
    launch_timeout = timedelta(seconds=60)
    pool_factory = BrowserPool.with_default_plugin(
        headless=True, browser_type='firefox', operation_timeout=launch_timeout
    )

    async with pool_factory as browser_pool:
        plugins = browser_pool.plugins
        assert len(plugins) == 1
        assert isinstance(plugins[0], PlaywrightBrowserPlugin)

        firefox_page = await browser_pool.new_page()
        assert firefox_page.browser_type == 'firefox'

        target_url = str(server_url)
        await firefox_page.page.goto(target_url)
        assert firefox_page.page.url == target_url
        assert '<html' in await firefox_page.page.content()  # there is some HTML content

        await firefox_page.page.close()
        assert browser_pool.total_pages_count == 1


async def test_new_page_with_existing_id() -> None:
    """Requesting a new page with the ID of an already opened page raises `ValueError`."""
    async with BrowserPool() as browser_pool:
        first_page = await browser_pool.new_page()
        taken_id = first_page.id

        with pytest.raises(ValueError, match=r'Page with ID: .* already exists.'):
            await browser_pool.new_page(page_id=taken_id)


async def test_new_page_with_invalid_plugin() -> None:
    """Passing a plugin that the pool was not created with raises `ValueError`."""
    registered_plugin = PlaywrightBrowserPlugin(browser_type='chromium')
    foreign_plugin = PlaywrightBrowserPlugin(browser_type='firefox')

    async with BrowserPool([registered_plugin]) as browser_pool:
        expected_message = r'Provided browser_plugin is not one of the plugins used by BrowserPool.'
        with pytest.raises(ValueError, match=expected_message):
            await browser_pool.new_page(browser_plugin=foreign_plugin)


async def test_resource_management(server_url: URL) -> None:
    """A page left open inside the pool context is closed once that context exits."""
    chromium_plugin = PlaywrightBrowserPlugin(browser_type='chromium')
    target_url = str(server_url)

    async with BrowserPool([chromium_plugin]) as browser_pool:
        opened = await browser_pool.new_page()
        await opened.page.goto(target_url)

        assert opened.page.url == target_url
        html = await opened.page.content()
        assert '<html' in html  # there is some HTML content
        assert browser_pool.total_pages_count == 1

    # All pages should be closed in __aexit__
    assert opened.page.is_closed()


async def test_methods_raise_error_when_not_active() -> None:
    """Page-creating methods require an entered pool, and the pool cannot be entered twice at once."""
    browser_pool = BrowserPool([PlaywrightBrowserPlugin()])

    assert browser_pool.active is False

    # Both page factories must refuse to work before the pool is entered.
    for create_pages in (browser_pool.new_page, browser_pool.new_page_with_each_plugin):
        with pytest.raises(RuntimeError, match=r'BrowserPool is not active.'):
            await create_pages()

    # Entering the same pool again while it is already open is an error.
    with pytest.raises(RuntimeError, match=r'BrowserPool is already active.'):
        async with browser_pool, browser_pool:
            pass

    async with browser_pool:
        assert browser_pool.active is True


async def test_with_plugin_contains_page_options(server_url: URL) -> None:
    """Context options given to the plugin (here a custom user agent) show up in pages it opens."""
    custom_user_agent = 'My Best User-Agent'
    plugin = PlaywrightBrowserPlugin(browser_new_context_options={'user_agent': custom_user_agent})

    async with BrowserPool(plugins=[plugin]) as browser_pool:
        crawlee_page = await browser_pool.new_page()
        await crawlee_page.page.goto(str(server_url / 'user-agent'))

        rendered = await crawlee_page.page.content()
        assert custom_user_agent in rendered

        await crawlee_page.page.close()


@pytest.mark.parametrize(
    ('retire_after_page_count', 'expect_equal_browsers'),
    [
        pytest.param(2, True, id='Two pages opened in the same browser'),
        pytest.param(1, False, id='Each page opened in a new browser.'),
    ],
)
async def test_browser_pool_retire_browser_after_page_count(
    retire_after_page_count: int, *, expect_equal_browsers: bool
) -> None:
    """Open two pages one after another and check whether they share a browser context."""
    contexts = []

    async with BrowserPool(retire_browser_after_page_count=retire_after_page_count) as browser_pool:
        # Each page is closed before the next one is requested.
        for _ in range(2):
            crawlee_page = await browser_pool.new_page()
            contexts.append(crawlee_page.page.context)
            await crawlee_page.page.close()

        first_context, second_context = contexts
        assert (first_context is second_context) is expect_equal_browsers


async def test_pre_page_create_hook_is_called() -> None:
    """A pre-page-create hook runs once, before the page exists, and may change the context options."""
    hook_spy = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.pre_page_create_hook
        async def hook(
            page_id: str,
            controller: BrowserController,
            browser_new_context_options: dict[str, Any],
            proxy_info: ProxyInfo | None,
        ) -> None:
            await hook_spy(page_id, controller, browser_new_context_options, proxy_info)

            # The options are mutated in place; the test later checks the page reports this user agent.
            browser_new_context_options['user_agent'] = 'Modified User-Agent'

            # At this point the controller does not track any page yet.
            assert len(controller.pages) == 0

        created_page = await browser_pool.new_page()
        reported_user_agent = await created_page.page.evaluate('navigator.userAgent')

        await created_page.page.close()

    assert reported_user_agent == 'Modified User-Agent'

    hook_spy.assert_awaited_once()
    seen_page_id, seen_controller, _, seen_proxy_info = hook_spy.call_args[0]

    assert isinstance(seen_page_id, str)
    assert created_page.id == seen_page_id
    assert isinstance(seen_controller, BrowserController)
    assert seen_proxy_info is None


async def test_post_page_create_hook_is_called() -> None:
    """A post-page-create hook runs once with the new page, and its changes to the page are visible later."""
    hook_spy = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.post_page_create_hook
        async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None:
            await hook_spy(crawlee_page, controller)
            # Leave a marker in the page so the test can verify the hook acted on the returned page.
            await crawlee_page.page.evaluate('window.__hook_applied = true')

            assert isinstance(crawlee_page, CrawleePage)

            # By the time the hook runs the page is already tracked by the controller.
            assert len(controller.pages) == 1

        created_page = await browser_pool.new_page()

        marker = await created_page.page.evaluate('window.__hook_applied')

        await created_page.page.close()

    assert marker is True

    hook_spy.assert_awaited_once()
    seen_page, seen_controller = hook_spy.call_args[0]

    assert created_page is seen_page
    assert isinstance(seen_controller, BrowserController)


async def test_pre_page_close_hook() -> None:
    """A pre-page-close hook runs once while the page is still open and tracked by its controller."""
    hook_spy = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.pre_page_close_hook
        async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None:
            await hook_spy(crawlee_page, controller)

            still_open = not crawlee_page.page.is_closed()
            assert still_open
            assert len(controller.pages) == 1

        closing_page = await browser_pool.new_page()
        await closing_page.page.close()

    hook_spy.assert_awaited_once()
    assert closing_page.page.is_closed()


async def test_post_page_close_hook() -> None:
    """A post-page-close hook runs once, after the page is no longer tracked by its controller."""
    call_mock = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.post_page_close_hook
        async def hook(page_id: str, controller: BrowserController) -> None:
            await call_mock(page_id, controller)

            # The closed page has already been dropped from the controller when this hook fires.
            assert len(controller.pages) == 0

        test_page = await browser_pool.new_page()
        await test_page.page.close()

    # Check the call count before unpacking: `call_args` is `None` for a mock that was never called, so
    # unpacking first would fail with an unrelated `TypeError` instead of a clear assertion error.
    call_mock.assert_awaited_once()
    page_id, controller = call_mock.call_args[0]

    assert test_page.id == page_id
    assert isinstance(controller, BrowserController)


async def test_page_hooks_execution_order() -> None:
    """The four page lifecycle hooks fire in the order pre-create, post-create, pre-close, post-close."""
    events: list[str] = []
    expected_sequence = ['pre_create', 'post_create', 'pre_close', 'post_close']

    async with BrowserPool() as browser_pool:

        @browser_pool.pre_page_create_hook
        async def pre_create(
            _page_id: str,
            _controller: BrowserController,
            _browser_new_context_options: Mapping[str, Any],
            _proxy_info: ProxyInfo | None,
        ) -> None:
            events.append('pre_create')

        @browser_pool.post_page_create_hook
        async def post_create(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            events.append('post_create')

        @browser_pool.pre_page_close_hook
        async def pre_close(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            events.append('pre_close')

        @browser_pool.post_page_close_hook
        async def post_close(_page_id: str, _controller: BrowserController) -> None:
            events.append('post_close')

        # One full open/close cycle should trigger every hook exactly once.
        crawlee_page = await browser_pool.new_page()
        await crawlee_page.page.close()

    assert events == expected_sequence


async def test_multiple_hooks_all_called() -> None:
    """Two hooks registered for the same event both run, in registration order."""
    invoked: list[str] = []

    async with BrowserPool() as browser_pool:

        @browser_pool.post_page_create_hook
        async def first(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            invoked.append('first')

        @browser_pool.post_page_create_hook
        async def second(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            invoked.append('second')

        crawlee_page = await browser_pool.new_page()
        await crawlee_page.page.close()

    assert invoked == ['first', 'second']


================================================
FILE: tests/unit/browsers/test_playwright_browser.py
================================================
from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING

import pytest
from playwright.async_api import async_playwright

from crawlee.browsers._playwright_browser import PlaywrightPersistentBrowser

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from playwright.async_api import Playwright


@pytest.fixture
async def playwright() -> AsyncGenerator[Playwright, None]:
    """Yield a started Playwright instance; its context manager is exited after the test."""
    async with async_playwright() as pw_instance:
        yield pw_instance


async def test_init(playwright: Playwright) -> None:
    """A freshly constructed persistent browser stores its inputs and reports itself as connected."""
    chromium = playwright.chromium
    persistent = PlaywrightPersistentBrowser(chromium, user_data_dir=None, browser_launch_options={})

    # The browser type is reachable through both the private attribute and the public property.
    assert persistent._browser_type == chromium
    assert persistent.browser_type == chromium
    assert persistent._browser_launch_options == {}

    # Right after construction there is neither a temporary nor a user-supplied data directory.
    assert persistent._temp_dir is None
    assert persistent._user_data_dir is None

    assert persistent._is_connected is True
    assert persistent.is_connected() is True


async def test_delete_temp_folder_with_close_browser(playwright: Playwright) -> None:
    """The temporary directory present after `new_context` is removed when the browser is closed."""
    launch_options = {'headless': True}
    persistent = PlaywrightPersistentBrowser(
        playwright.chromium,
        user_data_dir=None,
        browser_launch_options=launch_options,
    )

    await persistent.new_context()

    temp_dir = persistent._temp_dir
    assert isinstance(temp_dir, Path)
    assert temp_dir.exists()

    await persistent.close()

    assert not temp_dir.exists()


================================================
FILE: tests/unit/browsers/test_playwright_browser_controller.py
================================================
from __future__ import annotations

import asyncio
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any
from unittest.mock import AsyncMock

import pytest
from playwright.async_api import Browser, BrowserContext, Page, Playwright, async_playwright

from crawlee.browsers import PlaywrightBrowserController, PlaywrightPersistentBrowser

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from pathlib import Path

    from yarl import URL


@pytest.fixture
async def playwright() -> AsyncGenerator[Playwright, None]:
    """Yield a started Playwright instance; its context manager is exited after the test."""
    async with async_playwright() as pw_instance:
        yield pw_instance


@pytest.fixture
async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
    """Launch a Chromium browser for the test and close it afterwards."""
    chromium_browser = await playwright.chromium.launch()
    yield chromium_browser
    await chromium_browser.close()


@pytest.fixture
async def controller(browser: Browser) -> AsyncGenerator[PlaywrightBrowserController, None]:
    """Yield a controller limited to two open pages and close it after the test."""
    page_limited_controller = PlaywrightBrowserController(browser, max_open_pages_per_browser=2)
    yield page_limited_controller
    await page_limited_controller.close()


async def test_initial_state(browser: Browser) -> None:
    """A new controller has no pages, a recent timestamp, and free capacity."""
    fresh_controller = PlaywrightBrowserController(browser)
    one_second = timedelta(seconds=1)

    # Test initial state
    assert fresh_controller.pages == []
    assert fresh_controller.pages_count == 0
    assert isinstance(fresh_controller.last_page_opened_at, datetime)
    assert fresh_controller.idle_time < one_second
    assert fresh_controller.has_free_capacity


@pytest.mark.run_alone
async def test_open_and_close_page(controller: PlaywrightBrowserController, server_url: URL) -> None:
    """A page is tracked by the controller while open and dropped from it once closed."""
    opened = await controller.new_page()
    await opened.goto(str(server_url))

    # While the page is open the controller lists and counts it.
    assert opened in controller.pages
    assert controller.pages_count == 1
    assert controller.last_page_opened_at <= datetime.now(timezone.utc)

    await opened.close()

    # After closing, the controller no longer knows about the page.
    assert opened not in controller.pages
    assert controller.pages_count == 0


async def test_max_open_pages_limit(controller: PlaywrightBrowserController) -> None:
    """Opening a page beyond the limit fails; closing a page frees a slot again."""
    limit_error = r'Cannot open more pages in this browser.'

    first = await controller.new_page()
    assert controller.pages_count == 1

    second = await controller.new_page()
    assert controller.pages_count == 2

    # A further page is rejected and the count stays unchanged.
    with pytest.raises(ValueError, match=limit_error):
        await controller.new_page()

    assert controller.pages_count == 2

    # Closing one page makes room for another.
    await first.close()
    assert controller.pages_count == 1

    third = await controller.new_page()
    assert controller.pages_count == 2

    for remaining in (second, third):
        await remaining.close()

    assert controller.pages == []
    assert controller.pages_count == 0


async def test_idle_time(controller: PlaywrightBrowserController) -> None:
    """The controller's reported idle time grows while the test just waits."""
    earlier = controller.idle_time
    await asyncio.sleep(1)  # Simulate waiting
    later = controller.idle_time

    assert later > earlier


async def test_close_browser_with_open_pages(browser: Browser) -> None:
    """A plain `close()` is refused while a page is open; `close(force=True)` goes through."""
    busy_controller = PlaywrightBrowserController(browser, max_open_pages_per_browser=2)
    _ = await busy_controller.new_page()

    with pytest.raises(ValueError, match=r'Cannot close the browser while there are open pages.'):
        await busy_controller.close()

    # The refused close left the page and the browser connection untouched.
    assert busy_controller.pages_count == 1
    assert busy_controller.is_browser_connected

    await busy_controller.close(force=True)

    # After the forced close no pages remain and the browser is disconnected.
    assert busy_controller.pages_count == 0
    assert not busy_controller.is_browser_connected


async def test_memory_leak_on_concurrent_context_creation() -> None:
    """Test that only one browser context is created when multiple pages are opened concurrently."""
    # Spy that records how many times a persistent context is actually launched.
    context_launch_spy = AsyncMock()
    fake_context = AsyncMock(spec=BrowserContext)
    fake_context.new_page.return_value = AsyncMock(spec=Page)
    context_launch_spy.return_value = fake_context

    async def slow_launch_persistent_context(*args: Any, **kwargs: Any) -> Any:
        """Delay the launch so that both `new_page` calls overlap in time."""
        await asyncio.sleep(5)  # Simulate delay in creation to make sure race condition happens
        return await context_launch_spy(*args, **kwargs)

    # Mocked browser whose context launch is routed through the delayed wrapper above.
    fake_browser_type = AsyncMock()
    fake_browser_type.launch_persistent_context = slow_launch_persistent_context

    # Create minimal instance of PlaywrightBrowserController with mocked browser
    persistent_browser = PlaywrightPersistentBrowser(fake_browser_type, None, {})
    controller = PlaywrightBrowserController(persistent_browser, header_generator=None, fingerprint_generator=None)

    # Both calls will try to create browser context at the same time, but only one context should be created.
    concurrent_requests = (controller.new_page(), controller.new_page())
    await asyncio.gather(*concurrent_requests)

    assert context_launch_spy.call_count == 1


async def test_max_open_pages_limit_on_concurrent_creation(controller: PlaywrightBrowserController) -> None:
    """Two pages requested concurrently are both opened and counted by the controller."""
    opened_pages = await asyncio.gather(controller.new_page(), controller.new_page())

    assert controller.pages_count == 2

    for opened in opened_pages:
        await opened.close()


async def test_max_open_pages_limit_error_on_concurrent_creation(controller: PlaywrightBrowserController) -> None:
    """Test that max open pages limit is respected during concurrent page creation."""
    limit_error = r'Cannot open more pages in this browser.'

    # Three concurrent requests are issued, and the page-limit error must surface from `gather`.
    with pytest.raises(ValueError, match=limit_error):
        await asyncio.gather(controller.new_page(), controller.new_page(), controller.new_page())


async def test_browser_with_pre_existing_context(tmp_path: Path) -> None:
    """Test that using `Browser` with pre-existing active context re-uses such context."""
    async with async_playwright() as pw:
        existing_context = await pw.firefox.launch_persistent_context(
            user_data_dir=str(tmp_path),
            headless=True,
        )
        browser = existing_context.browser
        assert browser

        controller = PlaywrightBrowserController(browser=browser)
        first_page = await controller.new_page()
        second_page = await controller.new_page()

        # Both pages must live in the context that existed before the controller was created.
        assert first_page.context == second_page.context
        assert second_page.context == existing_context


================================================
FILE: tests/unit/browsers/test_playwright_browser_plugin.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from crawlee.browsers import PlaywrightBrowserPlugin

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from yarl import URL


@pytest.fixture
async def plugin() -> AsyncGenerator[PlaywrightBrowserPlugin, None]:
    """Yield a default `PlaywrightBrowserPlugin` that has been entered as an async context manager."""
    async with PlaywrightBrowserPlugin() as active_plugin:
        yield active_plugin


async def test_initial_state() -> None:
    """Constructor arguments are reflected in the plugin's public properties."""
    configured_plugin = PlaywrightBrowserPlugin(
        browser_type='chromium',
        browser_launch_options={'headless': False},
        browser_new_context_options={'viewport': {'width': 1920, 'height': 1080}},
        max_open_pages_per_browser=10,
    )

    # Test initial state
    assert configured_plugin.browser_type == 'chromium'

    launch_options = configured_plugin.browser_launch_options
    assert 'headless' in launch_options
    assert launch_options['headless'] is False

    assert configured_plugin.browser_new_context_options == {'viewport': {'width': 1920, 'height': 1080}}
    assert configured_plugin.max_open_pages_per_browser == 10


async def test_new_browser(plugin: PlaywrightBrowserPlugin, server_url: URL) -> None:
    """A browser created by the plugin is connected, can load a page, and disconnects on close."""
    controller = await plugin.new_browser()

    assert controller.is_browser_connected

    opened = await controller.new_page()
    await opened.goto(str(server_url))
    await opened.close()

    await controller.close()

    assert not controller.is_browser_connected


async def test_multiple_new_browsers(plugin: PlaywrightBrowserPlugin) -> None:
    """Each `new_browser` call returns a distinct controller object."""
    first_controller = await plugin.new_browser()
    second_controller = await plugin.new_browser()

    assert first_controller is not second_controller


async def test_methods_raise_error_when_not_active() -> None:
    """`new_browser` requires an entered plugin, and the plugin cannot be entered twice at once."""
    inactive_plugin = PlaywrightBrowserPlugin()

    assert inactive_plugin.active is False

    # Creating a browser before entering the plugin is rejected.
    with pytest.raises(RuntimeError, match=r'Plugin is not active'):
        await inactive_plugin.new_browser()

    # Entering the same plugin again while it is already open is an error.
    with pytest.raises(RuntimeError, match=r'Plugin is already active.'):
        async with inactive_plugin, inactive_plugin:
            pass

    async with inactive_plugin as _:
        assert inactive_plugin.active is True


async def raise_error_if_chrome_and_executable_path() -> None:
    """Expect `ValueError` when browser type `chrome` is combined with an explicit `executable_path`.

    NOTE(review): this function's name does not start with `test_`, so pytest's default collection rules will
    presumably skip it - confirm whether it is meant to run.
    """
    expected_error = r'Cannot use `use_chrome` with `Configuration.default_browser_path` or `executable_path` set.'
    chrome_launch_options = {'executable_path': '/path/to/chrome'}

    with pytest.raises(ValueError, match=expected_error):
        PlaywrightBrowserPlugin(browser_type='chrome', browser_launch_options=chrome_launch_options)


================================================
FILE: tests/unit/conftest.py
================================================
from __future__ import annotations

import logging
import os
import warnings
from typing import TYPE_CHECKING, Any, cast

import pytest
from curl_cffi import CurlHttpVersion
from fakeredis import FakeAsyncRedis
from proxy import Proxy
from uvicorn.config import Config

from crawlee import service_locator
from crawlee.crawlers import BasicCrawler
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_network
from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient, ImpitHttpClient
from crawlee.proxy_configuration import ProxyInfo
from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore
from tests.unit.server import TestServer, app, serve_in_thread

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, Callable, Iterator
    from pathlib import Path

    from yarl import URL

    from crawlee.http_clients._base import HttpClient


@pytest.fixture(autouse=True)
async def suppress_user_warning() -> AsyncGenerator[None, None]:
    """Silence `UserWarning`s for the duration of every test.

    Mostly to suppress warnings about the experimental status of the SqlStorageClient.
    """
    with warnings.catch_warnings():
        # The filter is scoped to this `catch_warnings` block, which spans the whole test.
        warnings.simplefilter('ignore', category=UserWarning)
        yield


@pytest.fixture
def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callable[[], None]:
    """Build a callable that resets the package's global state to a known baseline.

    The callable also points the local storage directory at a per-test temporary path, so that tests do not
    share stored data.

    Args:
        monkeypatch: Test utility provided by pytest for patching.
        tmp_path: A unique temporary directory path provided by pytest for test isolation.

    Returns:
        A callable that prepares the test environment.
    """
    storage_dir = str(tmp_path)

    def _prepare_test_env() -> None:
        # Disable the browser sandbox by setting the environment variable. This is required for running
        # Playwright tests in the CI environment, where the sandbox is not supported.
        monkeypatch.setenv('CRAWLEE_DISABLE_BROWSER_SANDBOX', 'true')

        # Set the environment variable for the local storage directory to the temporary path.
        monkeypatch.setenv('CRAWLEE_STORAGE_DIR', storage_dir)

        # Reset the services in the service locator.
        for service_attr in ('_configuration', '_event_manager', '_storage_client'):
            setattr(service_locator, service_attr, None)
        service_locator.storage_instance_manager.clear_cache()

        # Verify that the test environment was set up correctly.
        assert os.environ.get('CRAWLEE_STORAGE_DIR') == storage_dir

        # Reset global class variables to ensure test isolation.
        KeyValueStore._autosaved_values = {}
        Statistics._Statistics__next_id = 0  # type:ignore[attr-defined] # Mangled attribute
        BasicCrawler._BasicCrawler__next_id = 0  # type:ignore[attr-defined] # Mangled attribute

    return _prepare_test_env


@pytest.fixture(autouse=True)
def _isolate_test_environment(prepare_test_env: Callable[[], None]) -> None:
    """Reset the global state before each test so that every test starts from a clean slate.

    The fixture is autouse, so it runs for all tests. It performs the reset once, before the test body.

    Args:
        prepare_test_env: Fixture to prepare the environment before each test.
    """
    reset_environment = prepare_test_env
    reset_environment()


@pytest.fixture(autouse=True)
def _set_crawler_log_level(pytestconfig: pytest.Config, monkeypatch: pytest.MonkeyPatch) -> None:
    """Make `_log_config.get_configured_log_level` follow pytest's `--log-level` option when one is given."""
    from crawlee import _log_config  # noqa: PLC0415

    requested_level = cast('str | None', pytestconfig.getoption('--log-level'))
    if requested_level is None:
        # No level requested on the command line: leave the crawlee logging configuration alone.
        return

    def _level_from_cli() -> int:
        # Resolved lazily, on each call, from the level name passed to pytest.
        return getattr(logging, requested_level.upper())

    monkeypatch.setattr(_log_config, 'get_configured_log_level', _level_from_cli)


@pytest.fixture
async def proxy_info(unused_tcp_port: int) -> ProxyInfo:
    """Describe an HTTP proxy on 127.0.0.1 at an unused TCP port, with basic-auth credentials."""
    host, user, secret = '127.0.0.1', 'user', 'pass'
    proxy_url = f'http://{user}:{secret}@{host}:{unused_tcp_port}'

    return ProxyInfo(
        url=proxy_url,
        scheme='http',
        hostname=host,
        port=unused_tcp_port,
        username=user,
        password=secret,
    )


@pytest.fixture
async def proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyInfo, None]:
    """Run a `Proxy` configured from `proxy_info` for the duration of the test and yield that info."""
    credentials = f'{proxy_info.username}:{proxy_info.password}'
    proxy_args = [
        '--hostname',
        proxy_info.hostname,
        '--port',
        str(proxy_info.port),
        '--basic-auth',
        credentials,
    ]

    with Proxy(proxy_args):
        yield proxy_info


@pytest.fixture
async def disabled_proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyInfo, None]:
    """Run a `Proxy` like the `proxy` fixture does, but started with the `--disable-http-proxy` flag."""
    credentials = f'{proxy_info.username}:{proxy_info.password}'
    proxy_args = [
        '--hostname',
        proxy_info.hostname,
        '--port',
        str(proxy_info.port),
        '--basic-auth',
        credentials,
        '--disable-http-proxy',
    ]

    with Proxy(proxy_args):
        yield proxy_info


@pytest.fixture(scope='session')
def header_network() -> dict:
    """Provide the result of `get_available_header_network`, computed once per test session."""
    network = get_available_header_network()
    return network


@pytest.fixture
async def key_value_store() -> AsyncGenerator[KeyValueStore, None]:
    """Open a key-value store for the test and drop it afterwards."""
    store = await KeyValueStore.open()
    yield store
    await store.drop()


@pytest.fixture(scope='session')
def http_server(unused_tcp_port_factory: Callable[[], int]) -> Iterator[TestServer]:
    """Create and start an HTTP test server."""
    free_port = unused_tcp_port_factory()
    server_config = Config(app=app, lifespan='off', loop='asyncio', port=free_port)
    yield from serve_in_thread(TestServer(config=server_config))


@pytest.fixture(scope='session')
def server_url(http_server: TestServer) -> URL:
    """Expose the URL of the session-scoped test server."""
    base_url = http_server.url
    return base_url


# It is needed only in some tests, so we use the standard `scope=function`
@pytest.fixture
def redirect_http_server(unused_tcp_port_factory: Callable[[], int]) -> Iterator[TestServer]:
    """Create and start a function-scoped HTTP test server on its own unused port."""
    server_options: dict[str, Any] = {
        'app': app,
        'lifespan': 'off',
        'loop': 'asyncio',
        'port': unused_tcp_port_factory(),
        'limit_max_requests': 100,
        'timeout_graceful_shutdown': 10,
        'log_level': 'error',
        'access_log': False,
        'ws': 'websockets-sansio',
    }
    redirect_server = TestServer(config=Config(**server_options))
    yield from serve_in_thread(redirect_server)


@pytest.fixture
def redirect_server_url(redirect_http_server: TestServer) -> URL:
    """Expose the URL of the function-scoped `redirect_http_server`."""
    base_url = redirect_http_server.url
    return base_url


@pytest.fixture(
    params=[
        pytest.param('httpx', id='httpx'),
        pytest.param('impit', id='impit'),
        pytest.param('curl', id='curl'),
    ]
)
async def http_client(request: pytest.FixtureRequest) -> AsyncGenerator[HttpClient, None]:
    """Yield an entered HTTP client, once for each parametrized implementation (httpx, impit, curl)."""
    client_setups: dict[str, tuple[type[HttpClient], dict[str, Any]]] = {
        'curl': (CurlImpersonateHttpClient, {'http_version': CurlHttpVersion.V1_1}),
        'impit': (ImpitHttpClient, {'http3': False}),
    }
    # Any other parameter value (in practice 'httpx') falls back to the httpx client.
    httpx_setup: tuple[type[HttpClient], dict[str, Any]] = (HttpxHttpClient, {'http2': True})
    class_client, kwargs = client_setups.get(request.param, httpx_setup)

    async with class_client(**kwargs) as client:
        yield client


@pytest.fixture
def redis_client() -> FakeAsyncRedis:
    """Provide a fresh `FakeAsyncRedis` instance."""
    fake_client = FakeAsyncRedis()
    return fake_client


================================================
FILE: tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
================================================
from __future__ import annotations

import asyncio
import logging
from dataclasses import dataclass
from datetime import timedelta
from itertools import cycle
from typing import TYPE_CHECKING, cast
from unittest.mock import Mock, call, patch

import pytest
from bs4 import Tag
from parsel import Selector
from typing_extensions import override

from crawlee import Request
from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPostNavCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
    BasicCrawler,
    RenderingType,
    RenderingTypePrediction,
    RenderingTypePredictor,
)
from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import (
    AdaptivePlaywrightCrawlerStatisticState,
)
from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (
    AdaptiveContextError,
)
from crawlee.sessions import SessionPool
from crawlee.statistics import Statistics
from crawlee.storage_clients import SqlStorageClient
from crawlee.storages import KeyValueStore, RequestQueue

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, Iterator
    from pathlib import Path

    from yarl import URL


# Heading text that is part of the initial HTML markup.
_H1_TEXT = 'Static'
# Heading text that exists only after the inline script has run.
_H2_TEXT = 'Only in browser'
# Replacement text the inline script writes into the first <h3>.
_H3_CHANGED_TEXT = 'Changed by JS'
# Delay, in milliseconds, passed to `setTimeout` before the script modifies the page.
_INJECTED_JS_DELAY_MS = 100
# Test page: static <h1>/<h3> markup plus a script that, after the delay, appends an <h2> and rewrites the <h3>.
# Doubled braces are literal braces escaped for the f-string.
_PAGE_CONTENT_STATIC = f"""
<h1>{_H1_TEXT}</h1>
<h3>Initial text</h3>
<script>
    setTimeout(function() {{
    let h2 = document.createElement("h2");
    h2.innerText = "{_H2_TEXT}";
    document.getElementsByTagName("body")[0].append(h2);
    document.getElementsByTagName("h3")[0].textContent="{_H3_CHANGED_TEXT}";
    }}, {_INJECTED_JS_DELAY_MS});

</script>
"""


@pytest.fixture
def test_urls(server_url: URL) -> list[str]:
    """Example pages used in the test are mocked for static requests."""
    echo_endpoint = server_url.with_path('echo_content')
    # Two URLs with the same content; the second carries an extra `id` query parameter.
    query_variants = (
        {'content': _PAGE_CONTENT_STATIC},
        {'id': 'test2', 'content': _PAGE_CONTENT_STATIC},
    )
    return [str(echo_endpoint.with_query(**query)) for query in query_variants]


@pytest.fixture
async def key_value_store() -> AsyncGenerator[KeyValueStore, None]:
    """Open a key-value store for the test and drop it afterwards."""
    store = await KeyValueStore.open()
    yield store
    await store.drop()


class _SimpleRenderingTypePredictor(RenderingTypePredictor):
    """Predictor stub for tests whose answers are read from caller-supplied iterators."""

    def __init__(
        self,
        rendering_types: Iterator[RenderingType] | None = None,
        detection_probability_recommendation: None | Iterator[float] = None,
    ) -> None:
        super().__init__()

        # Without explicit iterators: always predict 'static' with a detection probability recommendation of 1.
        self._rendering_types = rendering_types or cycle(['static'])
        self._detection_probability_recommendation = detection_probability_recommendation or cycle([1])

    @override
    def predict(self, request: Request) -> RenderingTypePrediction:
        """Return the next rendering type and recommendation from the iterators; `request` is not inspected."""
        predicted_type = next(self._rendering_types)
        recommendation = next(self._detection_probability_recommendation)
        return RenderingTypePrediction(predicted_type, recommendation)

    @override
    def store_result(self, request: Request, rendering_type: RenderingType) -> None:
        """Discard the reported result; this stub does not learn from it."""


@dataclass(frozen=True)
class TestInput:
    """Parameters and expected counts for one `test_adaptive_crawling` scenario."""

    # Keep pytest from collecting this class as a test class despite its `Test` name prefix.
    __test__ = False

    # Expected number of handler (and pre-navigation hook) calls where `context.page` is available.
    expected_pw_count: int
    # Expected number of handler (and pre-navigation hook) calls where `context.page` raises.
    expected_static_count: int
    # Rendering types handed to the predictor stub, consumed one per prediction.
    rendering_types: Iterator[RenderingType]
    # Detection probability recommendations handed to the predictor stub, consumed one per prediction.
    detection_probability_recommendation: Iterator[float]


@pytest.mark.parametrize(
    'test_input',
    [
        pytest.param(
            TestInput(
                expected_pw_count=0,
                expected_static_count=2,
                # Lack of ty support, see https://github.com/astral-sh/ty/issues/2348.
                rendering_types=cycle(['static']),
                detection_probability_recommendation=cycle([0]),
            ),
            id='Static only',
        ),
        pytest.param(
            TestInput(
                expected_pw_count=2,
                expected_static_count=0,
                rendering_types=cycle(['client only']),
                detection_probability_recommendation=cycle([0]),
            ),
            id='Client only',
        ),
        pytest.param(
            TestInput(
                expected_pw_count=1,
                expected_static_count=1,
                rendering_types=cycle(['static', 'client only']),
                detection_probability_recommendation=cycle([0]),
            ),
            id='Mixed',
        ),
        pytest.param(
            TestInput(
                # With a recommendation of 1 both URLs are expected to be handled twice: once per crawling mode.
                expected_pw_count=2,
                expected_static_count=2,
                rendering_types=cycle(['static', 'client only']),
                detection_probability_recommendation=cycle([1]),
            ),
            id='Enforced rendering type detection',
        ),
    ],
)
async def test_adaptive_crawling(
    test_input: TestInput,
    test_urls: list[str],
) -> None:
    """Tests correct routing to pre-nav hooks and correct handling through proper handler."""

    predictor = _SimpleRenderingTypePredictor(
        rendering_types=test_input.rendering_types,
        detection_probability_recommendation=test_input.detection_probability_recommendation,
    )

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=predictor,
    )

    # Counters for handler invocations, split by whether `context.page` was accessible.
    pw_handler_count = 0
    static_handler_count = 0

    # The same split for pre-navigation hook invocations.
    pw_hook_count = 0
    static_hook_count = 0

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        """Count the call as Playwright or static, depending on whether `context.page` can be read."""
        nonlocal pw_handler_count
        nonlocal static_handler_count

        try:
            # page is available only if it was crawled by PlaywrightCrawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
            pw_handler_count += 1
        except AdaptiveContextError:
            static_handler_count += 1

    @crawler.pre_navigation_hook
    async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Count the hook call as Playwright or static, depending on whether `context.page` can be read."""
        nonlocal static_hook_count
        nonlocal pw_hook_count

        try:
            # page is available only if it was crawled by PlaywrightCrawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
            pw_hook_count += 1
        except AdaptiveContextError:
            static_hook_count += 1

    await crawler.run(test_urls)

    # Handlers and hooks must have been routed identically.
    assert pw_handler_count == test_input.expected_pw_count
    assert pw_hook_count == test_input.expected_pw_count

    assert static_handler_count == test_input.expected_static_count
    assert static_hook_count == test_input.expected_static_count


async def test_adaptive_crawling_parsel(test_urls: list[str]) -> None:
    """Smoke test of the Parsel flavour using a single predictor configuration.

    The remaining code paths are exercised through the BeautifulSoup variant."""
    # Handler invocations per sub crawler.
    handled_by = {'pw': 0, 'static': 0}

    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        rendering_type_predictor=_SimpleRenderingTypePredictor(
            rendering_types=cycle(['static', 'client only']),
            detection_probability_recommendation=cycle([0]),
        ),
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        try:
            # `page` exists only on contexts produced by the Playwright sub crawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
        except AdaptiveContextError:
            handled_by['static'] += 1
        else:
            handled_by['pw'] += 1

    await crawler.run(test_urls)

    assert handled_by['pw'] == 1
    assert handled_by['static'] == 1


async def test_adaptive_crawling_pre_nav_change_to_context(test_urls: list[str]) -> None:
    """Check that changes made to the context inside a pre-navigation hook reach the request handler."""
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=_SimpleRenderingTypePredictor(),
    )
    seen_by_hook = []
    seen_by_handler = []

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        seen_by_handler.append(context.request.user_data.get('data', None))

    @crawler.pre_navigation_hook
    async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # Record what the hook saw before it tags the request with the sub crawler that is running.
        seen_by_hook.append(context.request.user_data.get('data', None))
        try:
            # `page` exists only on contexts produced by the Playwright sub crawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
        except AdaptiveContextError:
            sub_crawler_tag = 'bs'
        else:
            sub_crawler_tag = 'pw'
        context.request.user_data['data'] = sub_crawler_tag

    await crawler.run(test_urls[:1])

    # Each hook invocation starts from untouched user data, i.e. probing runs do not leak into each other.
    assert seen_by_hook == [None, None]
    # The handler observes the value written by the hook of the same sub crawler.
    assert seen_by_handler == ['pw', 'bs']


async def test_playwright_only_pre_navigation_hook(test_urls: list[str]) -> None:
    """Check that a pre-navigation hook can be restricted to the Playwright sub crawler.

    One page is crawled while an unrestricted hook and a `playwright_only` hook are both registered. The unrestricted
    hook is expected to be called twice with the request URL, the restricted one exactly once."""
    common_hook_spy = Mock()
    playwright_hook_spy = Mock()

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=_SimpleRenderingTypePredictor(),
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        pass

    @crawler.pre_navigation_hook
    async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        common_hook_spy(context.request.url)

    @crawler.pre_navigation_hook(playwright_only=True)
    async def pre_nav_hook_pw_only(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        playwright_hook_spy(context.page.url)

    await crawler.run(test_urls[:1])

    # Unrestricted hook: invoked by the static as well as by the Playwright sub crawler.
    common_hook_spy.assert_has_calls([call(test_urls[0])] * 2)
    # Restricted hook: invoked by the Playwright sub crawler only; the page URL is still 'about:blank' at that point.
    playwright_hook_spy.assert_called_once_with('about:blank')


async def test_adaptive_crawling_post_nav_change_to_context(test_urls: list[str]) -> None:
    """Check that changes made to the context inside a post-navigation hook reach the request handler."""
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=_SimpleRenderingTypePredictor(),
    )
    seen_by_hook = []
    seen_by_handler = []

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        seen_by_handler.append(context.request.user_data.get('data', None))

    @crawler.post_navigation_hook
    async def post_nav_hook(context: AdaptivePlaywrightPostNavCrawlingContext) -> None:
        # Record what the hook saw before it tags the request with the sub crawler that is running.
        seen_by_hook.append(context.request.user_data.get('data', None))
        try:
            # `page` exists only on contexts produced by the Playwright sub crawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
        except AdaptiveContextError:
            sub_crawler_tag = 'bs'
        else:
            sub_crawler_tag = 'pw'
        context.request.user_data['data'] = sub_crawler_tag

    await crawler.run(test_urls[:1])

    # Each hook invocation starts from untouched user data, i.e. probing runs do not leak into each other.
    assert seen_by_hook == [None, None]
    # The handler observes the value written by the hook of the same sub crawler.
    assert seen_by_handler == ['pw', 'bs']


async def test_playwright_only_post_navigation_hook(test_urls: list[str]) -> None:
    """Check that a post-navigation hook can be restricted to the Playwright sub crawler.

    One page is crawled while an unrestricted hook and a `playwright_only` hook are both registered. The unrestricted
    hook is expected to be called twice with the request URL, the restricted one exactly once."""
    common_hook_spy = Mock()
    playwright_hook_spy = Mock()

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=_SimpleRenderingTypePredictor(),
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        pass

    @crawler.post_navigation_hook
    async def post_nav_hook(context: AdaptivePlaywrightPostNavCrawlingContext) -> None:
        common_hook_spy(context.request.url)

    @crawler.post_navigation_hook(playwright_only=True)
    async def post_nav_hook_pw_only(context: AdaptivePlaywrightPostNavCrawlingContext) -> None:
        playwright_hook_spy(context.page.url)

    await crawler.run(test_urls[:1])

    # Unrestricted hook: invoked by the static as well as by the Playwright sub crawler.
    common_hook_spy.assert_has_calls([call(test_urls[0])] * 2)
    # Restricted hook: invoked by the Playwright sub crawler only, with the page already at the crawled URL.
    playwright_hook_spy.assert_called_once_with(test_urls[0])


async def test_adaptive_crawling_result(test_urls: list[str]) -> None:
    """Check that the data of only one sub crawler ends up in the dataset.

    The handler pushes an item naming the sub crawler that ran it; only the 'pw' item is expected to be stored."""
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=_SimpleRenderingTypePredictor(),
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        try:
            # `page` exists only on contexts produced by the Playwright sub crawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
        except AdaptiveContextError:
            origin = 'bs'
        else:
            origin = 'pw'
        await context.push_data({'handler': origin})

    await crawler.run(test_urls[:1])

    # Both sub crawlers push an item during rendering type detection, yet only the pw one is kept.
    stored = await crawler.get_data()
    assert stored.items == [{'handler': 'pw'}]


@pytest.mark.parametrize(
    ('pw_saved_data', 'static_saved_data', 'expected_result_rendering_type'),
    [
        pytest.param({'some': 'data'}, {'some': 'data'}, 'static', id='Same results from sub crawlers'),
        pytest.param({'some': 'data'}, {'different': 'data'}, 'client only', id='Different results from sub crawlers'),
    ],
)
async def test_adaptive_crawling_predictor_calls(
    pw_saved_data: dict[str, str],
    static_saved_data: dict[str, str],
    expected_result_rendering_type: RenderingType,
    test_urls: list[str],
) -> None:
    """Check how the crawler talks to the predictor for matching and for differing sub crawler results.

    `predict` must be consulted once for the request and `store_result` must be called once with the rendering type
    expected for the parametrized pair of pushed items."""
    predictor = _SimpleRenderingTypePredictor()
    crawl_requests = [Request.from_url(url=test_urls[0], label='bla')]
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=predictor,
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        try:
            # `page` exists only on contexts produced by the Playwright sub crawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
        except AdaptiveContextError:
            item = static_saved_data
        else:
            item = pw_saved_data
        await context.push_data(item)

    with (
        patch.object(predictor, 'store_result', Mock()) as store_result_spy,
        patch.object(predictor, 'predict', Mock(return_value=RenderingTypePrediction('static', 1))) as predict_spy,
    ):
        await crawler.run(crawl_requests)

    assert predict_spy.call_count == 1
    predicted_request = predict_spy.call_args.args[0]
    assert predicted_request.url == crawl_requests[0].url

    # Identical items from both sub crawlers are stored as `static`, differing ones as `client only`.
    store_result_spy.assert_called_once_with(predicted_request, expected_result_rendering_type)


async def test_adaptive_crawling_result_use_state_isolation(
    key_value_store: KeyValueStore, test_urls: list[str]
) -> None:
    """Check that state obtained through `use_state` is modified by just one of the sub crawlers.

    The handler runs twice and bumps a counter in the shared state on every run, while the persisted counter is
    expected to grow by one only."""
    state_key = f'{BasicCrawler._CRAWLEE_STATE_KEY}_0'
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=_SimpleRenderingTypePredictor(),
    )
    await key_value_store.set_value(state_key, {'counter': 0})
    invocations = {'handler': 0}

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        shared_state = cast('dict[str, int]', await context.use_state())
        invocations['handler'] += 1
        shared_state['counter'] += 1

    await crawler.run(test_urls[:1])

    await key_value_store.persist_autosaved_values()

    # The handler ran once per sub crawler.
    assert invocations['handler'] == 2
    # Only one of those runs left a mark on the persisted state.
    persisted_state = await key_value_store.get_value(state_key)
    assert persisted_state['counter'] == 1


async def test_adaptive_crawling_statistics(test_urls: list[str]) -> None:
    """Check the statistics kept by the adaptive crawler.

    The `result_checker` rejects every result, so after the static run the browser run is expected as well. All three
    adaptive counters should then read one, while the request is counted as finished once."""
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=_SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])),
        result_checker=lambda result: False,  #  noqa: ARG005  # Intentionally unused argument.
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        pass

    await crawler.run(test_urls[:1])

    stats = crawler.statistics.state
    assert stats.http_only_request_handler_runs == 1
    assert stats.browser_request_handler_runs == 1
    assert stats.rendering_type_mispredictions == 1

    # Two sub crawler runs still amount to a single finished request for the top level crawler.
    assert stats.requests_finished == 1
    assert stats.requests_failed == 0


@pytest.mark.parametrize(
    'error_in_pw_crawler',
    [
        pytest.param(False, id='Error only in static sub crawler'),
        pytest.param(True, id='Error in both sub crawlers'),
    ],
)
async def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_pw_crawler: bool, test_urls: list[str]) -> None:
    """Check which results get committed when sub crawler handlers raise.

    The handler always fails when run without a page, after pushing an item that must never be stored. With a page it
    pushes `saved_data` and, depending on the parameter, fails afterwards.

    Expected outcome: when only the static run fails, the item pushed by the pw run is stored. When the pw run fails
    too, nothing is stored, even though `push_data` had been called before the exception.
    """
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=_SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])),
    )
    saved_data = {'some': 'data'}

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        try:
            # `page` exists only on contexts produced by the Playwright sub crawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
        except AdaptiveContextError:
            await context.push_data({'this': 'data should not be saved'})
            raise RuntimeError('Some bs sub crawler related error') from None

        await context.push_data(saved_data)
        if error_in_pw_crawler:
            raise RuntimeError('Some pw sub crawler related error')

    await crawler.run(test_urls[:1])

    dataset = await crawler.get_dataset()
    stored_results = [item async for item in dataset.iterate_items()]

    expected_results = [] if error_in_pw_crawler else [saved_data]
    assert stored_results == expected_results


async def test_adaptive_playwright_crawler_statistics_in_init() -> None:
    """Check that the crawler derives its adaptive statistics from a user supplied `Statistics` instance.

    The state must be of the adaptive state type and the persistence and logging settings must be carried over."""
    use_persistence = True
    kvs_name = 'some-name'
    state_key = 'come key'
    message = 'some message'
    message_logger = logging.getLogger('some logger')
    interval = timedelta(minutes=2)

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        statistics=Statistics.with_default_state(
            persistence_enabled=use_persistence,
            persist_state_kvs_name=kvs_name,
            persist_state_key=state_key,
            log_message=message,
            periodic_message_logger=message_logger,
            log_interval=interval,
        )
    )
    await crawler.run([])  # ensure that statistics get initialized

    crawler_statistics = crawler._statistics
    assert type(crawler_statistics.state) is AdaptivePlaywrightCrawlerStatisticState

    assert crawler_statistics._state._persistence_enabled == use_persistence
    assert crawler_statistics._state._persist_state_key == state_key

    assert crawler_statistics._log_message == message
    assert crawler_statistics._periodic_message_logger == message_logger


async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_urls: list[str]) -> None:
    """Check that a timed out static sub crawler handler still lets the browser sub crawler run.

    The handler blocks forever when invoked without a page, so only the configured `request_handler_timeout` can end
    it. The test then expects exactly one static and one browser handler invocation.
    """
    static_handler_spy = Mock(name='static_handler')
    browser_handler_spy = Mock(name='browser_handler')

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_request_retries=0,
        rendering_type_predictor=_SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])),
        # Generous enough for the static pipeline to reach the handler even on slow CI. Since the handler never
        # returns on its own, the timeout always fires while the handler is waiting.
        request_handler_timeout=timedelta(seconds=10),
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        try:
            # `page` exists only on contexts produced by the Playwright sub crawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
        except AdaptiveContextError:
            static_handler_spy()
            # Give the fallback browser request more room, browser startup can be slow on CI.
            crawler._request_handler_timeout = timedelta(seconds=120)
            # Never completes on its own; the request_handler_timeout has to cancel it.
            await asyncio.Event().wait()
        else:
            browser_handler_spy()

    await crawler.run(test_urls[:1])

    static_handler_spy.assert_called_once_with()
    # The browser handler ran although the static handler blocked for longer than the handler timeout.
    browser_handler_spy.assert_called_once_with()


async def test_adaptive_playwright_crawler_default_predictor(test_urls: list[str]) -> None:
    """Check that the crawler works with the rendering type predictor it creates by default."""
    static_handler_spy = Mock()
    browser_handler_spy = Mock()
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        try:
            # `page` exists only on contexts produced by the Playwright sub crawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
        except AdaptiveContextError:
            static_handler_spy()
        else:
            browser_handler_spy()

    await crawler.run(test_urls[:1])

    # With no data to predict from, the very first request is expected to go through rendering type detection,
    # i.e. through both sub crawlers.
    static_handler_spy.assert_called_once_with()
    browser_handler_spy.assert_called_once_with()


async def test_adaptive_context_query_selector_beautiful_soup(test_urls: list[str]) -> None:
    """Check `context.query_selector_one` in both crawl types of the BeautifulSoup variant.

    The handler looks up an h1 and then an h2 element. h1 is part of the served html, h2 is added later by an inline
    JS snippet. The page is first crawled by the static sub crawler, which can find only h1. Waiting for h2 there
    raises `AdaptiveContextError`, which makes the adaptive crawler retry with the playwright sub crawler, and that
    one is able to wait for h2."""
    selector_timeout = timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)
    h1_spy = Mock()
    h2_spy = Mock()

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_request_retries=1,
        rendering_type_predictor=_SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])),
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        h1_spy(await context.query_selector_one('h1', selector_timeout))
        h2_spy(await context.query_selector_one('h2', selector_timeout))

    await crawler.run(test_urls[:1])

    def _heading(name: str, text: str) -> Tag:
        """Build the tag the handler is expected to receive for a heading with the given text."""
        heading = Tag(name=name)
        heading.append(text)
        return heading

    expected_h1 = _heading('h1', _H1_TEXT)
    expected_h2 = _heading('h2', _H2_TEXT)

    # Found by both sub crawlers.
    h1_spy.assert_has_calls([call(expected_h1)] * 2)
    # Found by the pw sub crawler only.
    h2_spy.assert_has_calls([call(expected_h2)])


# NOTE: pytest-rerunfailures reads the rerun count from the `reruns` keyword of the `flaky` marker. Any other keyword
# (such as the former `rerun`) is ignored and the plugin falls back to a single rerun.
@pytest.mark.flaky(
    reruns=3,
    reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1650.',
)
async def test_adaptive_context_query_selector_parsel(test_urls: list[str]) -> None:
    """Test that `context.query_selector_one` works regardless of the crawl type for Parsel variant.

    Handler tries to locate two elements h1 and h2.
    h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html.
    Create situation where page is crawled with static sub crawler first.
    Static sub crawler should be able to locate only h1. It will try to wait for h2, trying to wait for h2 will trigger
    `AdaptiveContextError` which will force the adaptive crawler to try playwright sub crawler instead. Playwright sub
    crawler is able to wait for the h2 element."""

    # Get page with injected JS code that will add some element after timeout
    static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0]))
    expected_h1_tag = f'<h1>{_H1_TEXT}</h1>'
    expected_h2_tag = f'<h2>{_H2_TEXT}</h2>'

    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        max_request_retries=1,
        rendering_type_predictor=static_only_predictor_no_detection,
    )

    mocked_h1_handler = Mock()
    mocked_h2_handler = Mock()

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        # Wait up to twice the delay of the injected JS so that the dynamically added h2 has time to appear.
        if h1 := await context.query_selector_one('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)):
            mocked_h1_handler(type(h1), h1.get())
        if h2 := await context.query_selector_one('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)):
            mocked_h2_handler(type(h2), h2.get())

    await crawler.run(test_urls[:1])

    # Called by both sub crawlers
    mocked_h1_handler.assert_has_calls([call(Selector, expected_h1_tag), call(Selector, expected_h1_tag)])
    # Called only by pw sub crawler
    mocked_h2_handler.assert_has_calls([call(Selector, expected_h2_tag)])


async def test_adaptive_context_parse_with_static_parser_parsel(test_urls: list[str]) -> None:
    """Check `context.parse_with_static_parser` in both crawl types of the Parsel variant.

    (`context.wait_for_selector`, which `context.parse_with_static_parser` calls, is covered as well.)
    """
    h2_timeout = timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)
    h2_spy = Mock()

    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        max_request_retries=1,
        rendering_type_predictor=_SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])),
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        # The already parsed content is not expected to contain any h2.
        h2_spy(context.parsed_content.css('h2'))

        # Parse the whole page again once h2 has shown up.
        reparsed_content = await context.parse_with_static_parser(selector='h2', timeout=h2_timeout)
        h2_spy(reparsed_content.css('h2')[0].get())

    await crawler.run(test_urls[:1])

    expected_calls = [
        call([]),  # Static sub crawler tried and did not find h2.
        call([]),  # Playwright sub crawler tried and did not find h2 without waiting.
        call(f'<h2>{_H2_TEXT}</h2>'),  # Playwright waited for h2 to appear.
    ]
    h2_spy.assert_has_calls(expected_calls)


async def test_adaptive_context_helpers_on_changed_selector(test_urls: list[str]) -> None:
    """Check that context helpers operate on the latest version of the page.

    The page changes after a while: an H2 element is added and the text of the H3 element is replaced. The handler
    waits for H2 and then reads H3, which is expected to carry the changed text rather than the original static one.
    """
    h3_spy = Mock()

    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        max_request_retries=1,
        rendering_type_predictor=_SimpleRenderingTypePredictor(
            rendering_types=cycle(['client only']),
            detection_probability_recommendation=cycle([0]),
        ),
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        # The appearance of h2 signals that the page has been changed.
        await context.query_selector_one('h2')
        updated_h3 = await context.query_selector_one('h3')
        if updated_h3:
            h3_spy(updated_h3.get())

    await crawler.run(test_urls[:1])

    h3_spy.assert_called_once_with(f'<h3>{_H3_CHANGED_TEXT}</h3>')


async def test_adaptive_context_query_non_existing_element(test_urls: list[str]) -> None:
    """Check that `query_selector_one` yields `None` for a selector that matches nothing."""
    query_result_spy = Mock()

    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        max_request_retries=1,
        rendering_type_predictor=_SimpleRenderingTypePredictor(
            rendering_types=cycle(['client only']),
            detection_probability_recommendation=cycle([0]),
        ),
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        query_result = await context.query_selector_one('non sense selector', timeout=timedelta(milliseconds=1))
        query_result_spy(query_result)

    await crawler.run(test_urls[:1])

    query_result_spy.assert_called_once_with(None)


@pytest.mark.parametrize(
    'test_input',
    [
        pytest.param(
            TestInput(
                expected_pw_count=0,
                expected_static_count=2,
                rendering_types=cycle(['static']),
                detection_probability_recommendation=cycle([0]),
            ),
            id='Static only',
        ),
        pytest.param(
            TestInput(
                expected_pw_count=2,
                expected_static_count=0,
                rendering_types=cycle(['client only']),
                detection_probability_recommendation=cycle([0]),
            ),
            id='Client only',
        ),
        pytest.param(
            TestInput(
                expected_pw_count=2,
                expected_static_count=2,
                rendering_types=cycle(['static', 'client only']),
                detection_probability_recommendation=cycle([1]),
            ),
            id='Enforced rendering type detection',
        ),
    ],
)
async def test_change_context_state_after_handling(test_input: TestInput, server_url: URL) -> None:
    """Test that context state is saved after handling the request.

    The handler marks the session and appends to a list stored in the request user data. After the run, the session
    fetched from the pool and the request fetched from the queue must both reflect those changes, and the request
    list must contain the handler's entry exactly once.
    """
    predictor = _SimpleRenderingTypePredictor(
        rendering_types=test_input.rendering_types,
        detection_probability_recommendation=test_input.detection_probability_recommendation,
    )

    request_queue = await RequestQueue.open(name='state-test')
    used_session_id = None

    try:
        async with SessionPool() as session_pool:
            crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
                rendering_type_predictor=predictor,
                session_pool=session_pool,
                request_manager=request_queue,
            )

            @crawler.router.default_handler
            async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
                nonlocal used_session_id

                if context.session is not None:
                    used_session_id = context.session.id
                    context.session.user_data['session_state'] = True

                if isinstance(context.request.user_data['request_state'], list):
                    context.request.user_data['request_state'].append('handler')

            request = Request.from_url(str(server_url), user_data={'request_state': ['initial']})

            await crawler.run([request])

            assert used_session_id is not None

            session = await session_pool.get_session_by_id(used_session_id)
            check_request = await request_queue.get_request(request.unique_key)

            assert session is not None
            assert check_request is not None

            assert session.user_data.get('session_state') is True
            # Check that request user data was updated in the handler and only once.
            assert check_request.user_data.get('request_state') == ['initial', 'handler']
    finally:
        # Drop the named queue even when the run or an assertion fails, so that it is not left behind.
        await request_queue.drop()


async def test_adaptive_playwright_crawler_with_sql_storage(test_urls: list[str], tmp_path: Path) -> None:
    """Check that AdaptivePlaywrightCrawler can be created and run on top of SqlStorageClient."""
    database_file = tmp_path / 'test_table.db'
    handler_spy = Mock()

    # SQLite database in the temporary directory of this test.
    async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{database_file}') as storage_client:
        crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
            storage_client=storage_client,
        )

        @crawler.router.default_handler
        async def request_handler(_context: AdaptivePlaywrightCrawlingContext) -> None:
            handler_spy()

        await crawler.run(test_urls[:1])

        handler_spy.assert_called()


================================================
FILE: tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py
================================================
from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import (
    AdaptivePlaywrightCrawlerStatisticState,
)
from crawlee.statistics import Statistics


async def test_predictor_state_persistence() -> None:
    """Check that adaptive statistics survive a persist and restore round trip."""
    stored_counters = {
        'browser_request_handler_runs': 1,
        'rendering_type_mispredictions': 2,
        'http_only_request_handler_runs': 3,
    }

    async with Statistics(
        state_model=AdaptivePlaywrightCrawlerStatisticState, persistence_enabled=True
    ) as first_statistics:
        for counter_name, counter_value in stored_counters.items():
            setattr(first_statistics.state, counter_name, counter_value)

        # Remember the key so that the second instance can read from the same place.
        persist_key = first_statistics._state._persist_state_key
    # Leaving the context persists the statistics.

    # The second instance is initialized from the persisted values.
    async with Statistics(
        state_model=AdaptivePlaywrightCrawlerStatisticState,
        persistence_enabled=True,
        persist_state_key=persist_key,
    ) as restored_statistics:
        pass

    for counter_name, counter_value in stored_counters.items():
        assert getattr(restored_statistics.state, counter_name) == counter_value


================================================
FILE: tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py
================================================


================================================
FILE: tests/unit/crawlers/_adaptive_playwright/test_predictor.py
================================================
from __future__ import annotations

import pytest

from crawlee import Request
from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import (
    DefaultRenderingTypePredictor,
    RenderingType,
    calculate_url_similarity,
    get_url_components,
)
from crawlee.storages import KeyValueStore


@pytest.mark.parametrize('label', ['some label', None])
@pytest.mark.parametrize(
    ('url', 'expected_prediction'),
    [
        ('http://www.aaa.com/some/stuff/extra', 'static'),
        ('http://www.aab.com/some/otherstuff', 'static'),
        ('http://www.aac.com/some', 'static'),
        ('http://www.ddd.com/some/stuff/extra', 'client only'),
        ('http://www.dde.com/some/otherstuff', 'client only'),
        ('http://www.ddf.com/some', 'client only'),
    ],
)
async def test_predictor_same_label(url: str, expected_prediction: RenderingType, label: str | None) -> None:
    """Test that a predictor trained on a few URLs predicts the rendering type of similar URLs.

    All training requests and the predicted request share the same `label` (which may also be `None`).
    """
    # A small training set: three 'static' hosts and three 'client only' hosts.
    training_samples: tuple[tuple[str, RenderingType], ...] = (
        ('http://www.aaa.com/some/stuff', 'static'),
        ('http://www.aab.com/some/stuff', 'static'),
        ('http://www.aac.com/some/stuff', 'static'),
        ('http://www.ddd.com/some/stuff', 'client only'),
        ('http://www.dde.com/some/stuff', 'client only'),
        ('http://www.ddf.com/some/stuff', 'client only'),
    )

    async with DefaultRenderingTypePredictor() as predictor:
        for sample_url, sample_rendering_type in training_samples:
            training_request = Request.from_url(url=sample_url, label=label)
            predictor.store_result(training_request, rendering_type=sample_rendering_type)

        prediction = predictor.predict(Request.from_url(url=url, label=label))
        assert prediction.rendering_type == expected_prediction


async def test_predictor_new_label_increased_detection_probability_recommendation() -> None:
    """Test that urls of uncommon labels have increased detection recommendation.

    The increase should shrink step by step as the predictor stores more results for the label."""
    detection_ratio = 0.01
    label = 'some label'
    probe_url = 'http://www.aaa.com/some/stuffa'

    # Each learned URL is paired with the multiple of `detection_ratio` expected right after it has been stored.
    learning_steps = (
        ('http://www.aaa.com/some/stuff', 4),
        ('http://www.aaa.com/some/stuffe', 3),
        ('http://www.aaa.com/some/stuffi', 2),
        # With the fourth result the label is considered stable, so there is no increase anymore.
        ('http://www.aaa.com/some/stuffo', 1),
    )

    async with DefaultRenderingTypePredictor(detection_ratio=detection_ratio) as predictor:
        for learned_url, expected_multiplier in learning_steps:
            predictor.store_result(Request.from_url(url=learned_url, label=label), rendering_type='static')

            prediction = predictor.predict(Request.from_url(url=probe_url, label=label))
            assert prediction.rendering_type == 'static'
            assert prediction.detection_probability_recommendation == detection_ratio * expected_multiplier


async def test_unreliable_prediction() -> None:
    """Test that detection_probability_recommendation for unreliable predictions is 1.

    The predictor learns only two contradictory results of one label and is then asked about a request with
    a different, never seen label. Such a prediction is not reliable, so the predictor is expected to report
    the maximum detection_probability_recommendation."""
    learnt_label = 'some label'
    # One result of each rendering type, to make the predictor as uncertain as possible.
    contradictory_samples: tuple[tuple[str, RenderingType], ...] = (
        ('http://www.aaa.com/some/stuff', 'static'),
        ('http://www.aaa.com/some/otherstuff', 'client only'),
    )

    async with DefaultRenderingTypePredictor() as predictor:
        for sample_url, sample_rendering_type in contradictory_samples:
            predictor.store_result(
                Request.from_url(url=sample_url, label=learnt_label), rendering_type=sample_rendering_type
            )

        # Nothing was learnt for this label, which the predictor should signal with a recommendation of 1.
        unseen_request = Request.from_url(url='http://www.unknown.com', label='new label')
        prediction = predictor.predict(unseen_request)
        assert prediction.detection_probability_recommendation == 1


async def test_no_learning_data_prediction() -> None:
    """Test that a predictor without any stored results still returns a prediction.

    The prediction is expected to carry detection_probability_recommendation=1."""
    async with DefaultRenderingTypePredictor() as predictor:
        unseen_request = Request.from_url(url='http://www.unknown.com', label='new label')
        prediction = predictor.predict(unseen_request)

        assert prediction.detection_probability_recommendation == 1


async def test_persistent_no_learning_data_prediction() -> None:
    """Test that an untrained model is saved to the KeyValueStore after the predictor context is left."""
    persist_key = 'test-no_learning-state'

    # Only enter and leave the context; no results are stored in between.
    async with DefaultRenderingTypePredictor(persistence_enabled=True, persist_state_key=persist_key):
        pass

    key_value_store = await KeyValueStore.open()
    stored_state = await key_value_store.get_value(persist_key)

    assert stored_state is not None
    assert stored_state['model']['is_fitted'] is False


async def test_persistent_prediction() -> None:
    """Test that a trained model is saved to the KeyValueStore after the predictor context is left."""
    persist_key = 'test-persistent-state'
    learned_request = Request.from_url(url='http://www.aaa.com/some/stuff', label='some label')

    async with DefaultRenderingTypePredictor(persistence_enabled=True, persist_state_key=persist_key) as predictor:
        # Store a single result so that the model gets fitted.
        predictor.store_result(learned_request, rendering_type='static')

    key_value_store = await KeyValueStore.open()
    stored_state = await key_value_store.get_value(persist_key)

    assert stored_state is not None
    assert stored_state['model']['is_fitted'] is True


@pytest.mark.parametrize(
    ('persistence_enabled', 'same_result'),
    [
        pytest.param(True, True, id='with persistence'),
        pytest.param(False, False, id='without persistence'),
    ],
)
async def test_persistent_prediction_recovery(*, persistence_enabled: bool, same_result: bool) -> None:
    """Test that a new predictor recovers the model from the KeyValueStore only if it was persisted before.

    `same_result` states whether the recovered predictor is expected to give the same
    detection_probability_recommendation as the original one.
    """
    persist_key = 'test-persistent-state-recovery'
    sample_url = 'http://www.aaa.com/some/stuff'
    sample_label = 'some label'

    async with DefaultRenderingTypePredictor(
        detection_ratio=0.01,
        persistence_enabled=persistence_enabled,
        persist_state_key=persist_key,
    ) as predictor:
        # Store one result and remember what the trained predictor recommends.
        predictor.store_result(Request.from_url(url=sample_url, label=sample_label), rendering_type='static')
        prediction_before = predictor.predict(Request.from_url(url=sample_url, label=sample_label))

    # The second predictor always has persistence enabled, so it loads whatever state exists under the key.
    async with DefaultRenderingTypePredictor(
        detection_ratio=0.01,
        persistence_enabled=True,
        persist_state_key=persist_key,
    ) as recovered_predictor:
        prediction_after = recovered_predictor.predict(Request.from_url(url=sample_url, label=sample_label))

    # The recommendations must match exactly when the first predictor persisted its state, and differ otherwise.
    recommendations_match = (
        prediction_before.detection_probability_recommendation
        == prediction_after.detection_probability_recommendation
    )
    assert recommendations_match == same_result


@pytest.mark.parametrize(
    ('url_1', 'url_2', 'expected_rounded_similarity'),
    [
        (
            'https://docs.python.org/3/library/itertools.html#itertools.zip_longest',
            'https://docs.python.org/3.7/library/itertools.html#itertools.zip_longest',
            0.67,
        ),
        ('https://differente.com/same', 'https://differenta.com/same', 0),
        ('https://same.com/almost_the_same', 'https://same.com/almost_the_sama', 1),
        ('https://same.com/same/extra', 'https://same.com/same', 0.5),
    ],
)
def test_url_similarity(url_1: str, url_2: str, expected_rounded_similarity: float) -> None:
    """Test that the similarity of two URLs, rounded to two decimal places, matches the expected value."""
    components_1 = get_url_components(url_1)
    components_2 = get_url_components(url_2)

    similarity = calculate_url_similarity(url_1=components_1, url_2=components_2)

    assert round(similarity, 2) == expected_rounded_similarity


================================================
FILE: tests/unit/crawlers/_basic/test_basic_crawler.py
================================================
# ruff: noqa: ARG001
from __future__ import annotations

import asyncio
import json
import logging
import os
import re
import sys
import time
from asyncio import Future
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
from dataclasses import dataclass
from datetime import timedelta
from itertools import product
from typing import TYPE_CHECKING, Any, Literal, cast
from unittest.mock import AsyncMock, Mock, call, patch

import pytest

from crawlee import ConcurrencySettings, Glob, service_locator
from crawlee._request import Request, RequestState
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod
from crawlee._utils.robots import RobotsTxtFile
from crawlee.configuration import Configuration
from crawlee.crawlers import BasicCrawler
from crawlee.errors import RequestCollisionError, SessionError, UserDefinedErrorHandlerError
from crawlee.events import Event, EventCrawlerStatusData
from crawlee.events._local_event_manager import LocalEventManager
from crawlee.request_loaders import RequestList, RequestManagerTandem
from crawlee.sessions import Session, SessionPool
from crawlee.statistics import FinalStatistics
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient
from crawlee.storages import Dataset, KeyValueStore, RequestQueue

if TYPE_CHECKING:
    from collections.abc import Callable, Sequence
    from pathlib import Path

    from yarl import URL

    from crawlee._types import JsonSerializable
    from crawlee.statistics import StatisticsState


async def test_processes_requests_from_explicit_queue() -> None:
    """Test that the crawler handles the requests of a request queue passed as `request_manager`, in order."""
    urls = ('https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com')

    request_queue = await RequestQueue.open()
    await request_queue.add_requests(list(urls))

    crawler = BasicCrawler(request_manager=request_queue)
    handled_urls: list[str] = []

    @crawler.router.default_handler
    async def record_url(context: BasicCrawlingContext) -> None:
        handled_urls.append(context.request.url)

    await crawler.run()

    assert handled_urls == list(urls)


async def test_processes_requests_from_request_source_tandem() -> None:
    """Test that the crawler handles the requests of both parts of a `RequestManagerTandem`.

    The URL present in both the request list and the request queue is expected only once in the set of
    handled URLs.
    """
    queue_urls = ('https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com')
    list_urls = ('https://a.placeholder.com', 'https://d.placeholder.com', 'https://e.placeholder.com')

    request_queue = await RequestQueue.open()
    await request_queue.add_requests(list(queue_urls))

    request_list = RequestList(list(list_urls))

    tandem = RequestManagerTandem(request_list, request_queue)
    crawler = BasicCrawler(request_manager=tandem)
    handled_urls: set[str] = set()

    @crawler.router.default_handler
    async def record_url(context: BasicCrawlingContext) -> None:
        handled_urls.add(context.request.url)

    await crawler.run()

    assert handled_urls == set(queue_urls) | set(list_urls)


async def test_processes_requests_from_run_args() -> None:
    """Test that the crawler handles the requests passed directly to `run`, in order."""
    urls = ('https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com')

    crawler = BasicCrawler()
    handled_urls: list[str] = []

    @crawler.router.default_handler
    async def record_url(context: BasicCrawlingContext) -> None:
        handled_urls.append(context.request.url)

    await crawler.run(list(urls))

    assert handled_urls == list(urls)


async def test_allows_multiple_run_calls() -> None:
    """Test that one crawler can be run twice and handles the same URLs again in the second run."""
    urls = ('https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com')
    run_count = 2

    crawler = BasicCrawler()
    handled_urls: list[str] = []

    @crawler.router.default_handler
    async def record_url(context: BasicCrawlingContext) -> None:
        handled_urls.append(context.request.url)

    for _ in range(run_count):
        await crawler.run(list(urls))

    # Every run is expected to contribute the full sequence of URLs once more.
    assert handled_urls == list(urls) * run_count


async def test_retries_failed_requests() -> None:
    """Test that a request whose handler crashes is handled again after the remaining requests."""
    url_a = 'https://a.placeholder.com'
    url_b = 'https://b.placeholder.com'
    url_c = 'https://c.placeholder.com'

    crawler = BasicCrawler()
    handled_urls: list[str] = []

    @crawler.router.default_handler
    async def crash_on_b(context: BasicCrawlingContext) -> None:
        current_url = context.request.url
        handled_urls.append(current_url)

        if current_url == url_b:
            raise RuntimeError('Arbitrary crash for testing purposes')

    await crawler.run([url_a, url_b, url_c])

    # One pass over all URLs, followed by three more attempts for the crashing one.
    assert handled_urls == [url_a, url_b, url_c] + [url_b] * 3


async def test_respects_no_retry() -> None:
    """Test that a request created with `no_retry=True` is not retried, unlike the other failing requests."""
    url_a = 'https://a.placeholder.com'
    url_b = 'https://b.placeholder.com'
    url_c = 'https://c.placeholder.com'

    crawler = BasicCrawler(max_request_retries=2)
    handled_urls: list[str] = []

    @crawler.router.default_handler
    async def always_crash(context: BasicCrawlingContext) -> None:
        handled_urls.append(context.request.url)
        raise RuntimeError('Arbitrary crash for testing purposes')

    start_requests = [url_a, url_b, Request.from_url(url=url_c, no_retry=True)]
    await crawler.run(start_requests)

    # `url_c` shows up only in the first pass; the other two URLs are attempted two more times.
    assert handled_urls == [url_a, url_b, url_c, url_a, url_b, url_a, url_b]


async def test_respects_request_specific_max_retries() -> None:
    """Test that `maxRetries` set in the request's `__crawlee` user data takes precedence over the crawler's limit."""
    url_a = 'https://a.placeholder.com'
    url_b = 'https://b.placeholder.com'
    url_c = 'https://c.placeholder.com'

    # The crawler itself does not allow any retries.
    crawler = BasicCrawler(max_request_retries=0)
    handled_urls: list[str] = []

    @crawler.router.default_handler
    async def always_crash(context: BasicCrawlingContext) -> None:
        handled_urls.append(context.request.url)
        raise RuntimeError('Arbitrary crash for testing purposes')

    request_with_own_limit = Request.from_url(url=url_c, user_data={'__crawlee': {'maxRetries': 1}})
    await crawler.run([url_a, url_b, request_with_own_limit])

    # Only the request carrying its own limit is attempted a second time.
    assert handled_urls == [url_a, url_b, url_c, url_c]


async def test_calls_error_handler() -> None:
    """Test that the error handler is called with the failing request's context and the raised error."""
    failing_url = 'https://b.placeholder.com'

    @dataclass(frozen=True)
    class ErrorHandlerCall:
        # Snapshot of a single invocation of the error handler.
        url: str
        error: Exception

    recorded_calls: list[ErrorHandlerCall] = []

    crawler = BasicCrawler(max_request_retries=2)

    @crawler.router.default_handler
    async def crash_on_failing_url(context: BasicCrawlingContext) -> None:
        if context.request.url == failing_url:
            raise RuntimeError('Arbitrary crash for testing purposes')

    @crawler.error_handler
    async def record_error(context: BasicCrawlingContext, error: Exception) -> Request:
        recorded_calls.append(ErrorHandlerCall(url=context.request.url, error=error))
        return context.request

    await crawler.run(['https://a.placeholder.com', failing_url, 'https://c.placeholder.com'])

    # The error handler is expected to be called twice, both times for the failing URL and its RuntimeError.
    assert len(recorded_calls) == 2
    for recorded in recorded_calls:
        assert recorded.url == failing_url
        assert isinstance(recorded.error, RuntimeError)


async def test_calls_error_handler_for_session_errors() -> None:
    """Test that the error handler is called once when the request handler raises `SessionError`."""
    crawler = BasicCrawler(max_session_rotations=1)

    @crawler.router.default_handler
    async def raise_session_error(context: BasicCrawlingContext) -> None:
        raise SessionError('Arbitrary session error for testing purposes')

    error_handler_spy = AsyncMock()

    @crawler.error_handler
    async def forward_to_spy(context: BasicCrawlingContext, error: Exception) -> None:
        # Delegate to the mock so that the number of invocations can be checked afterwards.
        await error_handler_spy(context, error)

    await crawler.run(['https://crawlee.dev'])

    assert error_handler_spy.call_count == 1


async def test_handles_error_in_error_handler() -> None:
    """Test that a crash inside the error handler makes `run` raise `UserDefinedErrorHandlerError`."""
    failing_url = 'https://b.placeholder.com'
    crawler = BasicCrawler(max_request_retries=3)

    @crawler.router.default_handler
    async def crash_on_failing_url(context: BasicCrawlingContext) -> None:
        if context.request.url == failing_url:
            raise RuntimeError('Arbitrary crash for testing purposes')

    @crawler.error_handler
    async def crashing_error_handler(context: BasicCrawlingContext, error: Exception) -> None:
        raise RuntimeError('Crash in error handler')

    start_urls = ['https://a.placeholder.com', failing_url, 'https://c.placeholder.com']
    with pytest.raises(UserDefinedErrorHandlerError):
        await crawler.run(start_urls)


async def test_calls_failed_request_handler() -> None:
    """Test that the failed request handler is called exactly once, for the request that kept crashing."""
    failing_url = 'https://b.placeholder.com'
    crawler = BasicCrawler(max_request_retries=3)
    recorded_failures: list[tuple[BasicCrawlingContext, Exception]] = []

    @crawler.router.default_handler
    async def crash_on_failing_url(context: BasicCrawlingContext) -> None:
        if context.request.url == failing_url:
            raise RuntimeError('Arbitrary crash for testing purposes')

    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        recorded_failures.append((context, error))

    await crawler.run(['https://a.placeholder.com', failing_url, 'https://c.placeholder.com'])

    assert len(recorded_failures) == 1
    failed_context, failure = recorded_failures[0]
    assert failed_context.request.url == failing_url
    assert isinstance(failure, RuntimeError)


@pytest.mark.parametrize('handler', ['failed_request_handler', 'error_handler'])
async def test_handlers_use_context_helpers(tmp_path: Path, handler: str) -> None:
    """Test that context helpers called from `failed_request_handler` or `error_handler` affect the storages.

    `handler` is the name of the crawler decorator used to register the handler under test.
    """
    # Crawler backed by file system storage located in the temporary directory.
    file_system_client = FileSystemStorageClient()
    crawler = BasicCrawler(
        max_request_retries=1,
        storage_client=file_system_client,
        configuration=Configuration(storage_dir=str(tmp_path)),
    )

    # Values written through the context helpers and checked at the end.
    queue_alias = 'other'
    pushed_item = {'some': 'data'}
    stored_key = 'key'
    stored_value = 'value'
    extra_request = Request.from_url('https://d.placeholder.com')

    # The request handler always fails so that the handler under test gets invoked.
    @crawler.router.default_handler
    async def always_crash(context: BasicCrawlingContext) -> None:
        raise RuntimeError('Arbitrary crash for testing purposes')

    async def use_context_helpers(context: BasicCrawlingContext, error: Exception) -> None:
        await context.push_data(pushed_item)
        await context.add_requests(requests=[extra_request], rq_alias=queue_alias)
        handler_kvs = await context.get_key_value_store()
        await handler_kvs.set_value(stored_key, stored_value)

    # Register the function through whichever decorator is being tested.
    getattr(crawler, handler)(use_context_helpers)

    await crawler.run(['https://b.placeholder.com'])

    # Open the storages with the same client and check that every helper call left a trace.
    dataset = await Dataset.open(storage_client=file_system_client)
    key_value_store = await KeyValueStore.open(storage_client=file_system_client)
    aliased_queue = await RequestQueue.open(alias=queue_alias, storage_client=file_system_client)

    assert stored_value == await key_value_store.get_value(stored_key)
    assert [pushed_item] == (await dataset.get_data()).items
    assert extra_request == await aliased_queue.fetch_next_request()


async def test_handles_error_in_failed_request_handler() -> None:
    """Test that a crash inside the failed request handler makes `run` raise `UserDefinedErrorHandlerError`."""
    failing_url = 'https://b.placeholder.com'
    crawler = BasicCrawler(max_request_retries=3)

    @crawler.router.default_handler
    async def crash_on_failing_url(context: BasicCrawlingContext) -> None:
        if context.request.url == failing_url:
            raise RuntimeError('Arbitrary crash for testing purposes')

    @crawler.failed_request_handler
    async def crashing_failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None:
        raise RuntimeError('Crash in failed request handler')

    start_urls = ['https://a.placeholder.com', failing_url, 'https://c.placeholder.com']
    with pytest.raises(UserDefinedErrorHandlerError):
        await crawler.run(start_urls)


@pytest.mark.parametrize(
    ('method', 'path', 'payload'),
    [
        pytest.param('GET', 'get', None, id='get send_request'),
        pytest.param('POST', 'post', b'Hello, world!', id='post send_request'),
    ],
)
async def test_send_request_works(server_url: URL, method: HttpMethod, path: str, payload: None | bytes) -> None:
    """Test that `context.send_request` sends the request to the test server and returns its JSON response."""
    captured: dict[str, Any] = {}
    target_url = str(server_url / path)

    crawler = BasicCrawler(max_request_retries=3)

    @crawler.router.default_handler
    async def call_test_server(context: BasicCrawlingContext) -> None:
        response = await context.send_request(target_url, method=method, payload=payload)

        raw_body = await response.read()
        captured['body'] = json.loads(raw_body)
        captured['headers'] = response.headers

    await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])

    # The 'data' field of the response body is compared with the decoded payload (None when nothing was sent).
    expected_data = payload.decode() if payload else None

    body = captured.get('body')
    assert body is not None
    assert body.get('data') == expected_data

    headers = captured.get('headers')
    assert headers is not None
    content_type = headers.get('content-type')
    assert content_type is not None
    assert content_type == 'application/json'


@dataclass
class AddRequestsTestInput:
    """Input of a single parametrized scenario of `test_enqueue_strategy`."""

    # URL of the initial request, which is routed to the 'start' handler.
    start_url: str
    # Value assigned to `request.loaded_url` in the 'start' handler before the requests are added.
    loaded_url: str
    # Requests passed to `context.add_requests` from the 'start' handler.
    requests: Sequence[str | Request]
    # URLs expected to reach the default handler; compared as a set.
    expected_urls: Sequence[str]
    # Extra keyword arguments forwarded to `context.add_requests`.
    kwargs: EnqueueLinksKwargs


# URLs for the enqueue strategy scenarios of `test_enqueue_strategy`. The scenarios refer to them by index:
# index 0 is used as the loaded URL and index 3 as the start URL of the redirect scenarios.
STRATEGY_TEST_URLS = (
    'https://someplace.com/',
    'http://someplace.com/index.html',  # same hostname as index 0, different scheme
    'https://blog.someplace.com/index.html',  # subdomain of someplace.com
    'https://redirect.someplace.com',  # another subdomain of someplace.com
    'https://other.place.com/index.html',  # different domain
    'https://someplace.jp/',  # same name under a different top-level domain
)

# URLs for the include/exclude scenarios of `test_enqueue_strategy`, all on the same host.
INCLUDE_TEST_URLS = (
    'https://someplace.com/',
    'https://someplace.com/blog/category/cats',
    'https://someplace.com/blog/category/boots',
    'https://someplace.com/blog/archive/index.html',
    'https://someplace.com/blog/archive/cats',
)


@pytest.mark.parametrize(
    'test_input',
    argvalues=[
        # Basic use case
        pytest.param(
            AddRequestsTestInput(
                start_url='https://a.placeholder.com',
                loaded_url='https://a.placeholder.com',
                requests=[
                    'https://a.placeholder.com',
                    Request.from_url('https://b.placeholder.com'),
                    'https://c.placeholder.com',
                ],
                kwargs={},
                expected_urls=['https://b.placeholder.com', 'https://c.placeholder.com'],
            ),
            id='basic',
        ),
        # Enqueue strategy
        pytest.param(
            AddRequestsTestInput(
                start_url=STRATEGY_TEST_URLS[0],
                loaded_url=STRATEGY_TEST_URLS[0],
                requests=STRATEGY_TEST_URLS,
                kwargs=EnqueueLinksKwargs(),
                expected_urls=STRATEGY_TEST_URLS[1:],
            ),
            id='enqueue_strategy_default',
        ),
        pytest.param(
            AddRequestsTestInput(
                start_url=STRATEGY_TEST_URLS[0],
                loaded_url=STRATEGY_TEST_URLS[0],
                requests=STRATEGY_TEST_URLS,
                kwargs=EnqueueLinksKwargs(strategy='all'),
                expected_urls=STRATEGY_TEST_URLS[1:],
            ),
            id='enqueue_strategy_all',
        ),
        pytest.param(
            AddRequestsTestInput(
                start_url=STRATEGY_TEST_URLS[0],
                loaded_url=STRATEGY_TEST_URLS[0],
                requests=STRATEGY_TEST_URLS,
                kwargs=EnqueueLinksKwargs(strategy='same-domain'),
                expected_urls=STRATEGY_TEST_URLS[1:4],
            ),
            id='enqueue_strategy_same_domain',
        ),
        pytest.param(
            AddRequestsTestInput(
                start_url=STRATEGY_TEST_URLS[0],
                loaded_url=STRATEGY_TEST_URLS[0],
                requests=STRATEGY_TEST_URLS,
                kwargs=EnqueueLinksKwargs(strategy='same-hostname'),
                expected_urls=[STRATEGY_TEST_URLS[1]],
            ),
            id='enqueue_strategy_same_hostname',
        ),
        pytest.param(
            AddRequestsTestInput(
                start_url=STRATEGY_TEST_URLS[0],
                loaded_url=STRATEGY_TEST_URLS[0],
                requests=STRATEGY_TEST_URLS,
                kwargs=EnqueueLinksKwargs(strategy='same-origin'),
                expected_urls=[],
            ),
            id='enqueue_strategy_same_origin',
        ),
        # Enqueue strategy with redirect
        pytest.param(
            AddRequestsTestInput(
                start_url=STRATEGY_TEST_URLS[3],
                loaded_url=STRATEGY_TEST_URLS[0],
                requests=STRATEGY_TEST_URLS,
                kwargs=EnqueueLinksKwargs(),
                expected_urls=STRATEGY_TEST_URLS[:3] + STRATEGY_TEST_URLS[4:],
            ),
            id='redirect_enqueue_strategy_default',
        ),
        pytest.param(
            AddRequestsTestInput(
                start_url=STRATEGY_TEST_URLS[3],
                loaded_url=STRATEGY_TEST_URLS[0],
                requests=STRATEGY_TEST_URLS,
                kwargs=EnqueueLinksKwargs(strategy='all'),
                expected_urls=STRATEGY_TEST_URLS[:3] + STRATEGY_TEST_URLS[4:],
            ),
            id='redirect_enqueue_strategy_all',
        ),
        pytest.param(
            AddRequestsTestInput(
                start_url=STRATEGY_TEST_URLS[3],
                loaded_url=STRATEGY_TEST_URLS[0],
                requests=STRATEGY_TEST_URLS,
                kwargs=EnqueueLinksKwargs(strategy='same-domain'),
                expected_urls=STRATEGY_TEST_URLS[:3],
            ),
            id='redirect_enqueue_strategy_same_domain',
        ),
        pytest.param(
            AddRequestsTestInput(
                start_url=STRATEGY_TEST_URLS[3],
                loaded_url=STRATEGY_TEST_URLS[0],
                requests=STRATEGY_TEST_URLS,
                kwargs=EnqueueLinksKwargs(strategy='same-hostname'),
                expected_urls=[],
            ),
            id='redirect_enqueue_strategy_same_hostname',
        ),
        pytest.param(
            AddRequestsTestInput(
                start_url=STRATEGY_TEST_URLS[3],
                loaded_url=STRATEGY_TEST_URLS[0],
                requests=STRATEGY_TEST_URLS,
                kwargs=EnqueueLinksKwargs(strategy='same-origin'),
                expected_urls=[],
            ),
            id='redirect_enqueue_strategy_same_origin',
        ),
        # Include/exclude
        pytest.param(
            AddRequestsTestInput(
                start_url=INCLUDE_TEST_URLS[0],
                loaded_url=INCLUDE_TEST_URLS[0],
                requests=INCLUDE_TEST_URLS,
                kwargs=EnqueueLinksKwargs(include=[Glob('https://someplace.com/**/cats')]),
                expected_urls=[INCLUDE_TEST_URLS[1], INCLUDE_TEST_URLS[4]],
            ),
            id='include_exclude_1',
        ),
        pytest.param(
            AddRequestsTestInput(
                start_url=INCLUDE_TEST_URLS[0],
                loaded_url=INCLUDE_TEST_URLS[0],
                requests=INCLUDE_TEST_URLS,
                kwargs=EnqueueLinksKwargs(exclude=[Glob('https://someplace.com/**/cats')]),
                expected_urls=[INCLUDE_TEST_URLS[2], INCLUDE_TEST_URLS[3]],
            ),
            id='include_exclude_2',
        ),
        pytest.param(
            AddRequestsTestInput(
                start_url=INCLUDE_TEST_URLS[0],
                loaded_url=INCLUDE_TEST_URLS[0],
                requests=INCLUDE_TEST_URLS,
                kwargs=EnqueueLinksKwargs(
                    include=[Glob('https://someplace.com/**/cats')], exclude=[Glob('https://**/archive/**')]
                ),
                expected_urls=[INCLUDE_TEST_URLS[1]],
            ),
            id='include_exclude_3',
        ),
    ],
)
async def test_enqueue_strategy(test_input: AddRequestsTestInput) -> None:
    """Test which of the requests added via `context.add_requests` end up being handled.

    The start request is routed to the 'start' handler, which adds `test_input.requests` with
    `test_input.kwargs`. Every other request reaches the default handler, where its URL is recorded. The set
    of recorded URLs must equal `test_input.expected_urls`.
    """
    visit = Mock()

    crawler = BasicCrawler()

    @crawler.router.handler('start')
    async def start_handler(context: BasicCrawlingContext) -> None:
        # Assign test value to loaded_url - BasicCrawler does not do any navigation by itself
        context.request.loaded_url = test_input.loaded_url
        await context.add_requests(
            test_input.requests,
            **test_input.kwargs,
        )

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        visit(context.request.url)

    await crawler.run([Request.from_url(test_input.start_url, label='start')])

    # Read the single positional argument of every recorded `visit` call. The loop variable deliberately
    # avoids the name `call`, which this module imports from `unittest.mock`.
    visited = {recorded_call.args[0] for recorded_call in visit.call_args_list}
    assert visited == set(test_input.expected_urls)


async def test_session_rotation(server_url: URL) -> None:
    """Test that a handler raising `SessionError` is called again with a different session each time."""
    rotation_limit = 7
    seen_session_ids: list[str | None] = []

    crawler = BasicCrawler(max_session_rotations=rotation_limit, max_request_retries=1)

    @crawler.router.default_handler
    async def raise_session_error(context: BasicCrawlingContext) -> None:
        session_id = context.session.id if context.session else None
        seen_session_ids.append(session_id)
        raise SessionError('Test error')

    await crawler.run([str(server_url)])

    # The handler ran exactly `rotation_limit` times...
    assert len(seen_session_ids) == rotation_limit

    # ...always with a session attached...
    assert None not in seen_session_ids

    # ...and no session was used twice.
    assert len(set(seen_session_ids)) == rotation_limit


async def test_final_statistics() -> None:
    """Crawl 50 URLs with scripted handler crashes and verify the resulting final statistics."""
    crawler = BasicCrawler(max_request_retries=2)

    # Maps a retry count to (divisor, error message): on that attempt the handler crashes
    # whenever the request id is divisible by the divisor.
    crash_plan = {
        0: (2, 'First crash'),
        1: (3, 'Second crash'),
        2: (4, 'Third crash'),
    }

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        raw_id = context.request.get_query_param_from_url('id')
        assert raw_id is not None
        request_id = int(raw_id)

        await asyncio.sleep(0.001)

        planned_crash = crash_plan.get(context.request.retry_count)
        if planned_crash is None:
            return

        divisor, message = planned_crash
        if request_id % divisor == 0:
            raise RuntimeError(message)

    start_requests = [
        Request.from_url(f'https://someplace.com/?id={request_id}', label='start') for request_id in range(50)
    ]
    stats = await crawler.run(start_requests)

    # Ids divisible by 2, 3 and 4 (i.e. by 12) crash on every attempt: 0, 12, 24, 36 and 48.
    assert stats.requests_total == 50
    assert stats.requests_finished == 45
    assert stats.requests_failed == 5

    assert stats.retry_histogram == [25, 16, 9]

    zero_duration = timedelta()

    assert stats.request_avg_finished_duration is not None
    assert stats.request_avg_finished_duration > zero_duration

    assert stats.request_avg_failed_duration is not None
    assert stats.request_avg_failed_duration > zero_duration

    assert stats.request_total_duration > zero_duration

    assert stats.crawler_runtime > zero_duration

    assert stats.requests_finished_per_minute > 0
    assert stats.requests_failed_per_minute > 0


async def test_crawler_get_storages() -> None:
    """Check that each storage getter of the crawler returns an instance of the expected storage class."""
    crawler = BasicCrawler()

    assert isinstance(await crawler.get_request_manager(), RequestQueue)
    assert isinstance(await crawler.get_dataset(), Dataset)
    assert isinstance(await crawler.get_key_value_store(), KeyValueStore)


async def test_crawler_run_requests() -> None:
    """Check that `run` handles the given start URLs in order and counts them in the returned statistics."""
    start_urls = [
        'http://test.io/1',
        'http://test.io/2',
        'http://test.io/3',
    ]
    handled_urls: list[str] = []
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def collect_url(context: BasicCrawlingContext) -> None:
        handled_urls.append(context.request.url)

    final_stats = await crawler.run(start_urls)

    assert handled_urls == start_urls
    assert final_stats.requests_total == 3
    assert final_stats.requests_finished == 3


async def test_context_push_and_get_data() -> None:
    """Check that `crawler.get_data` returns items pushed straight to the dataset and via the handler context."""
    crawler = BasicCrawler()
    dataset = await Dataset.open()

    await dataset.push_data({'a': 1})
    data_after_first_push = await crawler.get_data()
    assert data_after_first_push.items == [{'a': 1}]

    @crawler.router.default_handler
    async def push_from_handler(context: BasicCrawlingContext) -> None:
        await context.push_data({'b': 2})

    # Registering the handler alone pushes nothing; only the direct push below adds an item.
    await dataset.push_data({'c': 3})
    data_before_run = await crawler.get_data()
    assert data_before_run.items == [{'a': 1}, {'c': 3}]

    run_stats = await crawler.run(['http://test.io/1'])

    data_after_run = await crawler.get_data()
    assert data_after_run.items == [{'a': 1}, {'c': 3}, {'b': 2}]
    assert run_stats.requests_total == 1
    assert run_stats.requests_finished == 1


async def test_context_push_and_get_data_handler_error() -> None:
    """Check that data pushed by a handler that subsequently raises does not end up in the dataset."""
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def push_then_crash(context: BasicCrawlingContext) -> None:
        await context.push_data({'b': 2})
        raise RuntimeError('Watch me crash')

    run_stats = await crawler.run(['https://a.placeholder.com'])

    stored_data = await crawler.get_data()
    assert stored_data.items == []

    assert run_stats.requests_total == 1
    assert run_stats.requests_finished == 0
    assert run_stats.requests_failed == 1


async def test_crawler_push_and_export_data(tmp_path: Path) -> None:
    """Check that `crawler.export_data` writes items pushed directly to the dataset as JSON and CSV files.

    Args:
        tmp_path: Temporary directory (pytest fixture) that receives the exported files.
    """
    crawler = BasicCrawler()
    dataset = await Dataset.open()

    await dataset.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
    await dataset.push_data({'id': 2, 'test': 'test'})

    json_path = tmp_path / 'dataset.json'
    csv_path = tmp_path / 'dataset.csv'

    await crawler.export_data(path=json_path)
    await crawler.export_data(path=csv_path)

    # Read through a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector (which emits a ResourceWarning).
    with json_path.open() as json_file:
        exported_items = json.load(json_file)

    assert exported_items == [
        {'id': 0, 'test': 'test'},
        {'id': 1, 'test': 'test'},
        {'id': 2, 'test': 'test'},
    ]

    # On Windows, text mode file writes convert \n to \r\n, resulting in \r\n line endings.
    # On Unix/Linux, \n remains as \n.
    if sys.platform == 'win32':
        assert csv_path.read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
    else:
        assert csv_path.read_bytes() == b'id,test\n0,test\n1,test\n2,test\n'


async def test_crawler_export_data_additional_kwargs(tmp_path: Path) -> None:
    """Check that extra keyword arguments passed to `export_data` shape the produced JSON and CSV output."""
    crawler = BasicCrawler()
    dataset = await Dataset.open()
    await dataset.push_data({'z': 1, 'a': 2})

    json_export = tmp_path / 'dataset.json'
    csv_export = tmp_path / 'dataset.csv'

    await crawler.export_data(path=json_export, sort_keys=True, separators=(',', ':'))
    await crawler.export_data(path=csv_export, delimiter=';', lineterminator='\n')

    # Keys come out sorted and without whitespace around the separators.
    json_text = json_export.read_text()
    assert json_text == '[{"a":2,"z":1}]'

    # Columns keep the pushed order and are joined by the custom delimiter.
    csv_text = csv_export.read_text()
    assert csv_text == 'z;a\n1;2\n'


async def test_context_push_and_export_data(tmp_path: Path) -> None:
    """Check that items pushed through the handler context are written by `crawler.export_data` as JSON and CSV.

    Args:
        tmp_path: Temporary directory (pytest fixture) that receives the exported files.
    """
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
        await context.push_data({'id': 2, 'test': 'test'})

    await crawler.run(['http://test.io/1'])

    json_path = tmp_path / 'dataset.json'
    csv_path = tmp_path / 'dataset.csv'

    await crawler.export_data(path=json_path)
    await crawler.export_data(path=csv_path)

    # Read through a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector (which emits a ResourceWarning).
    with json_path.open() as json_file:
        exported_items = json.load(json_file)

    assert exported_items == [
        {'id': 0, 'test': 'test'},
        {'id': 1, 'test': 'test'},
        {'id': 2, 'test': 'test'},
    ]

    # On Windows, text mode file writes convert \n to \r\n, resulting in \r\n line endings.
    # On Unix/Linux, \n remains as \n.
    if sys.platform == 'win32':
        assert csv_path.read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
    else:
        assert csv_path.read_bytes() == b'id,test\n0,test\n1,test\n2,test\n'


async def test_context_update_kv_store() -> None:
    """Check that a value written via the context's key-value store can be read through the crawler afterwards."""
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def write_value(context: BasicCrawlingContext) -> None:
        handler_kvs = await context.get_key_value_store()
        await handler_kvs.set_value('foo', 'bar')

    await crawler.run(['https://hello.world'])

    crawler_kvs = await crawler.get_key_value_store()
    stored_value = await crawler_kvs.get_value('foo')
    assert stored_value == 'bar'


async def test_context_use_state() -> None:
    """Check that state initialized via `context.use_state` is stored in the crawler's key-value store."""
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def init_state(context: BasicCrawlingContext) -> None:
        await context.use_state({'hello': 'world'})

    await crawler.run(['https://hello.world'])

    # The state record is looked up under the crawler state key suffixed with `_0`.
    state_key = f'{BasicCrawler._CRAWLEE_STATE_KEY}_0'
    crawler_kvs = await crawler.get_key_value_store()
    persisted_state = await crawler_kvs.get_value(state_key)

    assert persisted_state == {'hello': 'world'}


async def test_crawler_use_state() -> None:
    """Check that state set through `crawler.use_state` is what a request handler gets from `context.use_state`."""
    crawler = BasicCrawler()
    await crawler.use_state({'hello': 'world'})

    @crawler.router.default_handler
    async def read_state(context: BasicCrawlingContext) -> None:
        # No default is passed here, so the value must come from the state set on the crawler above.
        handler_state = await context.use_state()
        assert handler_state == {'hello': 'world'}

    await crawler.run(['https://hello.world'])


async def test_context_use_state_crawlers_share_state() -> None:
    """Check that two crawlers created with the same `id` append to the same `use_state` record."""
    shared_id = 0

    async def record_url(context: BasicCrawlingContext) -> None:
        state = await context.use_state({'urls': []})
        recorded_urls = state['urls']
        assert isinstance(recorded_urls, list)
        recorded_urls.append(context.request.url)

    first_crawler = BasicCrawler(id=shared_id, request_handler=record_url)
    second_crawler = BasicCrawler(id=shared_id, request_handler=record_url)

    await first_crawler.run(['https://a.com'])
    await second_crawler.run(['https://b.com'])

    kvs = await KeyValueStore.open()
    assert first_crawler._id == second_crawler._id == 0

    state_key = f'{BasicCrawler._CRAWLEE_STATE_KEY}_{first_crawler._id}'
    persisted_state = await kvs.get_value(state_key)
    # URLs from both runs are present in the single shared record.
    assert persisted_state == {'urls': ['https://a.com', 'https://b.com']}


async def test_crawlers_share_stats() -> None:
    """Check that a crawler given another crawler's `statistics` object accumulates into the same counters."""

    async def touch_state(context: BasicCrawlingContext) -> None:
        await context.use_state({'urls': []})

    first_crawler = BasicCrawler(id=0, request_handler=touch_state)
    second_crawler = BasicCrawler(id=0, request_handler=touch_state, statistics=first_crawler.statistics)

    first_result = await first_crawler.run(['https://a.com'])
    second_result = await second_crawler.run(['https://b.com'])

    assert first_crawler.statistics == second_crawler.statistics
    assert first_result.requests_finished == 1
    # The second run's count includes the request finished by the first crawler.
    assert second_result.requests_finished == 2


async def test_context_use_state_crawlers_own_state() -> None:
    """Check that crawlers created without an explicit `id` keep their `use_state` records separate."""

    async def record_url(context: BasicCrawlingContext) -> None:
        state = await context.use_state({'urls': []})
        recorded_urls = state['urls']
        assert isinstance(recorded_urls, list)
        recorded_urls.append(context.request.url)

    first_crawler = BasicCrawler(request_handler=record_url)
    second_crawler = BasicCrawler(request_handler=record_url)

    await first_crawler.run(['https://a.com'])
    await second_crawler.run(['https://b.com'])

    kvs = await KeyValueStore.open()

    # The first crawler's record is expected under suffix `_0`, the second one's under `_1`.
    for key_suffix, expected_url in enumerate(['https://a.com', 'https://b.com']):
        persisted_state = await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_{key_suffix}')
        assert persisted_state == {'urls': [expected_url]}


async def test_context_handlers_use_state(key_value_store: KeyValueStore) -> None:
    """Check that state from `context.use_state` is shared between handlers and across consecutive runs."""
    # Snapshot of the state as seen by each handler at the moment it obtained it.
    observed: dict[str, dict[str, JsonSerializable]] = {'one': {}, 'two': {}, 'three': {}}
    default_state = {'hello': 'world'}

    crawler = BasicCrawler()

    @crawler.router.handler('one')
    async def handler_one(context: BasicCrawlingContext) -> None:
        state = await context.use_state({'hello': 'world'})
        observed['one'].update(state)
        state['hello'] = 'new_world'
        follow_up = Request.from_url('https://crawlee.dev/docs/quick-start', label='two')
        await context.add_requests([follow_up])

    @crawler.router.handler('two')
    async def handler_two(context: BasicCrawlingContext) -> None:
        state = await context.use_state({'hello': 'world'})
        observed['two'].update(state)
        state['hello'] = 'last_world'

    @crawler.router.handler('three')
    async def handler_three(context: BasicCrawlingContext) -> None:
        state = await context.use_state({'hello': 'world'})
        observed['three'].update(state)

    first_run_start = Request.from_url('https://crawlee.dev/', label='one')
    await crawler.run([first_run_start])

    second_run_start = Request.from_url('https://crawlee.dev/docs/examples', label='three')
    await crawler.run([second_run_start])

    # handler_one saw the default state.
    assert observed['one'] == default_state

    # handler_two saw the modification made by handler_one.
    assert observed['two'] == {'hello': 'new_world'}

    # handler_three, executed in the second run, saw the last value written during the first run.
    assert observed['three'] == {'hello': 'last_world'}

    # The key-value store holds the most recently written state.
    crawler_kvs = await crawler.get_key_value_store()
    persisted_state = await crawler_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')
    assert persisted_state == {'hello': 'last_world'}


@pytest.mark.parametrize(
    'use_failed_requests',
    [
        pytest.param(True, id='failed requests'),
        pytest.param(False, id='finished requests'),
    ],
)
async def test_max_requests_per_crawl(*, use_failed_requests: bool) -> None:
    """Check that the crawl stops after `max_requests_per_crawl` requests, whether they finish or fail."""
    request_limit = 3
    start_urls = [
        'http://test.io/1',
        'http://test.io/2',
        'http://test.io/3',
        'http://test.io/4',
        'http://test.io/5',
    ]
    handled_urls: list[str] = []

    # A single concurrent task keeps the number of handled requests deterministic.
    sequential = ConcurrencySettings(desired_concurrency=1, max_concurrency=1)
    crawler = BasicCrawler(concurrency_settings=sequential, max_requests_per_crawl=request_limit)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        if use_failed_requests:
            raise RuntimeError('Arbitrary crash for testing purposes')
        handled_urls.append(context.request.url)

    run_stats = await crawler.run(start_urls)

    # Only 3 of the 5 supplied URLs are expected to be processed.
    if not use_failed_requests:
        assert len(handled_urls) == 3
        assert run_stats.requests_finished == 3
    assert run_stats.requests_total == 3


async def test_max_crawl_depth() -> None:
    """Check that a request enqueued from a request already at `max_crawl_depth` is not processed."""
    depth_limit = 2
    handled_urls: list[str] = []

    # Process requests one at a time so the outcome does not depend on scheduling.
    sequential = ConcurrencySettings(desired_concurrency=1, max_concurrency=1)
    crawler = BasicCrawler(concurrency_settings=sequential, max_crawl_depth=depth_limit)

    @crawler.router.handler('start')
    async def start_handler(context: BasicCrawlingContext) -> None:
        handled_urls.append(context.request.url)
        await context.add_requests(['https://someplace.com/too-deep'])

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        handled_urls.append(context.request.url)

    # The start request is placed at depth 2, the same value as `max_crawl_depth`.
    start_request = Request.from_url('https://someplace.com/', label='start')
    start_request.crawl_depth = 2

    run_stats = await crawler.run([start_request])

    # Only the start request was handled; the request added from it was not.
    assert len(handled_urls) == 1
    assert run_stats.requests_total == 1
    assert run_stats.requests_finished == 1


@pytest.mark.parametrize(
    ('total_requests', 'fail_at_request', 'expected_starts', 'expected_finished'),
    [
        (3, None, 3, 3),
        (3, 2, 2, 1),
    ],
    ids=[
        'all_requests_successful',
        'abort_on_second_request',
    ],
)
async def test_abort_on_error(
    total_requests: int, fail_at_request: int | None, expected_starts: int, expected_finished: int
) -> None:
    """Check that with `abort_on_error=True` a handler error stops the crawl before further requests start."""
    started_urls: list[str] = []

    # Requests are handled one by one so the number of started handlers is deterministic.
    sequential = ConcurrencySettings(desired_concurrency=1, max_concurrency=1)
    crawler = BasicCrawler(concurrency_settings=sequential, abort_on_error=True)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        started_urls.append(context.request.url)

        request_number = context.request.user_data.get('n_request')
        if request_number == fail_at_request:
            raise ValueError('Error request')

    # All requests target the same URL; `always_enqueue=True` is passed together with a 1-based ordinal.
    numbered_requests = [
        Request.from_url('https://crawlee.dev', always_enqueue=True, user_data={'n_request': ordinal})
        for ordinal in range(1, total_requests + 1)
    ]
    run_stats = await crawler.run(numbered_requests)

    assert len(started_urls) == expected_starts
    assert run_stats.requests_finished == expected_finished


def test_crawler_log() -> None:
    """Check that `crawler.log` is a standard `logging.Logger` that accepts log calls."""
    crawler = BasicCrawler()
    crawler_logger = crawler.log

    assert isinstance(crawler_logger, logging.Logger)
    crawler_logger.info('Test log message')


async def test_consecutive_runs_purge_request_queue() -> None:
    """Check that the same URLs are handled again in each of three consecutive runs of one crawler."""
    run_count = 3
    url_recorder = Mock()
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def record_visit(context: BasicCrawlingContext) -> None:
        url_recorder(context.request.url)

    for _ in range(run_count):
        await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])

    # Every URL must have been visited once per run.
    visits_per_url = Counter(recorded.args[0] for recorded in url_recorder.call_args_list)
    assert visits_per_url == {
        'https://a.placeholder.com': 3,
        'https://b.placeholder.com': 3,
        'https://c.placeholder.com': 3,
    }


@pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI')
@pytest.mark.parametrize(
    'statistics_log_format',
    [
        pytest.param('table', id='With table for logs'),
        pytest.param('inline', id='With inline logs'),
    ],
)
async def test_logs_final_statistics(
    monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture, statistics_log_format: Literal['table', 'inline']
) -> None:
    """Check how the final statistics are logged for the `table` and `inline` log formats."""
    # The final statistics are looked up among captured records, so capture from INFO level up.
    caplog.set_level(logging.INFO)

    crawler = BasicCrawler(configure_logging=False, statistics_log_format=statistics_log_format)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        await context.push_data({'something': 'something'})

    fake_statistics = FinalStatistics(
        requests_finished=4,
        requests_failed=33,
        retry_histogram=[1, 4, 8],
        request_avg_failed_duration=timedelta(seconds=99),
        request_avg_finished_duration=timedelta(milliseconds=483),
        requests_finished_per_minute=0.33,
        requests_failed_per_minute=0.1,
        request_total_duration=timedelta(minutes=12),
        requests_total=37,
        crawler_runtime=timedelta(minutes=5),
    )

    # Make the crawler report the canned statistics instead of the computed ones.
    monkeypatch.setattr(crawler._statistics, 'calculate', lambda: fake_statistics)

    run_result = await crawler.run()
    assert run_result is fake_statistics

    # Pick the first captured record whose message starts with 'Final'.
    final_record = None
    for candidate in caplog.records:
        if candidate.msg.startswith('Final'):
            final_record = candidate
            break

    assert final_record is not None

    if statistics_log_format == 'table':
        assert final_record.msg.splitlines() == [
            'Final request statistics:',
            '┌───────────────────────────────┬────────────┐',
            '│ requests_finished             │ 4          │',
            '│ requests_failed               │ 33         │',
            '│ retry_histogram               │ [1, 4, 8]  │',
            '│ request_avg_failed_duration   │ 1min 39.0s │',
            '│ request_avg_finished_duration │ 483.0ms    │',
            '│ requests_finished_per_minute  │ 0.33       │',
            '│ requests_failed_per_minute    │ 0.1        │',
            '│ request_total_duration        │ 12min      │',
            '│ requests_total                │ 37         │',
            '│ crawler_runtime               │ 5min       │',
            '└───────────────────────────────┴────────────┘',
        ]
        return

    assert final_record.msg == 'Final request statistics:'

    # `extra` parameters are not declared on `LogRecord`, hence the cast to `Any` for attribute access.
    untyped_record = cast('Any', final_record)

    # In the inline format the durations are compared as plain numbers of seconds.
    expected_extras = {
        'requests_finished': 4,
        'requests_failed': 33,
        'retry_histogram': [1, 4, 8],
        'request_avg_failed_duration': 99.0,
        'request_avg_finished_duration': 0.483,
        'requests_finished_per_minute': 0.33,
        'requests_failed_per_minute': 0.1,
        'request_total_duration': 720.0,
        'requests_total': 37,
        'crawler_runtime': 300.0,
    }
    for attribute_name, expected_value in expected_extras.items():
        assert getattr(untyped_record, attribute_name) == expected_value


async def test_crawler_manual_stop() -> None:
    """Check that once `crawler.stop()` has been called, no further requests are handled."""
    start_urls = [
        'http://test.io/1',
        'http://test.io/2',
        'http://test.io/3',
    ]
    stop_after_url = start_urls[1]
    handled_urls: list[str] = []

    # With a single concurrent task the URLs are visited one by one, in order.
    sequential = ConcurrencySettings(desired_concurrency=1, max_concurrency=1)
    crawler = BasicCrawler(concurrency_settings=sequential)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        current_url = context.request.url
        handled_urls.append(current_url)
        if current_url == stop_after_url:
            crawler.stop()

    run_stats = await crawler.run(start_urls)

    # Only the first 2 of the 3 URLs are expected to be processed.
    assert len(handled_urls) == 2
    assert run_stats.requests_total == 2
    assert run_stats.requests_finished == 2


@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.')
async def test_crawler_multiple_stops_in_parallel() -> None:
    """Check that `crawler.stop()` called from two parallel handlers lets both finish but starts no new request."""
    parallel_handlers = 2
    start_urls = [
        'http://test.io/1',
        'http://test.io/2',
        'http://test.io/3',
    ]
    handled_urls: list[str] = []

    # Two concurrent tasks, so that two URLs are being handled at the same time.
    two_at_once = ConcurrencySettings(desired_concurrency=parallel_handlers, max_concurrency=parallel_handlers)
    crawler = BasicCrawler(concurrency_settings=two_at_once)

    # Test is skipped in older Python versions, where `asyncio.Barrier` does not exist.
    all_handlers_started = asyncio.Barrier(parallel_handlers)  # type:ignore[attr-defined]
    one_at_a_time = asyncio.Semaphore(1)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Wait here until both handlers are running.
        await all_handlers_started.wait()

        async with one_at_a_time:
            # Guarantees that one handler calls `crawler.stop()` while the other one is still in progress.
            crawler.stop(reason=f'Stop called on {context.request.url}')
            handled_urls.append(context.request.url)

    run_stats = await crawler.run(start_urls)

    # Only 2 of the 3 URLs are expected to be processed.
    assert len(handled_urls) == 2
    assert run_stats.requests_total == 2
    assert run_stats.requests_finished == 2


async def test_services_no_side_effect_on_crawler_init() -> None:
    """Check that services passed to the `BasicCrawler` constructor are not installed in the global service locator."""
    crawler_configuration = Configuration()
    crawler_event_manager = LocalEventManager.from_config(crawler_configuration)
    crawler_storage_client = MemoryStorageClient()

    _ = BasicCrawler(
        configuration=crawler_configuration,
        event_manager=crawler_event_manager,
        storage_client=crawler_storage_client,
    )

    # The global locator must still hand out objects other than the crawler-specific ones.
    global_configuration = service_locator.get_configuration()
    assert global_configuration is not crawler_configuration

    global_event_manager = service_locator.get_event_manager()
    assert global_event_manager is not crawler_event_manager

    global_storage_client = service_locator.get_storage_client()
    assert global_storage_client is not crawler_storage_client


async def test_crawler_uses_default_services() -> None:
    """Check that a crawler created without explicit services uses the ones set in the global service locator."""
    global_configuration = Configuration()
    service_locator.set_configuration(global_configuration)

    global_event_manager = LocalEventManager.from_config(global_configuration)
    service_locator.set_event_manager(global_event_manager)

    global_storage_client = MemoryStorageClient()
    service_locator.set_storage_client(global_storage_client)

    crawler_locator = BasicCrawler()._service_locator

    # The crawler's own locator must resolve to exactly the globally registered objects.
    assert crawler_locator.get_configuration() is global_configuration
    assert crawler_locator.get_event_manager() is global_event_manager
    assert crawler_locator.get_storage_client() is global_storage_client


async def test_services_crawlers_can_use_different_services() -> None:
    """Check that two crawlers can be constructed, each with its own configuration, event manager and storage client.

    The test has no assertions; it passes as long as neither constructor raises.
    """

    def build_services() -> tuple[Configuration, LocalEventManager, MemoryStorageClient]:
        # Create one independent set of services; the event manager is derived from the configuration.
        configuration = Configuration()
        return configuration, LocalEventManager.from_config(configuration), MemoryStorageClient()

    # Both service sets are created up front, before any crawler is constructed.
    service_sets = [build_services(), build_services()]

    # Keep references to both crawlers so that they exist side by side.
    _ = [
        BasicCrawler(configuration=configuration, event_manager=event_manager, storage_client=storage_client)
        for configuration, event_manager, storage_client in service_sets
    ]


async def test_crawler_uses_default_storages(tmp_path: Path) -> None:
    """Check that a crawler without custom services returns the very same storage objects as the global `open()` calls."""
    global_configuration = Configuration(storage_dir=str(tmp_path), purge_on_start=True)
    service_locator.set_configuration(global_configuration)

    # Storages opened through the global services.
    global_dataset = await Dataset.open()
    global_kvs = await KeyValueStore.open()
    global_queue = await RequestQueue.open()

    crawler = BasicCrawler()

    assert global_dataset is await crawler.get_dataset()
    assert global_kvs is await crawler.get_key_value_store()
    assert global_queue is await crawler.get_request_manager()


async def test_crawler_can_use_other_storages(tmp_path: Path) -> None:
    """Check that a crawler with its own storage client returns storage objects other than the global ones."""
    global_configuration = Configuration(storage_dir=str(tmp_path), purge_on_start=True)
    service_locator.set_configuration(global_configuration)

    # Storages opened through the global services.
    global_dataset = await Dataset.open()
    global_kvs = await KeyValueStore.open()
    global_queue = await RequestQueue.open()

    crawler = BasicCrawler(storage_client=MemoryStorageClient())

    assert global_dataset is not await crawler.get_dataset()
    assert global_kvs is not await crawler.get_key_value_store()
    assert global_queue is not await crawler.get_request_manager()


async def test_crawler_can_use_other_storages_of_same_type(tmp_path: Path) -> None:
    """Check that a crawler can use its own file-system storage alongside a global storage of the same type."""
    global_root = tmp_path / 'a'
    crawler_root = tmp_path / 'b'
    for storage_root in (global_root, crawler_root):
        storage_root.mkdir()

    # Every storage kind is expected to get a directory under both roots.
    storage_kinds = ('datasets', 'key_value_stores', 'request_queues')
    expected_directories = {
        storage_root / storage_kind for storage_root in (global_root, crawler_root) for storage_kind in storage_kinds
    }

    global_configuration = Configuration(storage_dir=str(global_root), purge_on_start=True)
    crawler_configuration = Configuration(storage_dir=str(crawler_root), purge_on_start=True)

    # Install the global services...
    service_locator.set_configuration(global_configuration)
    service_locator.set_storage_client(FileSystemStorageClient())
    # ...and open the storages that are based on them.
    global_dataset = await Dataset.open()
    global_kvs = await KeyValueStore.open()
    global_queue = await RequestQueue.open()

    # The crawler gets a separate storage client instance and a separate configuration.
    crawler = BasicCrawler(storage_client=FileSystemStorageClient(), configuration=crawler_configuration)

    # The crawler's storages must be different objects than the global ones.
    assert global_dataset is not await crawler.get_dataset()
    assert global_kvs is not await crawler.get_key_value_store()
    assert global_queue is not await crawler.get_request_manager()

    # All storages must be present on the file system.
    for directory in expected_directories:
        assert directory.is_dir()


async def test_allows_storage_client_overwrite_before_run(monkeypatch: pytest.MonkeyPatch) -> None:
    """Check that a crawler with its own storage client still works after the global storage client is replaced."""
    crawler = BasicCrawler(storage_client=MemoryStorageClient())

    @crawler.router.default_handler
    async def push_item(context: BasicCrawlingContext) -> None:
        await context.push_data({'foo': 'bar'})

    # Replace the global storage client after the crawler was created but before it runs.
    service_locator.set_storage_client(MemoryStorageClient())

    with monkeypatch.context() as patcher:
        # Wrap the original getter so that its calls are counted while it keeps working.
        get_storage_client_spy = Mock(wraps=service_locator.get_storage_client)
        patcher.setattr(service_locator, 'get_storage_client', get_storage_client_spy)
        await crawler.run(['https://does-not-matter.com'])
        assert get_storage_client_spy.call_count >= 1

    crawler_dataset = await crawler.get_dataset()
    stored_data = await crawler_dataset.get_data()
    assert stored_data.items == [{'foo': 'bar'}]


@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.')
async def test_context_use_state_race_condition_in_handlers(key_value_store: KeyValueStore) -> None:
    """Check that increments done by two parallel handlers on the `use_state` value are both kept.

    Each handler increments the shared counter once, so the persisted counter must end up at 2.
    `use_state` has to be implemented so that such concurrent use does not lose an update."""
    # Test is skipped in older Python versions.
    from asyncio import Barrier  # type:ignore[attr-defined] # noqa: PLC0415

    state_key = f'{BasicCrawler._CRAWLEE_STATE_KEY}_0'

    crawler = BasicCrawler()
    crawler_kvs = await crawler.get_key_value_store()
    await crawler_kvs.set_value(state_key, {'counter': 0})
    sync_point = Barrier(2)

    @crawler.router.default_handler
    async def increment_counter(context: BasicCrawlingContext) -> None:
        state = cast('dict[str, int]', await context.use_state())
        # Wait until both handlers hold the state.
        await sync_point.wait()
        state['counter'] += 1
        # Wait until both handlers have incremented the counter.
        await sync_point.wait()

    await crawler.run(['https://crawlee.dev/', 'https://crawlee.dev/docs/quick-start'])

    crawler_kvs = await crawler.get_key_value_store()
    # Make sure the locally held state is written back to the key-value store.
    await crawler_kvs.persist_autosaved_values()
    persisted_state = await crawler_kvs.get_value(state_key)
    assert persisted_state['counter'] == 2


@pytest.mark.run_alone
@pytest.mark.flaky(
    reruns=3, reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1652.'
)
@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.timeout was introduced in Python 3.11.')
@pytest.mark.parametrize(
    'sleep_type',
    [
        pytest.param('async_sleep'),
        pytest.param('sync_sleep', marks=pytest.mark.skip(reason='https://github.com/apify/crawlee-python/issues/908')),
    ],
)
async def test_timeout_in_handler(sleep_type: str) -> None:
    """Check that a request handler timeout is handled like an exception raised by the handler.

    The handler is expected to time out even when the delay comes from blocking synchronous code,
    and the crawler is expected to retry the request.
    The handler here sleeps past the timeout on its first two invocations and not at all on the third."""
    # Test is skipped in older Python versions.
    from asyncio import timeout  # type:ignore[attr-defined] # noqa: PLC0415

    handler_timeout = timedelta(seconds=1)
    max_request_retries = 3

    # Sleeping for twice the handler timeout reliably exceeds it.
    too_long_sleep_s = handler_timeout.total_seconds() * 2
    sleep_plan = iter([too_long_sleep_s, too_long_sleep_s, 0])

    # Safety factor for the overall time limit, since the test machine is not a real-time system.
    slow_machine_factor = 10
    overall_limit_s = max_request_retries * too_long_sleep_s * slow_machine_factor

    crawler = BasicCrawler(
        request_handler_timeout=handler_timeout,
        max_request_retries=max_request_retries,
        storage_client=MemoryStorageClient(),
    )

    before_sleep_probe = Mock()
    after_sleep_probe = Mock()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        before_sleep_probe()

        sleep_for = next(sleep_plan)
        if sleep_type == 'async_sleep':
            await asyncio.sleep(sleep_for)
        else:
            time.sleep(sleep_for)  # noqa:ASYNC251  # Using blocking sleep in async function is the test.

        # Not reached when the handler timed out.
        after_sleep_probe()

    # Overall time limit for the run, because a previous implementation kept the crawler running until:
    # "The request queue seems to be stuck for 300.0s, resetting internal state."
    async with timeout(overall_limit_s):
        await crawler.run(['https://a.placeholder.com'])

    assert crawler.statistics.state.requests_finished == 1
    assert before_sleep_probe.call_count == max_request_retries
    assert after_sleep_probe.call_count == 1


@pytest.mark.flaky(
    reruns=3,
    reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1649.',
)
@pytest.mark.parametrize(
    ('keep_alive', 'max_requests_per_crawl', 'expected_handled_requests_count'),
    [
        pytest.param(True, 2, 2, id='keep_alive, 2 requests'),
        pytest.param(True, 1, 1, id='keep_alive, but max_requests_per_crawl achieved after 1 request'),
        pytest.param(False, 2, 0, id='Crawler without keep_alive (default), crawler finished before adding requests'),
    ],
)
async def test_keep_alive(
    *, keep_alive: bool, max_requests_per_crawl: int, expected_handled_requests_count: int
) -> None:
    """Test that crawler can be kept alive without any requests and stopped with `crawler.stop()`.

    Crawler should stop if `max_requests_per_crawl` is reached regardless of the `keep_alive` flag.

    Args:
        keep_alive: Value passed to the crawler's `keep_alive` option.
        max_requests_per_crawl: Value passed to the crawler's `max_requests_per_crawl` option.
        expected_handled_requests_count: How many of the additionally added URLs the handler is expected to see.
    """
    additional_urls = ['https://a.placeholder.com', 'https://b.placeholder.com']
    expected_handler_calls = [call(url) for url in additional_urls[:expected_handled_requests_count]]

    crawler = BasicCrawler(
        keep_alive=keep_alive,
        max_requests_per_crawl=max_requests_per_crawl,
        # If more request can run in parallel, then max_requests_per_crawl is not deterministic.
        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
        storage_client=MemoryStorageClient(),
    )
    mocked_handler = Mock()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        mocked_handler(context.request.url)
        # Compare the request URL with the last added URL. Comparing the `Request` object itself
        # against a `str` would not match, which would leave `crawler.stop()` unreachable.
        if context.request.url == additional_urls[-1]:
            crawler.stop()

    crawler_run_task = asyncio.create_task(crawler.run())

    # Give some time to crawler to finish(or be in keep_alive state) and add new request.
    # TODO: Replace sleep time by waiting for specific crawler state.
    # https://github.com/apify/crawlee-python/issues/925
    await asyncio.sleep(1)
    # Without `keep_alive` the run must already be over; with it the run must still be in progress.
    assert crawler_run_task.done() != keep_alive
    add_request_task = asyncio.create_task(crawler.add_requests(additional_urls))

    await asyncio.gather(crawler_run_task, add_request_task)

    mocked_handler.assert_has_calls(expected_handler_calls)


@pytest.mark.parametrize(
    ('retire'),
    [
        pytest.param(False, id='without retire'),
        pytest.param(True, id='with retire'),
    ],
)
async def test_session_retire_in_user_handler(*, retire: bool) -> None:
    """Test that retiring a session inside the user handler makes the next request use a different session.

    The pool is limited to a single session, so without retiring the same session id is seen twice.
    """
    seen_session_ids: list[str] = []
    crawler = BasicCrawler(session_pool=SessionPool(max_pool_size=1))

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        if context.session:
            seen_session_ids.append(context.session.id)

            if retire:
                context.session.retire()

        # Enqueue a second request so that two session ids get recorded.
        await context.add_requests(['https://b.placeholder.com'])

    await crawler.run(['https://a.placeholder.com'])

    # With `retire` the second session must be a new one, otherwise the only pooled session is reused.
    if not retire:
        assert seen_session_ids[1] == seen_session_ids[0]
    else:
        assert seen_session_ids[1] != seen_session_ids[0]


async def test_bound_session_to_request() -> None:
    """Test that requests created with a `session_id` are all handled with that exact session."""
    request_count = 10

    async with SessionPool() as session_pool:
        check_session: Session = await session_pool.get_session()
        crawler = BasicCrawler(session_pool=session_pool)
        used_sessions: list[str] = []

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            if context.session:
                used_sessions.append(context.session.id)

        # The same URL is enqueued repeatedly, each time bound to the one pre-fetched session.
        requests = []
        for _ in range(request_count):
            bound_request = Request.from_url(
                'https://a.placeholder.com', session_id=check_session.id, always_enqueue=True
            )
            requests.append(bound_request)

        await crawler.run(requests)

        assert len(used_sessions) == request_count
        assert set(used_sessions) == {check_session.id}


async def test_bound_sessions_to_same_request() -> None:
    """Test that the same URL bound to ten different session ids is handled once per session."""
    session_count = 10
    next_session_number = 0

    # Use a deterministic factory to avoid errors due to random Session retrieval:
    # sessions get ids '0', '1', '2', ... in creation order.
    def create_session() -> Session:
        nonlocal next_session_number
        session = Session(id=str(next_session_number))
        next_session_number += 1
        return session

    crawler = BasicCrawler(session_pool=SessionPool(create_session_function=create_session))
    used_sessions: list[str] = []

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        if context.session:
            used_sessions.append(context.session.id)

    requests = []
    for session_number in range(session_count):
        requests.append(
            Request.from_url(
                'https://a.placeholder.com', session_id=str(session_number), use_extended_unique_key=True
            )
        )

    await crawler.run(requests)

    expected_session_ids = {str(session_number) for session_number in range(session_count)}
    assert len(used_sessions) == session_count
    assert set(used_sessions) == expected_session_ids


async def test_error_bound_session_to_request() -> None:
    """Test that requests bound to session id '1' all fail on the first attempt.

    NOTE(review): presumably they fail because the crawler's default session pool has no session with that id.
    """
    request_count = 10
    requests = []
    for _ in range(request_count):
        requests.append(Request.from_url('https://a.placeholder.com', session_id='1', always_enqueue=True))

    crawler = BasicCrawler(request_handler=AsyncMock())
    stats = await crawler.run(requests)

    assert stats.requests_total == request_count
    assert stats.requests_failed == request_count
    # A single histogram bucket means every request ended without being retried.
    assert stats.retry_histogram == [request_count]


async def test_handle_error_bound_session_to_request() -> None:
    """Test that the failed request handler receives a `RequestCollisionError` for a request bound to session '1'."""
    crawler = BasicCrawler(request_handler=AsyncMock())
    collision_recorder = AsyncMock()

    @crawler.failed_request_handler
    async def error_req_hook(context: BasicCrawlingContext, error: Exception) -> None:
        # Only collisions are recorded; any other failure type is ignored by this hook.
        if not isinstance(error, RequestCollisionError):
            return
        await collision_recorder(context, error)

    await crawler.run([Request.from_url('https://a.placeholder.com', session_id='1')])

    assert collision_recorder.call_count == 1


async def test_handles_session_error_in_failed_request_handler() -> None:
    """Test that every request whose handler raises `SessionError` reaches the failed request handler."""
    start_urls = ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']
    failed_urls = set()
    crawler = BasicCrawler(max_session_rotations=1)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        raise SessionError('blocked')

    @crawler.failed_request_handler
    async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None:
        failed_urls.add(context.request.url)

    await crawler.run(start_urls)

    assert set(start_urls) == failed_urls


async def test_lock_with_get_robots_txt_file_for_url(server_url: URL) -> None:
    """Test that concurrent robots.txt lookups for one URL call `RobotsTxtFile.find` only once."""
    concurrent_lookups = 10
    target_url = str(server_url)
    crawler = BasicCrawler(respect_robots_txt_file=True)

    with patch('crawlee.crawlers._basic._basic_crawler.RobotsTxtFile.find', wraps=RobotsTxtFile.find) as spy:
        lookup_tasks = []
        for _ in range(concurrent_lookups):
            lookup_tasks.append(asyncio.create_task(crawler._get_robots_txt_file_for_url(target_url)))

        await asyncio.gather(*lookup_tasks)

        # Only one of the concurrent lookups is allowed to actually fetch the file.
        assert spy.call_count == 1


async def test_reduced_logs_from_timed_out_request_handler(caplog: pytest.LogCaptureFixture) -> None:
    """Test that a request handler timeout is logged as a single line that still shows the offending source line.

    The handler sleeps longer than the configured `request_handler_timeout`. The log record mentioning the timeout
    must contain no newline and must include the text of the handler line that was running.
    """
    caplog.set_level(logging.INFO)
    crawler = BasicCrawler(
        configure_logging=False,
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=1),
    )

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Intentionally add a delay longer than the timeout to trigger the timeout mechanism
        # NOTE: the trailing marker comment on the next line is asserted on below, do not change or remove it.
        await asyncio.sleep(10)  # INJECTED DELAY

    # Capture all logs from the 'crawlee' logger at INFO level or higher
    with caplog.at_level(logging.INFO, logger='crawlee'):
        await crawler.run([Request.from_url('https://a.placeholder.com')])

    # Check for the timeout message in any of the logs
    found_timeout_message = False
    for record in caplog.records:
        if record.message and 'timed out after 1.0 seconds' in record.message:
            # The message and the formatted exception text together must stay on one line.
            full_message = (record.message or '') + (record.exc_text or '')
            assert '\n' not in full_message
            assert '# INJECTED DELAY' in full_message
            found_timeout_message = True
            break

    assert found_timeout_message, 'Expected log message about request handler error was not found.'


async def test_reduced_logs_from_time_out_in_request_handler(caplog: pytest.LogCaptureFixture) -> None:
    """Test that a timeout raised inside the user handler is logged as a one-line retry summary.

    The handler awaits a `Future` that never completes with a 1 second `asyncio.wait_for` timeout. The expected
    log message names the handler function and the source line that raised the timeout.
    """
    crawler = BasicCrawler(configure_logging=False, max_request_retries=1)

    # NOTE: the handler name and the exact text of its body are matched by the regular expression below.
    @crawler.router.default_handler
    async def default_handler(_: BasicCrawlingContext) -> None:
        await asyncio.wait_for(Future(), timeout=1)

    # Capture all logs from the 'crawlee' logger at INFO level or higher
    with caplog.at_level(logging.INFO, logger='crawlee'):
        await crawler.run([Request.from_url('https://a.placeholder.com')])

    # Check for 1 line summary message
    found_timeout_message = False
    for record in caplog.records:
        if re.match(
            r'Retrying request to .* due to: Timeout raised by user defined handler\. File .*, line .*,'
            r' in default_handler,     await asyncio.wait_for\(Future\(\), timeout=1\)',
            record.message,
        ):
            found_timeout_message = True
            break

    assert found_timeout_message, 'Expected log message about request handler error was not found.'


async def test_status_message_callback() -> None:
    """Test that the status message callback is invoked repeatedly and receives the previous state each time."""
    recorded_states: list[dict[str, StatisticsState | None]] = []
    status_message_callback = AsyncMock()

    async def status_callback(
        state: StatisticsState, previous_state: StatisticsState | None, message: str
    ) -> str | None:
        await status_message_callback(message)
        recorded_states.append({'state': state, 'previous_state': previous_state})
        return message

    # A very short logging interval so that several status updates happen during one handler call.
    crawler = BasicCrawler(
        status_message_logging_interval=timedelta(seconds=0.01),
        status_message_callback=status_callback,
    )

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        await asyncio.sleep(0.1)  # Simulate some processing time

    await crawler.run(['https://a.placeholder.com'])

    assert status_message_callback.called
    assert len(recorded_states) > 1

    first_call, second_call = recorded_states[0], recorded_states[1]

    # The very first invocation has no predecessor.
    assert first_call['state'] is not None
    assert first_call['previous_state'] is None

    # The second invocation must be handed the state from the first one.
    assert second_call['state'] is not None
    assert second_call['previous_state'] is not None
    assert second_call['previous_state'] == first_call['state']


async def test_status_message_emit() -> None:
    """Test that running a crawler emits at least one `Event.CRAWLER_STATUS` event.

    The listener is registered on the event manager obtained from the service locator and is always removed again,
    even when the crawler run raises, so that it cannot stay registered after this test.
    """
    event_manager = service_locator.get_event_manager()

    status_message_listener = Mock()

    def listener(event_data: EventCrawlerStatusData) -> None:
        status_message_listener(event_data)

    event_manager.on(event=Event.CRAWLER_STATUS, listener=listener)
    try:
        crawler = BasicCrawler(request_handler=AsyncMock())

        await crawler.run(['https://a.placeholder.com'])
    finally:
        # Unregister unconditionally; a failing run must not leave the listener attached.
        event_manager.off(event=Event.CRAWLER_STATUS, listener=listener)

    assert status_message_listener.called


@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'by_id'),
    [
        pytest.param('named-queue', None, False, id='with rq_name'),
        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
        pytest.param('id-queue', None, True, id='with rq_id'),
    ],
)
async def test_add_requests_with_rq_param(queue_name: str | None, queue_alias: str | None, *, by_id: bool) -> None:
    """Test that `context.add_requests` can target another request queue by id, name or alias.

    Requests added to the other queue must end up there and must not be crawled by the running crawler.
    """
    crawler = BasicCrawler()
    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)

    # When addressing the queue by id, the name used to open it must no longer be passed along.
    queue_id = rq.id if by_id else None
    if by_id:
        queue_name = None

    check_requests = [
        Request.from_url(url)
        for url in ('https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com')
    ]
    visit_urls = set()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        visit_urls.add(context.request.url)
        await context.add_requests(check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)

    await crawler.run(['https://start.placeholder.com'])

    # Drain the target queue to see what was stored in it.
    requests_from_queue = []
    while True:
        fetched_request = await rq.fetch_next_request()
        if not fetched_request:
            break
        requests_from_queue.append(fetched_request)

    assert requests_from_queue == check_requests
    assert visit_urls == {'https://start.placeholder.com'}

    await rq.drop()


@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'queue_id'),
    [
        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),
        pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'),
        pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),
        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'),
    ],
)
async def test_add_requests_error_with_multi_params(
    queue_id: str | None, queue_name: str | None, queue_alias: str | None
) -> None:
    """Test that `context.add_requests` raises `ValueError` when more than one queue selector is given."""
    expected_error = 'Only one of `rq_id`, `rq_name` or `rq_alias` can be set'
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        with pytest.raises(ValueError, match=expected_error):
            await context.add_requests(
                [Request.from_url('https://a.placeholder.com')],
                rq_alias=queue_alias,
                rq_name=queue_name,
                rq_id=queue_id,
            )

    await crawler.run(['https://start.placeholder.com'])


async def test_crawler_purge_request_queue_uses_same_storage_client() -> None:
    """Make sure that purge on start does not replace the storage client in the underlying storage manager."""

    # The global storage client differs from the one handed to the crawler.
    service_locator.set_storage_client(FileSystemStorageClient())

    # A queue opened through the global client holds one request the crawler must never touch.
    unrelated_request = Request.from_url('https://x.placeholder.com')
    unrelated_rq = await RequestQueue.open()
    await unrelated_rq.add_request(unrelated_request)

    crawler = BasicCrawler(storage_client=MemoryStorageClient())

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        context.log.info(context.request.url)

    # Run twice with purging enabled.
    for _ in range(2):
        await crawler.run(requests=[Request.from_url('https://a.placeholder.com')], purge_request_queue=True)
        assert crawler.statistics.state.requests_finished == 1

    # Crawler should not fall back to the default storage after the purge
    next_unrelated = await unrelated_rq.fetch_next_request()
    assert next_unrelated == unrelated_request


async def _run_crawler(crawler_id: int | None, requests: list[str], storage_dir: str) -> StatisticsState:
    """Run a crawler over `requests` and return its final statistics state.

    Kept as a module-level function so that it is picklable for `ProcessPoolExecutor`.

    Args:
        crawler_id: Value passed to the crawler as its `id`.
        requests: URLs to crawl.
        storage_dir: Storage directory passed to the crawler configuration; purge on start is disabled.
    """

    async def request_handler(context: BasicCrawlingContext) -> None:
        visited_url = context.request.url
        context.log.info(f'Processing {context.request.url} ...')
        # Record the visited URL in the crawler state; tests use it to verify state persistence.
        state = await context.use_state({'urls': []})
        state['urls'] = state.get('urls')
        assert isinstance(state['urls'], list)
        state['urls'].append(visited_url)
        context.log.info(f'State {state}')

    configuration = Configuration(
        storage_dir=storage_dir,
        purge_on_start=False,
    )
    # One request at a time.
    concurrency_settings = ConcurrencySettings(max_concurrency=1, desired_concurrency=1)

    crawler = BasicCrawler(
        id=crawler_id,
        request_handler=request_handler,
        concurrency_settings=concurrency_settings,
        configuration=configuration,
    )

    await crawler.run(requests)
    return crawler.statistics.state


@dataclass
class _CrawlerInput:
    requests: list[str]
    id: None | int = None


def _process_run_crawlers(crawler_inputs: list[_CrawlerInput], storage_dir: str) -> list[StatisticsState]:
    return [
        asyncio.run(_run_crawler(crawler_id=crawler_input.id, requests=crawler_input.requests, storage_dir=storage_dir))
        for crawler_input in crawler_inputs
    ]


async def test_crawler_state_persistence(tmp_path: Path) -> None:
    """Test that crawler statistics and state persist and are loaded correctly.

    The crawler is run in two separate worker processes that share one storage directory. The second run must
    continue from the statistics and state persisted by the first run.
    """
    storage_dir = str(tmp_path)
    state_key = f'{BasicCrawler._CRAWLEE_STATE_KEY}_0'
    first_run_urls = ['https://a.placeholder.com', 'https://b.placeholder.com']
    second_run_urls = ['https://c.placeholder.com']

    state_kvs = await KeyValueStore.open(
        storage_client=FileSystemStorageClient(), configuration=Configuration(storage_dir=storage_dir)
    )

    with ProcessPoolExecutor() as executor:
        # First run: crawl 2 requests, the state is persisted automatically.
        first_run_future = executor.submit(
            _process_run_crawlers,
            crawler_inputs=[_CrawlerInput(requests=first_run_urls)],
            storage_dir=storage_dir,
        )
        first_run_state = first_run_future.result()[0]

        assert first_run_state.requests_finished == 2
        state = await state_kvs.get_value(state_key)
        assert state.get('urls') == first_run_urls

    # Do not reuse the executor to simulate a fresh process to avoid modified class attributes.
    with ProcessPoolExecutor() as executor:
        # Second run: crawl 1 additional request on top of the previously persisted state.
        second_run_future = executor.submit(
            _process_run_crawlers,
            crawler_inputs=[_CrawlerInput(requests=second_run_urls)],
            storage_dir=storage_dir,
        )
        second_run_state = second_run_future.result()[0]

        # 2 requests from the first run plus 1 request from the second run.
        assert second_run_state.requests_finished == 3

        state = await state_kvs.get_value(state_key)
        assert state.get('urls') == [*first_run_urls, *second_run_urls]

    # The start time carries over from the first run, the finish time and runtime move forward.
    assert first_run_state.crawler_started_at == second_run_state.crawler_started_at
    assert first_run_state.crawler_finished_at
    assert second_run_state.crawler_finished_at

    assert first_run_state.crawler_finished_at < second_run_state.crawler_finished_at
    assert first_run_state.crawler_runtime < second_run_state.crawler_runtime


async def test_crawler_state_persistence_2_crawlers_with_migration(tmp_path: Path) -> None:
    """Test that two crawlers in one process each persist and reload their own statistics and state.

    Two worker processes are started one after another, each running two distinct crawlers. In the second process
    every crawler must continue from the statistics and state of its counterpart from the first process.
    """
    storage_dir = str(tmp_path)
    state_key_0 = f'{BasicCrawler._CRAWLEE_STATE_KEY}_0'
    state_key_1 = f'{BasicCrawler._CRAWLEE_STATE_KEY}_1'

    state_kvs = await KeyValueStore.open(
        storage_client=FileSystemStorageClient(), configuration=Configuration(storage_dir=storage_dir)
    )

    with ProcessPoolExecutor() as executor:
        # First process: 2 crawlers, each crawls 1 request and persists its state automatically.
        first_run_future = executor.submit(
            _process_run_crawlers,
            crawler_inputs=[
                _CrawlerInput(requests=['https://a.placeholder.com']),
                _CrawlerInput(requests=['https://c.placeholder.com']),
            ],
            storage_dir=storage_dir,
        )
        first_run_states = first_run_future.result()

        # Expected state after the first process.
        assert first_run_states[0].requests_finished == 1
        assert first_run_states[1].requests_finished == 1
        state_0 = await state_kvs.get_value(state_key_0)
        assert state_0.get('urls') == ['https://a.placeholder.com']
        state_1 = await state_kvs.get_value(state_key_1)
        assert state_1.get('urls') == ['https://c.placeholder.com']

    with ProcessPoolExecutor() as executor:
        # Second process: again 2 crawlers with 1 request each, building on the persisted state.
        second_run_future = executor.submit(
            _process_run_crawlers,
            crawler_inputs=[
                _CrawlerInput(requests=['https://b.placeholder.com']),
                _CrawlerInput(requests=['https://d.placeholder.com']),
            ],
            storage_dir=storage_dir,
        )
        second_run_states = second_run_future.result()

        # Expected state after the second process: each crawler has accumulated both of its requests.
        assert second_run_states[0].requests_finished == 2
        assert second_run_states[1].requests_finished == 2
        state_0 = await state_kvs.get_value(state_key_0)
        assert state_0.get('urls') == ['https://a.placeholder.com', 'https://b.placeholder.com']
        state_1 = await state_kvs.get_value(state_key_1)
        assert state_1.get('urls') == ['https://c.placeholder.com', 'https://d.placeholder.com']


async def test_crawler_intermediate_statistics() -> None:
    """Test that `statistics.calculate()` refreshes the total runtime while the crawler is still running."""
    check_time = timedelta(seconds=0.1)
    check_seconds = check_time.total_seconds()
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(_: BasicCrawlingContext) -> None:
        # Keep the crawler busy for longer than the measured interval.
        await asyncio.sleep(check_seconds * 5)

    async def wait_for_statistics_initialization() -> None:
        while not crawler.statistics.active:  # noqa: ASYNC110 # It is ok for tests.
            await asyncio.sleep(0.1)

    # Launch the crawler in the background and wait for its statistics to become active.
    crawler_task = asyncio.create_task(crawler.run(['https://a.placeholder.com']))
    await wait_for_statistics_initialization()

    # After waiting for the interval, a recalculation must report at least that much runtime.
    await asyncio.sleep(check_seconds)
    crawler.statistics.calculate()
    assert crawler.statistics.state.crawler_runtime >= check_time

    # Let the crawler finish before leaving the test.
    await crawler_task


async def test_protect_request_in_run_handlers() -> None:
    """Test that changes a failing handler makes to the request's user data are not stored in the queue."""
    request = Request.from_url('https://test.url/', user_data={'request_state': ['initial']})
    request_queue = await RequestQueue.open(name='state-test')
    crawler = BasicCrawler(request_manager=request_queue, max_request_retries=0)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        request_state = context.request.user_data['request_state']
        if isinstance(request_state, list):
            request_state.append('modified')
        raise ValueError('Simulated error after modifying request')

    await crawler.run([request])

    # The stored request must still carry the original, unmodified user data.
    check_request = await request_queue.get_request(request.unique_key)
    assert check_request is not None
    assert check_request.user_data['request_state'] == ['initial']

    await request_queue.drop()


async def test_new_request_error_handler() -> None:
    """Test that a new request returned from the error handler is processed in place of the failed one.

    The original request must end up handled in the `ERROR_HANDLER` state, while the replacement request with the
    `|test` unique key suffix must be processed successfully.
    """
    replacement_suffix = '|test'
    request = Request.from_url('https://a.placeholder.com')
    queue = await RequestQueue.open()
    crawler = BasicCrawler(request_manager=queue)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Only the replacement request succeeds; the original request always fails.
        if '|test' not in context.request.unique_key:
            raise ValueError('This error should not be handled by error handler')

    @crawler.error_handler
    async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request | None:
        replacement_key = f'{context.request.unique_key}|test'
        return Request.from_url(context.request.url, unique_key=replacement_key)

    await crawler.run([request])

    original_request = await queue.get_request(request.unique_key)
    error_request = await queue.get_request(f'{request.unique_key}{replacement_suffix}')

    assert original_request is not None
    assert original_request.state == RequestState.ERROR_HANDLER
    assert original_request.was_already_handled

    assert error_request is not None
    assert error_request.state == RequestState.DONE
    assert error_request.was_already_handled


================================================
FILE: tests/unit/crawlers/_basic/test_context_pipeline.py
================================================
from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import TYPE_CHECKING
from unittest.mock import AsyncMock

import pytest

from crawlee import Request
from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import ContextPipeline
from crawlee.errors import ContextPipelineFinalizationError, ContextPipelineInitializationError, RequestHandlerError
from crawlee.sessions._session import Session

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator


@dataclass(frozen=True)
class EnhancedCrawlingContext(BasicCrawlingContext):
    """Test-only crawling context that adds a `foo` field on top of `BasicCrawlingContext`."""

    # Extra value supplied by the first middleware in the pipeline tests.
    foo: str


@dataclass(frozen=True)
class MoreEnhancedCrawlingContext(EnhancedCrawlingContext):
    """Test-only crawling context that adds a `bar` field on top of `EnhancedCrawlingContext`."""

    # Extra value supplied by the second middleware in the pipeline tests.
    bar: int


async def test_calls_consumer_without_middleware() -> None:
    """Test that a pipeline without middleware passes the initial context straight to the consumer."""
    initial_context = BasicCrawlingContext(
        request=Request.from_url(url='https://test.io/'),
        session=Session(),
        log=logging.getLogger(),
        send_request=AsyncMock(),
        add_requests=AsyncMock(),
        proxy_info=AsyncMock(),
        push_data=AsyncMock(),
        use_state=AsyncMock(),
        get_key_value_store=AsyncMock(),
    )
    consumer = AsyncMock()
    pipeline = ContextPipeline()

    await pipeline(initial_context, consumer)

    consumer.assert_called_once_with(initial_context)


async def test_calls_consumers_and_middlewares() -> None:
    """Test that middlewares wrap the consumer in composition order and unwind in reverse order."""
    events: list[str] = []

    async def middleware_a(context: BasicCrawlingContext) -> AsyncGenerator[EnhancedCrawlingContext, None]:
        events.append('middleware_a_in')
        enhanced_context = EnhancedCrawlingContext(
            request=context.request,
            session=context.session,
            foo='foo',
            log=logging.getLogger(),
            send_request=AsyncMock(),
            add_requests=AsyncMock(),
            proxy_info=AsyncMock(),
            push_data=AsyncMock(),
            use_state=AsyncMock(),
            get_key_value_store=AsyncMock(),
        )
        yield enhanced_context
        events.append('middleware_a_out')

    async def middleware_b(context: EnhancedCrawlingContext) -> AsyncGenerator[MoreEnhancedCrawlingContext, None]:
        events.append('middleware_b_in')
        more_enhanced_context = MoreEnhancedCrawlingContext(
            request=context.request,
            session=context.session,
            foo=context.foo,
            bar=4,
            log=logging.getLogger(),
            send_request=AsyncMock(),
            add_requests=AsyncMock(),
            proxy_info=AsyncMock(),
            push_data=AsyncMock(),
            use_state=AsyncMock(),
            get_key_value_store=AsyncMock(),
        )
        yield more_enhanced_context
        events.append('middleware_b_out')

    async def consumer(context: MoreEnhancedCrawlingContext) -> None:
        events.append('consumer_called')
        # The consumer must see the value added by the innermost middleware.
        assert context.bar == 4

    pipeline = ContextPipeline[BasicCrawlingContext]().compose(middleware_a).compose(middleware_b)

    initial_context = BasicCrawlingContext(
        request=Request.from_url(url='https://test.io/'),
        session=Session(),
        log=logging.getLogger(),
        send_request=AsyncMock(),
        add_requests=AsyncMock(),
        proxy_info=AsyncMock(),
        push_data=AsyncMock(),
        use_state=AsyncMock(),
        get_key_value_store=AsyncMock(),
    )
    await pipeline(initial_context, consumer)

    expected_events = [
        'middleware_a_in',
        'middleware_b_in',
        'consumer_called',
        'middleware_b_out',
        'middleware_a_out',
    ]
    assert events == expected_events


async def test_wraps_consumer_errors() -> None:
    """Test that an exception raised by the consumer surfaces as `RequestHandlerError`."""
    crashing_consumer = AsyncMock(side_effect=RuntimeError('Arbitrary crash for testing purposes'))

    initial_context = BasicCrawlingContext(
        request=Request.from_url(url='https://test.io/'),
        session=Session(),
        log=logging.getLogger(),
        send_request=AsyncMock(),
        add_requests=AsyncMock(),
        proxy_info=AsyncMock(),
        push_data=AsyncMock(),
        use_state=AsyncMock(),
        get_key_value_store=AsyncMock(),
    )
    pipeline = ContextPipeline()

    with pytest.raises(RequestHandlerError):
        await pipeline(initial_context, crashing_consumer)


async def test_handles_exceptions_in_middleware_initialization() -> None:
    """Test that a crash while a middleware initializes skips the consumer but still cleans up earlier steps."""
    cleanup = AsyncMock()
    consumer = AsyncMock()

    async def step_1(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]:
        yield context
        await cleanup()

    async def step_2(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]:
        raise RuntimeError('Crash during middleware initialization')
        # Never reached; the `yield` only makes this function an async generator.
        yield context

    initial_context = BasicCrawlingContext(
        request=Request.from_url(url='https://test.io/'),
        session=Session(),
        log=logging.getLogger(),
        send_request=AsyncMock(),
        add_requests=AsyncMock(),
        proxy_info=AsyncMock(),
        push_data=AsyncMock(),
        use_state=AsyncMock(),
        get_key_value_store=AsyncMock(),
    )
    pipeline = ContextPipeline().compose(step_1).compose(step_2)

    with pytest.raises(ContextPipelineInitializationError):
        await pipeline(initial_context, consumer)

    # The consumer never ran, yet the already-initialized first step was finalized.
    assert not consumer.called
    assert cleanup.called


async def test_handles_exceptions_in_middleware_finalization() -> None:
    """Test that a crash while a middleware finalizes is reported and stops finalization of earlier steps."""
    cleanup = AsyncMock()
    consumer = AsyncMock()

    async def step_1(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]:
        yield context
        await cleanup()

    async def step_2(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]:
        yield context
        raise RuntimeError('Crash during middleware finalization')

    initial_context = BasicCrawlingContext(
        request=Request.from_url(url='https://test.io/'),
        session=Session(),
        log=logging.getLogger(),
        send_request=AsyncMock(),
        add_requests=AsyncMock(),
        proxy_info=AsyncMock(),
        push_data=AsyncMock(),
        use_state=AsyncMock(),
        get_key_value_store=AsyncMock(),
    )
    pipeline = ContextPipeline().compose(step_1).compose(step_2)

    with pytest.raises(ContextPipelineFinalizationError):
        await pipeline(initial_context, consumer)

    # The consumer ran, but the first step's cleanup was not reached after the second step crashed.
    assert consumer.called
    assert not cleanup.called


================================================
FILE: tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
================================================
from __future__ import annotations

import asyncio
from datetime import timedelta
from typing import TYPE_CHECKING
from unittest import mock

import pytest

from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction, SkippedReason
from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storages import RequestQueue

if TYPE_CHECKING:
    from yarl import URL

    from crawlee._request import RequestOptions
    from crawlee.http_clients._base import HttpClient


async def test_basic(server_url: URL, http_client: HttpClient) -> None:
    """Test that the crawler parses the start page and exposes its three anchor elements via `context.soup`."""
    start_url = str(server_url / 'start_enqueue')
    handler = mock.AsyncMock()
    crawler = BeautifulSoupCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        await handler(context.soup.find_all('a'))

    await crawler.run([start_url])

    assert handler.called

    # The handler should find three links
    found_links = handler.call_args.args[0]
    assert len(found_links) == 3


async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
    """Test that `enqueue_links` follows the links of a page that was reached through a redirect."""
    redirect_target = str(server_url / 'start_enqueue')
    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))

    visit = mock.Mock()
    crawler = BeautifulSoupCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links()

    await crawler.run([redirect_url])

    linked_paths = (
        'sub_index',
        'page_1',
        'page_2',
        'page_3',
        'page_4',
        'base_page',
        'base_subpath/page_5',
    )
    expected_visit_calls = [mock.call(redirect_url)]
    expected_visit_calls.extend(mock.call(str(server_url / path)) for path in linked_paths)

    # The redirecting start URL is visited first; the order of the remaining visits is not fixed.
    assert visit.mock_calls[0] == expected_visit_calls[0]
    visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
    """Verify that `enqueue_links` can follow URLs taken from a non-`href` attribute (`img[src]`)."""
    start_url = str(
        redirect_server_url.with_path('redirect').with_query(url=str(server_url / 'start_enqueue_non_href')),
    )

    visited = mock.Mock()
    crawler = BeautifulSoupCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def visit_and_enqueue_images(context: BeautifulSoupCrawlingContext) -> None:
        visited(context.request.url)
        await context.enqueue_links(selector='img', attribute='src')

    await crawler.run([start_url])

    image_paths = ('base_subpath/image_1', 'image_2')
    expected_calls = [mock.call(start_url)]
    expected_calls.extend(mock.call(str(server_url / path)) for path in image_paths)
    visited.assert_has_calls(expected_calls, any_order=True)


async def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None:
    """Verify that links matched by a custom CSS selector (`a.foo`) are enqueued and visited."""
    start_url = str(server_url / 'start_enqueue')
    visited = mock.Mock()
    crawler = BeautifulSoupCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def visit_and_enqueue_selected(context: BeautifulSoupCrawlingContext) -> None:
        visited(context.request.url)
        await context.enqueue_links(selector='a.foo')

    await crawler.run([start_url])

    visited.assert_has_calls(
        [
            mock.call(start_url),
            mock.call(str(server_url / 'sub_index')),
        ],
        any_order=True,
    )


async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None:
    """Verify that `max_requests_per_crawl` caps the number of handled requests even when more are enqueued."""
    handled_urls = []

    # Concurrency is pinned to 1 so that the `max_requests_per_crawl` limit can be checked exactly.
    single_task = ConcurrencySettings(desired_concurrency=1, max_concurrency=1)
    crawler = BeautifulSoupCrawler(
        http_client=http_client,
        max_requests_per_crawl=3,
        concurrency_settings=single_task,
    )

    @crawler.router.default_handler
    async def enqueue_then_record(context: BeautifulSoupCrawlingContext) -> None:
        await context.enqueue_links()
        handled_urls.append(context.request.url)

    stats = await crawler.run([str(server_url / 'start_enqueue')])

    # Only 3 requests may be processed, although more links were discovered.
    assert len(handled_urls) == 3
    assert stats.requests_total == 3
    assert stats.requests_finished == 3


async def test_enqueue_links_with_transform_request_function(server_url: URL, http_client: HttpClient) -> None:
    """Verify that `transform_request_function` can skip links and modify the requests that are enqueued."""
    seen_headers = []
    visited = mock.Mock()
    crawler = BeautifulSoupCrawler(http_client=http_client)

    def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
        # Any URL containing `page_3` is dropped; every other request gets a marker header.
        if 'page_3' not in request_options['url']:
            request_options['headers'] = HttpHeaders({'transform-header': 'my-header'})
            return request_options
        return 'skip'

    @crawler.router.default_handler
    async def visit_and_enqueue(context: BeautifulSoupCrawlingContext) -> None:
        visited(context.request.url)
        seen_headers.append(context.request.headers)

        await context.enqueue_links(transform_request_function=transform_request)

    await crawler.run([str(server_url / 'start_enqueue')])

    # `/page_3` is deliberately absent from the expected visits.
    visited_paths = (
        'start_enqueue',
        'sub_index',
        'page_1',
        'page_2',
        'base_page',
        'page_4',
        'base_subpath/page_5',
    )
    visited.assert_has_calls([mock.call(str(server_url / path)) for path in visited_paths], any_order=True)

    # Spot-check requests handled after the start request: they came from `enqueue_links`, so they must
    # carry the header added by the transform function.
    for position in (1, 2, 3):
        assert seen_headers[position]['transform-header'] == 'my-header'


async def test_handle_blocked_request(server_url: URL, http_client: HttpClient) -> None:
    """Verify that a request to the `incapsula` endpoint is reported as exactly one failed request."""
    blocked_url = str(server_url / 'incapsula')
    crawler = BeautifulSoupCrawler(http_client=http_client, max_session_rotations=1)

    result = await crawler.run([blocked_url])

    assert result.requests_failed == 1


def test_default_logger() -> None:
    """Verify that the crawler's default logger is named after the crawler class."""
    crawler = BeautifulSoupCrawler()
    assert crawler.log.name == 'BeautifulSoupCrawler'


async def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None:
    """Verify which pages are visited when the crawler is told to respect the robots.txt file."""
    visited = mock.Mock()
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True, http_client=http_client)

    @crawler.router.default_handler
    async def visit_and_enqueue(context: BeautifulSoupCrawlingContext) -> None:
        visited(context.request.url)
        await context.enqueue_links()

    await crawler.run([str(server_url / 'start_enqueue')])

    allowed_paths = ('start_enqueue', 'sub_index', 'base_page', 'base_subpath/page_5')
    visited.assert_has_calls([mock.call(str(server_url / path)) for path in allowed_paths], any_order=True)


async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
    """Check the crawler's behavior for links whose robots.txt may be problematic to retrieve."""
    visited = mock.Mock()
    failed = mock.Mock()
    crawler = BeautifulSoupCrawler(
        respect_robots_txt_file=True,
        max_request_retries=0,
        http_client=http_client,
    )

    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, _error: Exception) -> None:
        failed(context.request.url)

    @crawler.router.default_handler
    async def visit_and_enqueue(context: BeautifulSoupCrawlingContext) -> None:
        visited(context.request.url)
        await context.enqueue_links(strategy='all')

    start_url = str(server_url / 'problematic_links')
    await crawler.run([start_url])

    # The e-mail link must be skipped.
    # https://avatars.githubusercontent.com/apify does not serve a robots.txt, but is still valid for the crawler.
    visited.assert_has_calls(
        [
            mock.call(start_url),
            mock.call('https://avatars.githubusercontent.com/apify'),
        ],
        any_order=True,
    )

    # budplaceholder.com does not exist, so that request ends up in the failed-request handler.
    failed.assert_has_calls([mock.call('https://budplaceholder.com/')], any_order=True)


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
    """Verify that the `on_skipped_request` hook is called for links skipped while respecting robots.txt."""
    skipped = mock.Mock()
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True, http_client=http_client)

    @crawler.on_skipped_request
    async def record_skipped(url: str, _reason: SkippedReason) -> None:
        skipped(url)

    @crawler.router.default_handler
    async def enqueue_everything(context: BeautifulSoupCrawlingContext) -> None:
        await context.enqueue_links()

    await crawler.run([str(server_url / 'start_enqueue')])

    skipped_paths = ('page_1', 'page_2', 'page_3', 'page_4')
    skipped.assert_has_calls([mock.call(str(server_url / path)) for path in skipped_paths], any_order=True)


async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:
    """Verify that `extract_links` returns the page's links minus those matching an `exclude` glob."""
    collected_urls: list[str] = []
    crawler = BeautifulSoupCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def collect_links(context: BeautifulSoupCrawlingContext) -> None:
        excluded = [Glob(f'{server_url}sub_index')]
        for extracted in await context.extract_links(exclude=excluded):
            collected_urls.append(extracted.url)

    await crawler.run([str(server_url / 'start_enqueue')])

    # With `sub_index` excluded, only `page_1` is expected to remain.
    assert collected_urls == [str(server_url / 'page_1')]


async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None:
    """Verify that `extract_links` can read URLs from a custom attribute (`li[data-href]`)."""
    collected_urls: list[str] = []
    crawler = BeautifulSoupCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def collect_links(context: BeautifulSoupCrawlingContext) -> None:
        for extracted in await context.extract_links(selector='li', attribute='data-href'):
            collected_urls.append(extracted.url)

    await crawler.run([str(server_url / 'non_href_links')])

    assert collected_urls == [str(server_url / 'page_2')]


@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'by_id'),
    [
        pytest.param('named-queue', None, False, id='with rq_name'),
        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
        pytest.param('id-queue', None, True, id='with rq_id'),
    ],
)
async def test_enqueue_links_with_rq_param(
    server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool
) -> None:
    """Verify that `enqueue_links` can target another request queue selected by name, alias or ID.

    The links discovered on the start page must land in the separately opened queue, while the crawler
    itself only handles the start page.
    """
    crawler = BeautifulSoupCrawler(http_client=http_client)
    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)

    # Everything that uses the queue runs under try/finally so the queue is dropped even when the crawl
    # or one of the assertions fails; otherwise it would be left behind after a failing run.
    try:
        if by_id:
            # The queue was opened by name; address it only through its ID from here on.
            queue_name = None
            queue_id = rq.id
        else:
            queue_id = None
        visit_urls: set[str] = set()

        @crawler.router.default_handler
        async def handler(context: BeautifulSoupCrawlingContext) -> None:
            visit_urls.add(context.request.url)
            await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)

        await crawler.run([str(server_url / 'start_enqueue')])

        # Drain the target queue to see what was enqueued into it.
        requests_from_queue: list[str] = []
        while request := await rq.fetch_next_request():
            requests_from_queue.append(request.url)

        assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')}
        assert visit_urls == {str(server_url / 'start_enqueue')}
    finally:
        await rq.drop()


@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'by_id'),
    [
        pytest.param('named-queue', None, False, id='with rq_name'),
        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
        pytest.param('id-queue', None, True, id='with rq_id'),
    ],
)
async def test_enqueue_links_requests_with_rq_param(
    server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool
) -> None:
    """Verify that explicit `requests` passed to `enqueue_links` go to the queue chosen by name, alias or ID.

    The explicitly passed URLs must land in the separately opened queue, while the crawler itself only
    handles the start page.
    """
    crawler = BeautifulSoupCrawler(http_client=http_client)
    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)

    # Everything that uses the queue runs under try/finally so the queue is dropped even when the crawl
    # or one of the assertions fails; otherwise it would be left behind after a failing run.
    try:
        if by_id:
            # The queue was opened by name; address it only through its ID from here on.
            queue_name = None
            queue_id = rq.id
        else:
            queue_id = None
        visit_urls: set[str] = set()

        check_requests: list[str] = [
            'https://a.placeholder.com',
            'https://b.placeholder.com',
            'https://c.placeholder.com',
        ]

        @crawler.router.default_handler
        async def handler(context: BeautifulSoupCrawlingContext) -> None:
            visit_urls.add(context.request.url)
            await context.enqueue_links(
                requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id, strategy='all'
            )

        await crawler.run([str(server_url / 'start_enqueue')])

        # Drain the target queue to see what was enqueued into it.
        requests_from_queue: list[str] = []
        while request := await rq.fetch_next_request():
            requests_from_queue.append(request.url)

        assert set(requests_from_queue) == set(check_requests)
        assert visit_urls == {str(server_url / 'start_enqueue')}
    finally:
        await rq.drop()


# The argnames follow the order in which the values (and the ids) are written: name, alias, id.
# pytest injects arguments by name, so the test function's own parameter order is unaffected.
@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'queue_id'),
    [
        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),
        pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'),
        pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),
        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'),
    ],
)
async def test_enqueue_links_error_with_multi_params(
    server_url: URL, http_client: HttpClient, queue_id: str | None, queue_name: str | None, queue_alias: str | None
) -> None:
    """Verify that `enqueue_links` rejects more than one of `rq_id`, `rq_name` and `rq_alias` at once."""
    crawler = BeautifulSoupCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        # NOTE(review): this expectation lives inside the request handler and nothing is asserted after
        # `crawler.run`. If the crawler treats an exception raised here as an ordinary failed request
        # (TODO confirm), a missing error or a non-matching message would not fail this test. Consider
        # asserting on the run statistics once the expected message is confirmed.
        with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'):
            await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)

    await crawler.run([str(server_url / 'start_enqueue')])


async def test_navigation_timeout_on_slow_request(server_url: URL, http_client: HttpClient) -> None:
    """Test that an HTTP request slower than `navigation_timeout` fails with a `TimeoutError`."""
    on_request = mock.AsyncMock()
    on_failure = mock.AsyncMock()

    crawler = BeautifulSoupCrawler(
        navigation_timeout=timedelta(seconds=1),
        max_request_retries=0,
        http_client=http_client,
    )
    crawler.router.default_handler(on_request)
    crawler.failed_request_handler(on_failure)

    # The endpoint delays its response by 5 seconds, so the 1-second navigation timeout must kick in.
    slow_url = server_url.with_path('/slow').with_query(delay=5)
    await crawler.run([str(slow_url)])

    assert on_failure.call_count == 1

    # The failed-request handler receives the error as its second positional argument.
    error = on_failure.call_args[0][1]
    assert isinstance(error, asyncio.TimeoutError)


async def test_navigation_timeout_applies_to_hooks(server_url: URL) -> None:
    """Test that time spent in pre-navigation hooks counts toward `navigation_timeout`."""
    on_request = mock.AsyncMock()

    async def slow_hook(_context: BasicCrawlingContext) -> None:
        await asyncio.sleep(1)

    crawler = BeautifulSoupCrawler(
        max_request_retries=0,
        navigation_timeout=timedelta(seconds=1),
    )
    crawler.router.default_handler(on_request)
    crawler.pre_navigation_hook(slow_hook)

    # The hook alone sleeps for the whole 1-second navigation timeout, so the URL must not be handled.
    stats = await crawler.run([str(server_url)])

    assert stats.requests_failed == 1
    assert stats.requests_finished == 0
    assert on_request.call_count == 0


async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url: URL, http_client: HttpClient) -> None:
    """Test that navigation time is not charged against `request_handler_timeout`."""
    on_request = mock.AsyncMock()

    crawler = BeautifulSoupCrawler(
        request_handler_timeout=timedelta(seconds=0.5),
        max_request_retries=0,
        http_client=http_client,
    )
    crawler.router.default_handler(on_request)

    # The response is delayed by 1 second, longer than the 0.5-second handler timeout, yet the request
    # is still expected to succeed.
    slow_url = server_url.with_path('/slow').with_query(delay=1)
    stats = await crawler.run([str(slow_url)])

    assert stats.requests_failed == 0
    assert stats.requests_finished == 1
    assert on_request.call_count == 1


async def test_enqueue_strategy_after_redirect(server_url: URL, redirect_server_url: URL) -> None:
    """Verify that a request redirecting to the redirect server is not handled under `same-origin`."""
    visited = mock.AsyncMock()
    crawler = BeautifulSoupCrawler()

    # A URL on the main server that redirects to the separate redirect server.
    redirecting_url = str(server_url.with_path('redirect').with_query(url=str(redirect_server_url)))

    @crawler.router.default_handler
    async def visit_and_enqueue_redirect(context: BeautifulSoupCrawlingContext) -> None:
        await visited(context.request.url)

        await context.enqueue_links(requests=[Request.from_url(redirecting_url)], strategy='same-origin')

    await crawler.run([str(server_url)])

    # Only the start URL may reach the handler.
    assert visited.called
    assert visited.call_count == 1


async def test_enqueue_links_with_limit(server_url: URL, http_client: HttpClient) -> None:
    """Verify the visits made when `enqueue_links` is called with `limit=1` from the `sub_index` page."""
    start_url = str(server_url / 'sub_index')
    visited = mock.Mock()
    crawler = BeautifulSoupCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def visit_and_enqueue_one(context: BeautifulSoupCrawlingContext) -> None:
        visited(context.request.url)
        await context.enqueue_links(limit=1)

    await crawler.run([start_url])

    # Because of the limit, only a single link from `sub_index` should have been enqueued.
    visited.assert_has_calls(
        [
            mock.call(start_url),
            mock.call(str(server_url / 'page_3')),
        ],
        any_order=True,
    )


================================================
FILE: tests/unit/crawlers/_http/test_http_crawler.py
================================================
from __future__ import annotations

import json
from typing import TYPE_CHECKING
from unittest.mock import AsyncMock, Mock
from urllib.parse import parse_qs, urlencode

import pytest

from crawlee import ConcurrencySettings, Request, RequestState
from crawlee.crawlers import HttpCrawler
from crawlee.sessions import SessionPool
from crawlee.statistics import Statistics
from crawlee.storages import RequestQueue
from tests.unit.server_endpoints import HELLO_WORLD

if TYPE_CHECKING:
    from collections.abc import Awaitable, Callable

    from yarl import URL

    from crawlee._types import BasicCrawlingContext
    from crawlee.crawlers import HttpCrawlingContext
    from crawlee.http_clients._base import HttpClient

# Sample payload (e.g. data for a form submission) shared by the payload tests in this module, which send
# it URL-encoded or JSON-encoded. Every value is a plain string; note that `topping` is a JSON-looking
# string, not a list.
PAYLOAD = {
    'custname': 'John Doe',
    'custtel': '1234567890',
    'custemail': 'johndoe@example.com',
    'size': 'large',
    'topping': '["bacon", "cheese", "mushroom"]',
    'delivery': '13:00',
    'comments': 'Please ring the doorbell upon arrival.',
}


@pytest.fixture
async def mock_request_handler() -> Callable[[HttpCrawlingContext], Awaitable[None]] | AsyncMock:
    """Provide a fresh `AsyncMock` that stands in for a crawler request handler."""
    handler = AsyncMock()
    return handler


@pytest.fixture
async def crawler(
    http_client: HttpClient, mock_request_handler: Callable[[HttpCrawlingContext], Awaitable[None]]
) -> HttpCrawler:
    """Provide an `HttpCrawler` wired to the given HTTP client and the mocked request handler."""
    http_crawler = HttpCrawler(request_handler=mock_request_handler, http_client=http_client)
    return http_crawler


@pytest.fixture
async def crawler_without_retries(
    mock_request_handler: Callable[[HttpCrawlingContext], Awaitable[None]],
) -> HttpCrawler:
    """Provide an `HttpCrawler` with request retries and retry-on-blocked switched off."""
    no_retry_options = {'retry_on_blocked': False, 'max_request_retries': 0}
    return HttpCrawler(request_handler=mock_request_handler, **no_retry_options)


async def test_fetches_html(
    crawler: HttpCrawler,
    mock_request_handler: AsyncMock,
    server_url: URL,
) -> None:
    """Verify that a single added request reaches the request handler exactly once."""
    target_url = str(server_url)

    await crawler.add_requests([target_url])
    await crawler.run()

    mock_request_handler.assert_called_once()

    # The crawling context is the handler's first positional argument.
    context = mock_request_handler.call_args[0][0]
    assert context.request.url == target_url


async def test_handles_redirects(crawler: HttpCrawler, mock_request_handler: AsyncMock, server_url: URL) -> None:
    """Verify that after a redirect `loaded_url` is the target while `url` stays the original URL."""
    final_url = str(server_url)
    redirecting_url = str(server_url.with_path('redirect').with_query(url=final_url))

    await crawler.add_requests([redirecting_url])
    await crawler.run()

    mock_request_handler.assert_called_once()

    handled_request = mock_request_handler.call_args[0][0].request
    assert handled_request.loaded_url == final_url
    assert handled_request.url == redirecting_url


@pytest.mark.parametrize(
    ('additional_http_error_status_codes', 'ignore_http_error_status_codes', 'expected_number_error'),
    [
        # error without retry for all 4xx statuses
        pytest.param([], [], 1, id='default_behavior'),
        # make retry for codes in `additional_http_error_status_codes` list
        pytest.param([402], [], 3, id='additional_status_codes'),
        # take as successful status codes from the `ignore_http_error_status_codes` list
        pytest.param([], [402], 0, id='ignore_error_status_codes'),
        # check precedence for `additional_http_error_status_codes`
        pytest.param([402], [402], 3, id='additional_and_ignore'),
    ],
)
async def test_handles_client_errors(
    additional_http_error_status_codes: list[int],
    ignore_http_error_status_codes: list[int],
    expected_number_error: int,
    mock_request_handler: AsyncMock,
    server_url: URL,
) -> None:
    """Verify how a 402 response is counted under different status-code override settings."""
    status_url = str(server_url / 'status/402')
    crawler = HttpCrawler(
        max_request_retries=2,
        ignore_http_error_status_codes=ignore_http_error_status_codes,
        additional_http_error_status_codes=additional_http_error_status_codes,
        request_handler=mock_request_handler,
    )

    await crawler.add_requests([status_url])
    await crawler.run()

    assert crawler.statistics.error_tracker.total == expected_number_error

    # The request handler must only run when the status code was not treated as an error.
    if not expected_number_error:
        mock_request_handler.assert_called()
    else:
        mock_request_handler.assert_not_called()


@pytest.mark.parametrize(
    ('ignore_http_error_status_codes', 'use_session_pool', 'expected_session_rotate', 'expected_number_error'),
    [
        # change session and retry for no block 4xx statuses
        pytest.param([], True, 4, 1, id='default_behavior'),
        # error without retry for all 4xx statuses
        pytest.param([], False, 0, 1, id='default_behavior_without_session_pool'),
        # take as successful status codes from the `ignore_http_error_status_codes` list with Session Pool
        pytest.param([403], True, 0, 0, id='ignore_error_status_codes'),
        # take as successful status codes from the `ignore_http_error_status_codes` list without Session Pool
        pytest.param([403], False, 0, 0, id='ignore_error_status_codes_without_session_pool'),
    ],
)
async def test_handles_session_block_errors(
    *,
    ignore_http_error_status_codes: list[int],
    use_session_pool: bool,
    expected_session_rotate: int,
    expected_number_error: int,
    mock_request_handler: AsyncMock,
    server_url: URL,
) -> None:
    """Verify how a 403 response is counted with and without a session pool or ignore list."""
    status_url = str(server_url / 'status/403')
    crawler = HttpCrawler(
        use_session_pool=use_session_pool,
        max_session_rotations=5,
        max_request_retries=3,
        ignore_http_error_status_codes=ignore_http_error_status_codes,
        request_handler=mock_request_handler,
    )

    await crawler.add_requests([status_url])
    await crawler.run()

    statistics = crawler.statistics
    assert statistics.error_tracker.total == expected_number_error
    assert statistics.error_tracker_retry.total == expected_session_rotate

    # The request handler must only run when the status code was not treated as an error.
    if not expected_number_error:
        mock_request_handler.assert_called()
    else:
        mock_request_handler.assert_not_called()


async def test_handles_server_error(crawler: HttpCrawler, mock_request_handler: AsyncMock, server_url: URL) -> None:
    """Verify that the request handler is never invoked for a 500 response."""
    failing_url = str(server_url / 'status/500')

    await crawler.add_requests([failing_url])
    await crawler.run()

    mock_request_handler.assert_not_called()


async def test_stores_cookies(http_client: HttpClient, server_url: URL) -> None:
    """Verify that cookies set by several responses accumulate in the single session that handled them."""
    visited_urls = Mock()
    used_session_ids = Mock()

    set_cookies_url = server_url.with_path('set_cookies')
    start_urls = [
        str(set_cookies_url.extend_query(a=1)),
        str(set_cookies_url.extend_query(b=2)),
        str(set_cookies_url.extend_query(c=3)),
    ]

    async with SessionPool(max_pool_size=1) as session_pool:
        crawler = HttpCrawler(
            # /cookies/set might redirect us to a page that we can't access - no problem, we only care about cookies
            ignore_http_error_status_codes=[401],
            session_pool=session_pool,
            http_client=http_client,
        )

        @crawler.router.default_handler
        async def record_visit(context: HttpCrawlingContext) -> None:
            visited_urls(context.request.url)
            used_session_ids(context.session.id if context.session else None)

        await crawler.run(start_urls)

        distinct_urls = {recorded[0][0] for recorded in visited_urls.call_args_list}
        assert len(distinct_urls) == 3

        # The pool holds a single session, so every request must have used the same one.
        distinct_session_ids = {recorded[0][0] for recorded in used_session_ids.call_args_list}
        assert len(distinct_session_ids) == 1

        session = await session_pool.get_session_by_id(distinct_session_ids.pop())
        assert session is not None

        stored_cookies = {}
        for cookie in session.cookies.get_cookies_as_dicts():
            stored_cookies[cookie['name']] = cookie['value']
        assert stored_cookies == {
            'a': '1',
            'b': '2',
            'c': '3',
        }


async def test_do_not_retry_on_client_errors(crawler: HttpCrawler, server_url: URL) -> None:
    """Verify that a 400 response fails the request without any retry."""
    bad_request_url = str(server_url / 'status/400')
    await crawler.add_requests([bad_request_url])

    final_stats = await crawler.run()

    # Client errors are not retried by default, so there is one request that failed on its only attempt.
    assert final_stats.requests_total == 1
    assert final_stats.requests_failed == 1
    assert final_stats.retry_histogram == [1]


async def test_http_status_statistics(crawler: HttpCrawler, server_url: URL) -> None:
    """Verify the per-status-code response counters collected by the crawler statistics."""
    # Ten distinct URLs (differing only in the `id` query parameter) are added for each endpoint.
    endpoints = (
        server_url.with_path('status/500'),
        server_url.with_path('status/402'),
        server_url.with_path('status/403'),
        server_url,
    )
    for endpoint in endpoints:
        await crawler.add_requests([str(endpoint.with_query(id=i)) for i in range(10)])

    await crawler.run()

    expected_counts = {
        '200': 10,
        '403': 100,  # block errors change session and retry
        '402': 10,  # client errors are not retried by default
        '500': 40,  # server errors are retried by default
    }
    assert crawler.statistics.state.requests_with_status_code == expected_counts


async def test_sending_payload_as_raw_data(http_client: HttpClient, server_url: URL) -> None:
    """Verify that a POST payload sent without a content type is received by the server as raw data."""
    echoed = []
    crawler = HttpCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def collect_response(context: HttpCrawlingContext) -> None:
        # The post endpoint returns the provided payload in the response.
        body = await context.http_response.read()
        echoed.append(json.loads(body))

    raw_payload = urlencode(PAYLOAD).encode()
    post_request = Request.from_url(
        url=str(server_url / 'post'),
        method='POST',
        payload=raw_payload,
    )

    await crawler.run([post_request])

    assert len(echoed) == 1, 'Request handler should be called exactly once.'
    echo = echoed[0]
    assert echo['data'].encode() == raw_payload, 'Response payload data does not match the sent payload.'

    # `parse_qs` yields a list of values per key, so single-item lists are flattened before comparing
    # the reconstructed payload with the original one.
    reconstructed = {}
    for key, values in parse_qs(echo['data']).items():
        reconstructed[key] = values[0] if len(values) == 1 else values
    assert reconstructed == PAYLOAD, 'The reconstructed payload data does not match the sent payload.'

    assert echo['json'] is None, 'Response JSON data should be empty when only raw data is sent.'
    assert echo['form'] == {}, 'Response form data should be empty when only raw data is sent.'


async def test_sending_payload_as_form_data(http_client: HttpClient, server_url: URL) -> None:
    """Verify that a URL-encoded POST payload with a form content type is received as form data."""
    echoed = []
    crawler = HttpCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def collect_response(context: HttpCrawlingContext) -> None:
        # The /post endpoint returns the provided payload in the response.
        body = await context.http_response.read()
        echoed.append(json.loads(body))

    form_payload = urlencode(PAYLOAD).encode()
    post_request = Request.from_url(
        url=str(server_url / 'post'),
        method='POST',
        headers={'content-type': 'application/x-www-form-urlencoded'},
        payload=form_payload,
    )

    await crawler.run([post_request])

    assert len(echoed) == 1, 'Request handler should be called exactly once.'
    echo = echoed[0]
    assert echo['form'] == PAYLOAD, 'Form data in response does not match the sent payload.'

    assert echo['json'] is None, 'Response JSON data should be empty when only form data is sent.'
    assert echo['data'] == '', 'Response raw data should be empty when only form data is sent.'


async def test_sending_payload_as_json(http_client: HttpClient, server_url: URL) -> None:
    """Verify that a JSON POST payload is received by the server both as raw data and as parsed JSON."""
    echoed = []
    crawler = HttpCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def collect_response(context: HttpCrawlingContext) -> None:
        # The /post endpoint returns the provided payload in the response.
        body = await context.http_response.read()
        echoed.append(json.loads(body))

    encoded_json = json.dumps(PAYLOAD).encode()
    post_request = Request.from_url(
        url=str(server_url / 'post'),
        method='POST',
        headers={'content-type': 'application/json'},
        payload=encoded_json,
    )

    await crawler.run([post_request])

    assert len(echoed) == 1, 'Request handler should be called exactly once.'
    echo = echoed[0]
    assert echo['data'].encode() == encoded_json, 'Response raw JSON data does not match the sent payload.'
    assert echo['json'] == PAYLOAD, 'Response JSON data does not match the sent payload.'

    assert echo['form'] == {}, 'Response form data should be empty when only JSON data is sent.'


async def test_sending_url_query_params(http_client: HttpClient, server_url: URL) -> None:
    """Verify that query parameters in the request URL reach the server unchanged."""
    echoed = []
    crawler = HttpCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def collect_response(context: HttpCrawlingContext) -> None:
        # The /get endpoint returns the provided query parameters in the response.
        body = await context.http_response.read()
        echoed.append(json.loads(body))

    sent_params = {'param1': 'value1', 'param2': 'value2'}
    url_with_params = (server_url / 'get').extend_query(sent_params)

    await crawler.run([Request.from_url(url=str(url_with_params))])

    assert len(echoed) == 1, 'Request handler should be called exactly once.'

    received_params = echoed[0]['args']
    assert received_params == sent_params, 'Reconstructed query params must match the original query params.'


async def test_http_crawler_pre_navigation_hook_execution(server_url: URL) -> None:
    """Test that a registered pre-navigation hook runs, and runs before the request is made."""
    loaded_url_seen = AsyncMock()
    crawler = HttpCrawler(request_handler=AsyncMock())

    @crawler.pre_navigation_hook
    async def record_loaded_url(context: BasicCrawlingContext) -> None:
        await loaded_url_seen(context.request.loaded_url)

    await crawler.run([str(server_url)])

    # The hook fires before the request is made, so no loaded URL is available yet.
    loaded_url_seen.assert_called_once_with(None)


async def test_http_crawler_post_navigation_hook_execution(server_url: URL) -> None:
    """Test that a registered post-navigation hook runs, and runs after the request is made."""
    loaded_url_seen = AsyncMock()
    crawler = HttpCrawler(request_handler=AsyncMock())
    target_url = str(server_url)

    @crawler.post_navigation_hook
    async def record_loaded_url(context: HttpCrawlingContext) -> None:
        await loaded_url_seen(context.request.loaded_url)

    await crawler.run([target_url])

    # The hook fires after the request is made, so the loaded URL is the resulting URL.
    loaded_url_seen.assert_called_once_with(target_url)


async def test_http_crawler_navigation_hooks_order(server_url: URL) -> None:
    """Test that pre-navigation hooks, post-navigation hooks and the handler run in the expected order."""
    calls: list[str] = []
    crawler = HttpCrawler()

    # Two pre-navigation hooks, registered in the order they are expected to run.
    @crawler.pre_navigation_hook
    async def first_pre_hook(_context: BasicCrawlingContext) -> None:
        calls.append('pre-navigation-hook 1')

    @crawler.pre_navigation_hook
    async def second_pre_hook(_context: BasicCrawlingContext) -> None:
        calls.append('pre-navigation-hook 2')

    # Two post-navigation hooks, registered in the order they are expected to run.
    @crawler.post_navigation_hook
    async def first_post_hook(_context: HttpCrawlingContext) -> None:
        calls.append('post-navigation-hook 1')

    @crawler.post_navigation_hook
    async def second_post_hook(_context: HttpCrawlingContext) -> None:
        calls.append('post-navigation-hook 2')

    # The final request handler.
    @crawler.router.default_handler
    async def final_handler(_context: HttpCrawlingContext) -> None:
        calls.append('final handler')

    await crawler.run([str(server_url)])

    expected_order = [
        'pre-navigation-hook 1',
        'pre-navigation-hook 2',
        'post-navigation-hook 1',
        'post-navigation-hook 2',
        'final handler',
    ]
    assert calls == expected_order


async def test_isolation_cookies(http_client: HttpClient, server_url: URL) -> None:
    """Check that cookies collected by one session are not visible to a different session."""
    seen_session_ids: list[str] = []
    cookies_in_session: dict[str, dict[str, str]] = {}
    cookies_in_response: dict[str, dict[str, str]] = {}

    # The pool holds a single session and the crawler runs a single task at a time.
    single_session_pool = SessionPool(
        max_pool_size=1,
        create_session_settings={
            'max_error_score': 50,
        },
    )
    crawler = HttpCrawler(
        session_pool=single_session_pool,
        http_client=http_client,
        max_request_retries=10,
        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        session = context.session
        if not session:
            return

        seen_session_ids.append(session.id)

        # Only the two `/cookies` requests (unique keys '1' and '2') are inspected in detail.
        if context.request.unique_key not in {'1', '2'}:
            return

        stored_cookies = session.cookies.get_cookies_as_dicts()
        cookies_in_session[session.id] = {cookie['name']: cookie['value'] for cookie in stored_cookies}

        body = await context.http_response.read()
        cookies_in_response[session.id] = json.loads(body).get('cookies')

        if context.request.user_data.get('retire_session'):
            session.retire()

    cookies_url = str(server_url.with_path('/cookies'))
    await crawler.run(
        [
            # Request 1: the server sets the cookie, which lands in the session.
            str(server_url.with_path('set_cookies').extend_query(a=1)),
            # Request 2: inspect the cookies held by the session, then retire that session.
            Request.from_url(cookies_url, unique_key='1', user_data={'retire_session': True}),
            # Request 3: handled by a new session, which must not see the previous session's cookies.
            Request.from_url(cookies_url, unique_key='2'),
        ]
    )

    assert len(cookies_in_session) == 2
    assert len(cookies_in_response) == 2

    # The first two requests share a session; the third one gets a different session.
    assert seen_session_ids[0] == seen_session_ids[1]

    cookie_session_id = seen_session_ids[0]
    clean_session_id = seen_session_ids[2]

    assert cookie_session_id != clean_session_id

    # For the session that received the cookie, the session store and the server response must agree.
    assert cookies_in_session[cookie_session_id] == cookies_in_response[cookie_session_id] == {'a': '1'}

    # The clean session must hold no cookies and the server must have received none,
    # which shows that nothing leaks through the HTTP client.
    assert cookies_in_session[clean_session_id] == cookies_in_response[clean_session_id] == {}


async def test_store_complex_cookies(server_url: URL) -> None:
    """Check that cookies with various attributes are stored in the session with all their properties."""
    visit = Mock()
    track_session_usage = Mock()
    async with SessionPool(max_pool_size=1) as session_pool:
        crawler = HttpCrawler(session_pool=session_pool)

        @crawler.router.default_handler
        async def handler(context: HttpCrawlingContext) -> None:
            visit(context.request.url)
            track_session_usage(context.session.id if context.session else None)

        await crawler.run([str(server_url / 'set_complex_cookies')])

        # Exactly one URL must have been handled, and by exactly one session.
        visited = {call[0][0] for call in visit.call_args_list}
        assert len(visited) == 1

        session_ids = {call[0][0] for call in track_session_usage.call_args_list}
        assert len(session_ids) == 1

        session = await session_pool.get_session_by_id(session_ids.pop())
        assert session is not None

        session_cookies_dict = {cookie['name']: cookie for cookie in session.cookies.get_cookies_as_dicts()}

        assert len(session_cookies_dict) == 6

        # cookie string: 'basic=1; Path=/; HttpOnly; SameSite=Lax'
        assert session_cookies_dict['basic'] == {
            'name': 'basic',
            'value': '1',
            'domain': server_url.host,
            'path': '/',
            'secure': False,
            'http_only': True,
            'same_site': 'Lax',
        }

        # cookie string: 'withpath=2; Path=/html; SameSite=None'
        assert session_cookies_dict['withpath'] == {
            'name': 'withpath',
            'value': '2',
            'domain': server_url.host,
            'path': '/html',
            'secure': False,
            'http_only': False,
            'same_site': 'None',
        }

        # cookie string: 'strict=3; Path=/; SameSite=Strict'
        assert session_cookies_dict['strict'] == {
            'name': 'strict',
            'value': '3',
            'domain': server_url.host,
            'path': '/',
            'secure': False,
            'http_only': False,
            'same_site': 'Strict',
        }

        # cookie string: 'secure=4; Path=/; HttpOnly; Secure; SameSite=Strict'
        assert session_cookies_dict['secure'] == {
            'name': 'secure',
            'value': '4',
            'domain': server_url.host,
            'path': '/',
            'secure': True,
            'http_only': True,
            'same_site': 'Strict',
        }

        # cookie string: 'short=5; Path=/;'
        assert session_cookies_dict['short'] == {
            'name': 'short',
            'value': '5',
            'domain': server_url.host,
            'path': '/',
            'secure': False,
            'http_only': False,
        }

        # Some clients may ignore `.` at the beginning of the domain
        # https://www.rfc-editor.org/rfc/rfc6265#section-4.1.2.3
        # Both variants are accepted. A membership test is used because `assert a == b or c` would
        # always pass whenever `c` is a non-empty dict.
        domain_without_dot = {
            'name': 'domain',
            'value': '6',
            'domain': server_url.host,
            'path': '/',
            'secure': False,
            'http_only': False,
        }
        domain_with_dot = {**domain_without_dot, 'domain': f'.{server_url.host}'}
        assert session_cookies_dict['domain'] in (domain_without_dot, domain_with_dot)


def test_default_logger() -> None:
    """The logger of a default-constructed `HttpCrawler` is expected to be named 'HttpCrawler'."""
    crawler = HttpCrawler()
    assert crawler.log.name == 'HttpCrawler'


async def test_get_snapshot(server_url: URL) -> None:
    """Verify that `get_snapshot` called inside the handler yields the HTML served for the start URL."""
    # Snapshots are collected in a list; the most recent one is inspected after the run.
    captured = []
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        captured.append(await context.get_snapshot())

    await crawler.run([str(server_url)])

    snapshot = captured[-1] if captured else None

    assert snapshot is not None
    assert snapshot.html is not None
    assert snapshot.html == HELLO_WORLD.decode('utf8')


async def test_error_snapshot_through_statistics(server_url: URL) -> None:
    """Verify that a failing handler leaves a single HTML error snapshot in the key-value store."""
    crawler = HttpCrawler(statistics=Statistics.with_default_state(save_error_snapshots=True))

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        raise RuntimeError(rf'Exception /\ with file name unfriendly symbols in {context.request.url}')

    await crawler.run([str(server_url)])

    kvs = await crawler.get_key_value_store()
    # Collect only the error snapshot records; other keys, e.g. __RQ_STATE_, are ignored.
    snapshots = {
        key_info.key: await kvs.get_value(key_info.key)
        async for key_info in kvs.iterate_keys()
        if 'ERROR_SNAPSHOT' in key_info.key
    }

    snapshot_key = next(iter(snapshots))
    error_tracker = crawler.statistics.error_tracker

    # The same error is counted four times in total (per the original note: one error, retried three times).
    assert error_tracker.total == 4
    assert error_tracker.unique_error_count == 1
    assert len(snapshots) == 1
    assert snapshot_key.endswith('.html')
    assert snapshots[snapshot_key] == HELLO_WORLD.decode('utf8')


async def test_request_state(server_url: URL) -> None:
    """Record `Request.state` inside hooks and handlers for one succeeding and one failing request."""
    queue = await RequestQueue.open(alias='http_request_state')
    crawler = HttpCrawler(request_manager=queue)

    ok_request = Request.from_url(str(server_url))
    assert ok_request.state == RequestState.UNPROCESSED

    failing_request = Request.from_url(str(server_url / 'error'), user_data={'cause_error': True})

    # Maps request unique key -> {stage name -> state observed at that stage}.
    observed: dict[str, dict[str, RequestState]] = {ok_request.unique_key: {}, failing_request.unique_key: {}}

    @crawler.pre_navigation_hook
    async def pre_navigation_hook(context: BasicCrawlingContext) -> None:
        request = context.request
        observed[request.unique_key]['pre_navigation'] = request.state

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        request = context.request
        # The failing request raises before its state is recorded for this stage.
        if request.user_data.get('cause_error'):
            raise ValueError('Caused error as requested')
        observed[request.unique_key]['request_handler'] = request.state

    @crawler.error_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        request = context.request
        observed[request.unique_key]['error_handler'] = request.state

    @crawler.failed_request_handler
    async def failed_request_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        request = context.request
        observed[request.unique_key]['failed_request_handler'] = request.state

    await crawler.run([ok_request, failing_request])

    # Successful request: ends as DONE and only passes through the navigation and handler stages.
    stored_ok_request = await queue.get_request(ok_request.unique_key)

    assert stored_ok_request is not None
    assert stored_ok_request.state == RequestState.DONE

    expected_ok_states = {
        'pre_navigation': RequestState.BEFORE_NAV,
        'request_handler': RequestState.REQUEST_HANDLER,
    }
    assert observed[ok_request.unique_key] == expected_ok_states

    # Failing request: ends as ERROR and passes through both error-related handlers.
    stored_failing_request = await queue.get_request(failing_request.unique_key)
    assert stored_failing_request is not None
    assert stored_failing_request.state == RequestState.ERROR

    expected_failing_states = {
        'pre_navigation': RequestState.BEFORE_NAV,
        'error_handler': RequestState.ERROR_HANDLER,
        'failed_request_handler': RequestState.ERROR,
    }
    assert observed[failing_request.unique_key] == expected_failing_states

    await queue.drop()


================================================
FILE: tests/unit/crawlers/_parsel/test_parsel_crawler.py
================================================
from __future__ import annotations

import sys
from typing import TYPE_CHECKING
from unittest import mock

import pytest

from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction, SkippedReason
from crawlee.crawlers import ParselCrawler
from crawlee.storages import RequestQueue

if TYPE_CHECKING:
    from yarl import URL

    from crawlee._request import RequestOptions
    from crawlee.crawlers import BasicCrawlingContext, ParselCrawlingContext
    from crawlee.http_clients._base import HttpClient


async def test_basic(server_url: URL, http_client: HttpClient) -> None:
    """Crawl one page and check that the Parsel selector extracts the anchor hrefs from it."""
    handler = mock.AsyncMock()
    crawler = ParselCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        hrefs = context.selector.css('a::attr(href)').getall()
        await handler(hrefs)

    start_url = str(server_url / 'start_enqueue')
    await crawler.run([start_url])

    assert handler.called

    # Three links are expected to be found by the handler.
    found_links = handler.call_args[0][0]
    assert len(found_links) == 3


async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
    """Start from a redirecting URL and check that links found on the redirect target are enqueued and visited."""
    redirect_target = str(server_url / 'start_enqueue')
    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))

    visit = mock.Mock()
    crawler = ParselCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        visit(str(context.request.url))
        await context.enqueue_links()

    await crawler.run([redirect_url])

    followed_paths = [
        'sub_index',
        'page_1',
        'page_2',
        'page_3',
        'page_4',
        'base_page',
        'base_subpath/page_5',
    ]
    expected_visit_calls = [mock.call(redirect_url)]
    expected_visit_calls.extend(mock.call(str(server_url / path)) for path in followed_paths)

    # The start request must be handled first; the remaining pages may be visited in any order.
    assert visit.mock_calls[0] == expected_visit_calls[0]
    visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
    """Check that `enqueue_links` can follow URLs taken from a non-href attribute (`img[src]`)."""
    redirect_target = str(server_url / 'start_enqueue_non_href')
    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))

    visit = mock.Mock()
    crawler = ParselCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(selector='img', attribute='src')

    await crawler.run([redirect_url])

    image_paths = ['base_subpath/image_1', 'image_2']
    expected_visit_calls = [mock.call(redirect_url)]
    expected_visit_calls.extend(mock.call(str(server_url / path)) for path in image_paths)
    visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None:
    """Passing both `requests` and `selector` to `enqueue_links` is expected to raise `ValueError`."""
    caught_errors = []
    crawler = ParselCrawler(max_request_retries=1)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        explicit_requests = [Request.from_url(str(server_url / 'start_enqueue'))]
        try:
            # The overloads forbid this combination; here the runtime enforcement is exercised.
            await context.enqueue_links(requests=explicit_requests, selector='a')
        except Exception as error:
            caught_errors.append(error)

    await crawler.run([str(server_url)])

    assert len(caught_errors) == 1
    assert type(caught_errors[0]) is ValueError


async def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None:
    """Check that `enqueue_links` called with a CSS selector (`a.foo`) leads to the expected visits."""
    visit = mock.Mock()
    crawler = ParselCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(selector='a.foo')

    await crawler.run([str(server_url / 'start_enqueue')])

    expected_visit_calls = [mock.call(str(server_url / path)) for path in ('start_enqueue', 'sub_index')]
    visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None:
    """Check that `max_requests_per_crawl` caps the number of processed requests."""
    handled_urls = []

    # A single concurrent task keeps the `max_requests_per_crawl` accounting exact.
    sequential = ConcurrencySettings(desired_concurrency=1, max_concurrency=1)
    crawler = ParselCrawler(
        concurrency_settings=sequential,
        max_requests_per_crawl=3,
        http_client=http_client,
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        await context.enqueue_links()
        handled_urls.append(context.request.url)

    stats = await crawler.run([str(server_url / 'start_enqueue')])

    # Exactly three requests must have been processed, matching the configured limit.
    assert len(handled_urls) == 3
    assert stats.requests_total == 3
    assert stats.requests_finished == 3


async def test_enqueue_links_with_transform_request_function(server_url: URL, http_client: HttpClient) -> None:
    """Check that `transform_request_function` can skip links and modify the options of enqueued requests."""
    visit = mock.Mock()
    seen_headers = []
    crawler = ParselCrawler(http_client=http_client)

    def transform_request(
        request_options: RequestOptions,
    ) -> RequestOptions | RequestTransformAction:
        # Links to `page_3` are dropped; every other request gets a custom header attached.
        if 'page_3' in request_options['url']:
            return 'skip'

        request_options['headers'] = HttpHeaders({'transform-header': 'my-header'})
        return request_options

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        visit(context.request.url)
        seen_headers.append(context.request.headers)
        await context.enqueue_links(transform_request_function=transform_request, label='test')

    await crawler.run([str(server_url / 'start_enqueue')])

    # `page_3` is skipped by the transform function, so it is not among the expected visits.
    visited_paths = [
        'start_enqueue',
        'sub_index',
        'page_1',
        'page_2',
        'page_4',
        'base_page',
        'base_subpath/page_5',
    ]
    expected_visit_calls = [mock.call(str(server_url / path)) for path in visited_paths]
    visit.assert_has_calls(expected_visit_calls, any_order=True)

    # Requests created through `enqueue_links` must carry the custom header; index 0 is the start request.
    for index in (1, 2, 3):
        assert seen_headers[index]['transform-header'] == 'my-header'


async def test_handle_blocked_request(server_url: URL, http_client: HttpClient) -> None:
    """A request to the `incapsula` endpoint is expected to end up as a failed request."""
    blocked_url = str(server_url / 'incapsula')
    crawler = ParselCrawler(max_session_rotations=1, http_client=http_client)

    stats = await crawler.run([blocked_url])

    assert stats.requests_failed == 1


async def test_handle_blocked_status_code(server_url: URL, http_client: HttpClient) -> None:
    """A 403 response is expected to be recorded as a failure in the statistics and the error tracker."""
    crawler = ParselCrawler(max_session_rotations=1, http_client=http_client)
    statistics = crawler._statistics
    error_tracker = statistics.error_tracker

    # Wrap (rather than replace) the internal calls so that they keep working while being observed.
    failure_spy_patch = mock.patch.object(
        statistics,
        'record_request_processing_failure',
        wraps=statistics.record_request_processing_failure,
    )
    tracker_spy_patch = mock.patch.object(error_tracker, 'add', wraps=error_tracker.add)

    with failure_spy_patch as record_request_processing_failure, tracker_spy_patch as error_tracker_add:
        stats = await crawler.run([str(server_url / 'status/403')])

    assert stats.requests_failed == 1
    assert record_request_processing_failure.called
    assert error_tracker_add.called
    assert crawler._statistics.error_tracker.total == 1


# TODO: Remove the skip mark when the test is fixed:
# https://github.com/apify/crawlee-python/issues/838
@pytest.mark.skip(reason='The test does not work with `crawlee._utils.try_import.ImportWrapper`.')
def test_import_error_handled() -> None:
    """Check that importing `ParselCrawler` while `parsel` is unavailable raises an informative `ImportError`.

    The test is currently skipped; see the issue linked above.
    """
    # Simulate ImportError for parsel: a `None` entry in `sys.modules` makes `import parsel` fail.
    with mock.patch.dict('sys.modules', {'parsel': None}):
        # Invalidate the cached ParselCrawler modules so that the import below is executed afresh.
        sys.modules.pop('crawlee.crawlers', None)
        sys.modules.pop('crawlee.crawlers._parsel', None)
        with pytest.raises(ImportError) as import_error:
            from crawlee.crawlers import ParselCrawler  # noqa: F401 PLC0415

    # Check if the raised ImportError contains the expected message.
    # NOTE(review): the two literals are joined without a separating space; presumably this mirrors
    # the library's message exactly - confirm before changing either side.
    assert str(import_error.value) == (
        "To import this, you need to install the 'parsel' extra."
        "For example, if you use pip, run `pip install 'crawlee[parsel]'`."
    )


async def test_json(server_url: URL, http_client: HttpClient) -> None:
    """Check that the selector can query a JSON response with a JMESPath expression."""
    handler = mock.AsyncMock()
    crawler = ParselCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        matches = context.selector.jmespath('hello').getall()
        await handler(matches)

    await crawler.run([str(server_url / 'json')])

    assert handler.called

    extracted = handler.call_args[0][0]
    assert extracted == ['world']


async def test_xml(server_url: URL, http_client: HttpClient) -> None:
    """Check that the selector can query an XML response with a CSS expression."""
    handler = mock.AsyncMock()
    crawler = ParselCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        matches = context.selector.css('hello').getall()
        await handler(matches)

    await crawler.run([str(server_url / 'xml')])

    assert handler.called

    extracted = handler.call_args[0][0]
    assert extracted == ['<hello>world</hello>']


def test_default_logger() -> None:
    """The logger of a default-constructed `ParselCrawler` is expected to be named 'ParselCrawler'."""
    crawler = ParselCrawler()
    assert crawler.log.name == 'ParselCrawler'


async def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None:
    """Crawl with `respect_robots_txt_file=True` and check that the expected pages are visited."""
    visit = mock.Mock()
    crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links()

    await crawler.run([str(server_url / 'start_enqueue')])

    visited_paths = ['start_enqueue', 'sub_index', 'base_page', 'base_subpath/page_5']
    expected_visit_calls = [mock.call(str(server_url / path)) for path in visited_paths]
    visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
    """Exercise the crawler on links for which retrieving robots.txt may be problematic."""
    visit = mock.Mock()
    fail = mock.Mock()
    crawler = ParselCrawler(
        http_client=http_client,
        respect_robots_txt_file=True,
        max_request_retries=0,
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(strategy='all')

    @crawler.failed_request_handler
    async def failed_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        fail(context.request.url)

    start_url = str(server_url / 'problematic_links')
    await crawler.run([start_url])

    # The email link must be skipped.
    # https://avatars.githubusercontent.com/apify does not provide robots.txt, yet it is a valid crawl target.
    expected_visit_calls = [
        mock.call(start_url),
        mock.call('https://avatars.githubusercontent.com/apify'),
    ]
    visit.assert_has_calls(expected_visit_calls, any_order=True)

    # budplaceholder.com does not exist, so its request must end up in the failed handler.
    expected_fail_calls = [mock.call('https://budplaceholder.com/')]
    fail.assert_has_calls(expected_fail_calls, any_order=True)


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
    """Check that the `on_skipped_request` callback receives the URLs skipped during the crawl."""
    skip = mock.Mock()
    crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        await context.enqueue_links()

    @crawler.on_skipped_request
    async def skipped_hook(url: str, _reason: SkippedReason) -> None:
        skip(url)

    await crawler.run([str(server_url / 'start_enqueue')])

    skipped_paths = ['page_1', 'page_2', 'page_3', 'page_4']
    expected_skip_calls = [mock.call(str(server_url / path)) for path in skipped_paths]
    skip.assert_has_calls(expected_skip_calls, any_order=True)


async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:
    """Check that `extract_links` returns link requests while honouring the `exclude` patterns."""
    collected_urls: list[str] = []
    crawler = ParselCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        excluded = [Glob(f'{server_url}sub_index')]
        for request in await context.extract_links(exclude=excluded):
            collected_urls.append(request.url)

    await crawler.run([str(server_url / 'start_enqueue')])

    # With `sub_index` excluded, a single link is expected to remain.
    assert len(collected_urls) == 1
    assert collected_urls[0] == str(server_url / 'page_1')


async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None:
    """Check that `extract_links` can read URLs from a custom attribute (`li[data-href]`)."""
    collected_urls: list[str] = []
    crawler = ParselCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        for request in await context.extract_links(selector='li', attribute='data-href'):
            collected_urls.append(request.url)

    await crawler.run([str(server_url / 'non_href_links')])

    assert len(collected_urls) == 1
    assert collected_urls[0] == str(server_url / 'page_2')


@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'by_id'),
    [
        pytest.param('named-queue', None, False, id='with rq_name'),
        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
        pytest.param('id-queue', None, True, id='with rq_id'),
    ],
)
async def test_enqueue_links_with_rq_param(
    server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool
) -> None:
    """Check that `enqueue_links` can target another request queue selected by name, alias or id."""
    crawler = ParselCrawler(http_client=http_client)
    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)

    # In the by-id scenario the queue is opened by name but addressed only through its id afterwards.
    queue_id = rq.id if by_id else None
    if by_id:
        queue_name = None

    visit_urls: set[str] = set()

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        visit_urls.add(context.request.url)
        await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)

    start_url = str(server_url / 'start_enqueue')
    await crawler.run([start_url])

    # Drain the target queue to see what was enqueued into it.
    queued_urls: set[str] = set()
    while request := await rq.fetch_next_request():
        queued_urls.add(request.url)

    # The links went to the other queue, so the crawler itself only handled the start URL.
    assert queued_urls == {str(server_url / 'page_1'), str(server_url / 'sub_index')}
    assert visit_urls == {start_url}

    await rq.drop()


@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'by_id'),
    [
        pytest.param('named-queue', None, False, id='with rq_name'),
        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
        pytest.param('id-queue', None, True, id='with rq_id'),
    ],
)
async def test_enqueue_links_requests_with_rq_param(
    server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool
) -> None:
    """Check that explicit `requests` passed to `enqueue_links` land in the queue chosen by name, alias or id."""
    crawler = ParselCrawler(http_client=http_client)
    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)

    # In the by-id scenario the queue is opened by name but addressed only through its id afterwards.
    queue_id = rq.id if by_id else None
    if by_id:
        queue_name = None

    visit_urls: set[str] = set()

    check_requests: list[str] = [
        'https://a.placeholder.com',
        'https://b.placeholder.com',
        'https://c.placeholder.com',
    ]

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        visit_urls.add(context.request.url)
        await context.enqueue_links(
            requests=check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias, strategy='all'
        )

    start_url = str(server_url / 'start_enqueue')
    await crawler.run([start_url])

    # Drain the target queue to see what was enqueued into it.
    queued_urls: set[str] = set()
    while request := await rq.fetch_next_request():
        queued_urls.add(request.url)

    # The explicit requests went to the other queue, so the crawler itself only handled the start URL.
    assert queued_urls == set(check_requests)
    assert visit_urls == {start_url}

    await rq.drop()


# The argument names are listed in the same order as the values below (name, alias, id) so that
# every case id describes the combination of parameters that is actually passed.
@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'queue_id'),
    [
        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),
        pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'),
        pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),
        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'),
    ],
)
async def test_enqueue_links_error_with_multi_params(
    server_url: URL, http_client: HttpClient, queue_id: str | None, queue_name: str | None, queue_alias: str | None
) -> None:
    """Check that `enqueue_links` rejects more than one of `rq_id`, `rq_name` and `rq_alias` at once."""
    crawler = ParselCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # Any combination of two or more queue selectors is expected to raise `ValueError`.
        with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'):
            await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)

    await crawler.run([str(server_url / 'start_enqueue')])


async def test_enqueue_links_with_limit(server_url: URL, http_client: HttpClient) -> None:
    """Check `enqueue_links` called with `limit=1` starting from the `sub_index` page."""
    start_url = str(server_url / 'sub_index')

    visit = mock.Mock()
    crawler = ParselCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(limit=1)

    await crawler.run([start_url])

    # Because of the limit, only one link from `sub_index` is expected to be enqueued.
    expected_visit_calls = [mock.call(start_url), mock.call(str(server_url / 'page_3'))]
    visit.assert_has_calls(expected_visit_calls, any_order=True)


================================================
FILE: tests/unit/crawlers/_playwright/test_playwright_crawler.py
================================================
from __future__ import annotations

import asyncio
import json
import logging
from datetime import timedelta
from typing import TYPE_CHECKING, Any, Literal
from unittest import mock
from unittest.mock import AsyncMock, Mock

import pytest

from crawlee import (
    ConcurrencySettings,
    Glob,
    HttpHeaders,
    Request,
    RequestState,
    RequestTransformAction,
    SkippedReason,
    service_locator,
)
from crawlee.configuration import Configuration
from crawlee.crawlers import PlaywrightCrawler
from crawlee.fingerprint_suite import (
    DefaultFingerprintGenerator,
    FingerprintGenerator,
    HeaderGeneratorOptions,
    ScreenOptions,
)
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
from crawlee.http_clients import ImpitHttpClient
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import Session, SessionPool
from crawlee.statistics import Statistics
from crawlee.statistics._error_snapshotter import ErrorSnapshotter
from crawlee.storages import RequestQueue
from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD

if TYPE_CHECKING:
    from pathlib import Path

    from yarl import URL

    from crawlee._request import RequestOptions
    from crawlee._types import HttpMethod, HttpPayload
    from crawlee.browsers._types import BrowserType
    from crawlee.crawlers import (
        BasicCrawlingContext,
        PlaywrightCrawlingContext,
        PlaywrightPostNavCrawlingContext,
        PlaywrightPreNavCrawlingContext,
    )


@pytest.mark.parametrize(
    ('method', 'path', 'payload'),
    [
        pytest.param('GET', 'get', None, id='get request'),
        pytest.param('POST', 'post', None, id='post request'),
        pytest.param('POST', 'post', b'Hello, world!', id='post request with payload'),
    ],
)
async def test_basic_request(method: HttpMethod, path: str, payload: HttpPayload, server_url: URL) -> None:
    """Send a single request with the given method and payload and inspect the resulting page."""
    start_request = Request.from_url(str(server_url / path), method=method, payload=payload)
    crawler = PlaywrightCrawler()
    result: dict = {}

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        page = context.page
        assert page is not None
        result['request_url'] = context.request.url
        result['page_url'] = page.url
        result['page_content'] = await page.content()

    await crawler.run([start_request])

    # The request URL and the page URL must both equal the original URL.
    assert result.get('request_url') == result.get('page_url') == start_request.url

    # When a payload was sent, its text must appear in the page content.
    expected_fragment = payload.decode() if payload else ''
    assert expected_fragment in result.get('page_content', '')


async def test_enqueue_links(redirect_server_url: URL, server_url: URL) -> None:
    """Check that `enqueue_links` enqueues the links of a page that was reached through a redirect."""
    target_after_redirect = str(server_url / 'start_enqueue')
    start_url = str(redirect_server_url.with_path('redirect').with_query(url=target_after_redirect))
    crawler = PlaywrightCrawler()
    visit = mock.Mock()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links()

    await crawler.run([start_url])

    enqueued_paths = (
        'sub_index',
        'page_1',
        'page_2',
        'page_3',
        'page_4',
        'base_page',
        'base_subpath/page_5',
    )
    expected_visit_calls = [mock.call(start_url)]
    expected_visit_calls.extend(mock.call(str(server_url / enqueued_path)) for enqueued_path in enqueued_paths)

    # The redirecting URL is the only start request, so it must be the first one handled.
    assert visit.mock_calls[0] == expected_visit_calls[0]
    visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL) -> None:
    """Check that `enqueue_links` can take URLs from a non-`href` attribute (`src` of `img` elements)."""
    target_after_redirect = str(server_url / 'start_enqueue_non_href')
    start_url = str(redirect_server_url.with_path('redirect').with_query(url=target_after_redirect))
    crawler = PlaywrightCrawler()
    visit = mock.Mock()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(selector='img', attribute='src')

    await crawler.run([start_url])

    image_paths = ('base_subpath/image_1', 'image_2')
    expected_visit_calls = [mock.call(start_url)]
    expected_visit_calls.extend(mock.call(str(server_url / image_path)) for image_path in image_paths)
    visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None:
    """Call `enqueue_links` with arguments that can't be used together."""
    crawler = PlaywrightCrawler(max_request_retries=1)
    caught_errors: list[Exception] = []

    @crawler.pre_navigation_hook
    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        # Fulfill every routed request with a bare 200 response.
        await context.page.route('**/*', lambda route: route.fulfill(status=200))

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        try:
            # `requests` and `selector` are mutually exclusive; this exercises the runtime check of the overloads.
            conflicting_requests = [Request.from_url('https://www.whatever.com')]
            await context.enqueue_links(requests=conflicting_requests, selector='a')
        except Exception as error:
            caught_errors.append(error)

    await crawler.run([str(server_url)])

    assert len(caught_errors) == 1
    # Exact type match is intended: subclasses of `ValueError` are not accepted.
    assert type(caught_errors[0]) is ValueError


async def test_enqueue_links_with_transform_request_function(server_url: URL) -> None:
    """Check that `transform_request_function` can both modify and skip requests created by `enqueue_links`."""
    crawler = PlaywrightCrawler()
    visit = mock.Mock()
    seen_headers = []
    start_url = str(server_url / 'start_enqueue')
    sub_index_url = str(server_url / 'sub_index')

    def keep_only_sub_index(request: RequestOptions) -> RequestOptions | RequestTransformAction:
        # Everything except the `sub_index` link is dropped; the kept request gets a marker header.
        if request['url'] != sub_index_url:
            return 'skip'
        request['headers'] = HttpHeaders({'transform-header': 'my-header'})
        return request

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        visit(context.request.url)
        seen_headers.append(context.request.headers)
        await context.enqueue_links(transform_request_function=keep_only_sub_index)

    await crawler.run([start_url])

    visit.assert_has_calls([mock.call(start_url), mock.call(sub_index_url)], any_order=True)

    # The second handled request is the one produced by `enqueue_links`, so it must carry the custom header.
    assert seen_headers[1]['transform-header'] == 'my-header'


async def test_nonexistent_url_invokes_error_handler() -> None:
    """Check that an unreachable URL triggers the error handler per retry and the failed handler once."""
    crawler = PlaywrightCrawler(max_request_retries=3, request_handler=mock.AsyncMock())

    on_error = mock.AsyncMock(return_value=None)
    on_failed = mock.AsyncMock(return_value=None)
    crawler.error_handler(on_error)
    crawler.failed_request_handler(on_failed)

    await crawler.run(['https://this-does-not-exist-22343434.com'])

    assert on_error.call_count == 3
    assert on_failed.call_count == 1


async def test_redirect_handling(server_url: URL, redirect_server_url: URL) -> None:
    """Check that a request redirected out of its `same-origin` scope never reaches the request handler."""
    crawler = PlaywrightCrawler()
    handled_urls: set[str] = set()

    target_url = str(server_url / 'start_enqueue')
    redirecting_url = str(redirect_server_url.with_path('redirect').with_query(url=target_url))

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        handled_urls.add(context.request.loaded_url or '')

    request = Request.from_url(url=redirecting_url)
    # With the `same-origin` strategy the redirect target is considered out of scope.
    request.crawlee_data.enqueue_strategy = 'same-origin'

    await crawler.run([request])

    # The handler must not have been invoked for any URL.
    assert not handled_urls


@pytest.mark.parametrize(
    'fingerprint_generator',
    [
        pytest.param(None, id='No fingerprint generator. Headers generated by header generator.'),
        pytest.param(
            DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chrome'])),
            id='Explicitly passed fingerprint generator.',
        ),
        pytest.param('default', id='Default fingerprint generator.'),
    ],
)
async def test_chromium_headless_headers(
    header_network: dict, fingerprint_generator: None | FingerprintGenerator | Literal['default'], server_url: URL
) -> None:
    """Check that headless Chromium sends known browser headers that do not reveal headless mode."""
    browser_type: BrowserType = 'chromium'
    crawler = PlaywrightCrawler(headless=True, browser_type=browser_type, fingerprint_generator=fingerprint_generator)
    headers: dict[str, str] = {}

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # The `headers` endpoint response is parsed as a JSON object of the request headers.
        body = await context.response.text()
        headers.update(json.loads(body))

    await crawler.run([str(server_url / 'headers')])

    user_agent = headers.get('user-agent')
    assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'}), user_agent

    expected_keywords = BROWSER_TYPE_HEADER_KEYWORD[fingerprint_browser_type_from_playwright_browser_type(browser_type)]
    assert any(keyword in user_agent for keyword in expected_keywords), user_agent

    for client_hint in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'):
        assert headers.get(client_hint) in get_available_header_values(header_network, client_hint)

    for revealing_header in ('sec-ch-ua', 'user-agent'):
        assert 'headless' not in headers[revealing_header].lower()


@pytest.mark.flaky(reruns=3, reason='Test is flaky.')
async def test_firefox_headless_headers(header_network: dict, server_url: URL) -> None:
    """Check that headless Firefox sends a known user agent and no `sec-ch-ua*` client hint headers."""
    browser_type: BrowserType = 'firefox'
    crawler = PlaywrightCrawler(headless=True, browser_type=browser_type)
    headers: dict[str, str] = {}

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # The `headers` endpoint response is parsed as a JSON object of the request headers.
        body = await context.response.text()
        headers.update(json.loads(body))

    await crawler.run([str(server_url / 'headers')])

    assert 'user-agent' in headers
    for client_hint in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'):
        assert client_hint not in headers

    assert 'headless' not in headers['user-agent'].lower()

    user_agent = headers.get('user-agent')
    assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'})

    expected_keywords = BROWSER_TYPE_HEADER_KEYWORD[fingerprint_browser_type_from_playwright_browser_type(browser_type)]
    assert any(keyword in user_agent for keyword in expected_keywords)


async def test_custom_headers(server_url: URL) -> None:
    """Check that headers set on the `Request` are sent by the browser."""
    crawler = PlaywrightCrawler()
    received_headers: dict[str, str] = {}
    request_headers = {'Power-Header': 'ring', 'Library': 'storm', 'My-Test-Header': 'fuzz'}

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        body = await context.response.text()
        received_headers.update(json.loads(body))

    request = Request.from_url(str(server_url / 'headers'), headers=request_headers)
    await crawler.run([request])

    # The header names are looked up lower-cased in the parsed response.
    for header_name, header_value in request_headers.items():
        assert received_headers.get(header_name.lower()) == header_value


async def test_pre_navigation_hook() -> None:
    """Check that the pre-navigation hook is invoked once for each request."""
    start_urls = ['https://test.com', 'https://test.io']
    crawler = PlaywrightCrawler(request_handler=mock.AsyncMock())
    hook_spy = mock.Mock()

    @crawler.pre_navigation_hook
    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        hook_spy()
        # Fulfill every routed request with a bare 200 response.
        await context.page.route('**/*', lambda route: route.fulfill(status=200))

    await crawler.run(start_urls)

    assert hook_spy.call_count == 2


async def test_proxy_set() -> None:
    """Check that the proxy from `ProxyConfiguration` is exposed to hooks through `context.proxy_info`."""
    proxy_value = 'http://1111:1111'
    proxy_configuration = ProxyConfiguration(proxy_urls=[proxy_value])
    crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration)

    observed = {}

    crawler.router.default_handler(mock.AsyncMock(return_value=None))

    @crawler.pre_navigation_hook
    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        proxy_info = context.proxy_info
        if proxy_info:
            # Remember which proxy was assigned to this request.
            observed['proxy'] = proxy_info.url

        # Fulfill requests from the route handler so Playwright does not make real requests.
        await context.page.route('**/*', lambda route: route.fulfill(status=200))

    await crawler.run(['https://test.com'])

    assert observed.get('proxy') == proxy_value


@pytest.mark.run_alone
@pytest.mark.parametrize(
    'use_incognito_pages',
    [
        pytest.param(False, id='without use_incognito_pages'),
        pytest.param(True, id='with use_incognito_pages'),
    ],
)
async def test_isolation_cookies(*, use_incognito_pages: bool, server_url: URL) -> None:
    """Check cookie isolation between sessions with and without `use_incognito_pages`.

    Three requests are processed one after another:
    1. `set_cookies` sets cookie `a=1`,
    2. `cookies` is read and the session is retired afterwards,
    3. `cookies` is read again; the test asserts that this request ran with a different session.
    """
    sessions_ids: list[str] = []
    sessions: dict[str, Session] = {}
    response_cookies: dict[str, dict[str, str]] = {}
    cookies_url = str(server_url.with_path('/cookies'))

    crawler = PlaywrightCrawler(
        session_pool=SessionPool(max_pool_size=1),
        use_incognito_pages=use_incognito_pages,
        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        session = context.session
        if not session:
            return

        sessions_ids.append(session.id)
        sessions[session.id] = session
        request_key = context.request.unique_key

        if request_key == '1':
            # Schedule the second request: it reads the cookies and asks for the session to be retired.
            second_request = Request.from_url(cookies_url, unique_key='2', user_data={'retire_session': True})
            await context.add_requests([second_request])
            return

        response_data = json.loads(await context.response.text())
        response_cookies[session.id] = response_data.get('cookies')

        if context.request.user_data.get('retire_session'):
            session.retire()

        if request_key == '2':
            # The third request is made with a new session to make sure it does not use another session's cookies
            await context.add_requests([Request.from_url(cookies_url, unique_key='3')])

    # The first request sets the cookie in the session
    first_request = Request.from_url(str(server_url.with_path('set_cookies').extend_query(a=1)), unique_key='1')
    await crawler.run([first_request])

    assert len(response_cookies) == 2
    assert len(sessions) == 2

    # The first two requests share one session.
    assert sessions_ids[0] == sessions_ids[1]

    sessions_cookies: dict[str, dict[str, str]] = {}
    for session_id in sessions_ids:
        stored_cookies = sessions[session_id].cookies.get_cookies_as_dicts()
        sessions_cookies[session_id] = {cookie['name']: cookie['value'] for cookie in stored_cookies}

    assert len(sessions_cookies) == 2

    cookie_session_id, clean_session_id = sessions_ids[0], sessions_ids[2]
    assert cookie_session_id != clean_session_id

    # In both modes the initiated cookies must match in the response and in the session store.
    assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}

    # With `use_incognito_pages` there is full isolation: the clean session has no cookie in the session store
    # or in the response. Without it, PlaywrightContext shares cookies between all sessions that work with it,
    # so the clean session contains the same cookies.
    expected_clean_cookies = {} if use_incognito_pages else {'a': '1'}
    assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == expected_clean_cookies


async def test_save_cookies_after_handler_processing(server_url: URL) -> None:
    """Test that cookies added to the browser context inside the handler are saved to the session."""
    async with SessionPool(max_pool_size=1) as session_pool:
        crawler = PlaywrightCrawler(session_pool=session_pool)
        used_session_ids = []

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            # Simulate cookies installed from an external source in the browser
            external_cookie = {'name': 'check', 'value': 'test', 'url': str(server_url)}
            await context.page.context.add_cookies([external_cookie])

            if context.session:
                used_session_ids.append(context.session.id)

        await crawler.run([str(server_url)])

        assert len(used_session_ids) == 1

        check_session = await session_pool.get_session()
        assert check_session.id == used_session_ids[0]

        session_cookies = {}
        for cookie in check_session.cookies.get_cookies_as_dicts():
            session_cookies[cookie['name']] = cookie['value']

        assert session_cookies == {'check': 'test'}


async def test_read_write_cookies(server_url: URL) -> None:
    """Test that Playwright cookies can be written to the session and read back without loss."""
    async with SessionPool(max_pool_size=1) as session_pool:
        crawler = PlaywrightCrawler(session_pool=session_pool)

        browser_cookies = []
        stored_cookies = []

        # Check that no errors occur when reading and writing cookies.
        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            page_cookies = await context.page.context.cookies()
            browser_cookies.extend(page_cookies)

            if not context.session:
                return
            context.session.cookies.set_cookies_from_playwright_format(page_cookies)
            stored_cookies.extend(context.session.cookies.get_cookies_as_dicts())

        await crawler.run([str(server_url / 'set_complex_cookies')])

        # Check that the cookie was received with `partitionKey`
        assert any('partitionKey' in cookie for cookie in browser_cookies)

        # Every browser cookie must have a counterpart in the session.
        assert len(browser_cookies) == len(stored_cookies)


async def test_custom_fingerprint_uses_generator_options(server_url: URL) -> None:
    """Check that the page fingerprint respects the header and screen options of the generator."""
    min_width, max_width = 300, 600
    min_height, max_height = 500, 1200

    header_options = HeaderGeneratorOptions(browsers=['firefox'], operating_systems=['android'])
    screen_options = ScreenOptions(
        min_width=min_width, max_width=max_width, min_height=min_height, max_height=max_height
    )
    fingerprint_generator = DefaultFingerprintGenerator(header_options=header_options, screen_options=screen_options)

    crawler = PlaywrightCrawler(headless=True, fingerprint_generator=fingerprint_generator)

    fingerprints: dict[str, Any] = {}
    inspected_properties = (
        'window.navigator.userAgent',
        'window.navigator.userAgentData',
        'window.screen.height',
        'window.screen.width',
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Read each property directly from the page's JavaScript environment.
        for js_property in inspected_properties:
            fingerprints[js_property] = await context.page.evaluate(f'()=>{js_property}')

    await crawler.run([str(server_url)])

    assert 'Firefox' in fingerprints['window.navigator.userAgent']
    assert fingerprints['window.navigator.userAgentData']['platform'] == 'Android'
    assert min_width <= int(fingerprints['window.screen.width']) <= max_width
    assert min_height <= int(fingerprints['window.screen.height']) <= max_height


async def test_custom_fingerprint_matches_header_user_agent(server_url: URL) -> None:
    """Test that generated fingerprint and header have matching user agent."""
    crawler = PlaywrightCrawler(headless=True, fingerprint_generator=DefaultFingerprintGenerator())
    observed: dict[str, str] = {}

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # User agent as sent to the server (the `headers` endpoint response is parsed as JSON).
        echoed_headers = dict(json.loads(await context.response.text()))
        observed['header'] = echoed_headers['user-agent']
        # User agent as reported by the page's JavaScript environment.
        observed['navigator'] = await context.page.evaluate('()=>window.navigator.userAgent')

    await crawler.run([str(server_url / 'headers')])

    assert observed['header'] == observed['navigator']


async def test_ignore_http_error_status_codes(server_url: URL) -> None:
    """Test that error codes that would normally trigger session error can be ignored."""
    target_url = str(server_url / 'status/403')
    handler_spy = Mock()
    crawler = PlaywrightCrawler(ignore_http_error_status_codes={403})

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        handler_spy(context.request.url)

    await crawler.run([target_url])

    # The 403 response must be passed to the handler exactly once.
    handler_spy.assert_called_once_with(target_url)


async def test_additional_http_error_status_codes(server_url: URL) -> None:
    """Test that use of `additional_http_error_status_codes` can raise error on common status code."""
    handler_spy = Mock()
    crawler = PlaywrightCrawler(additional_http_error_status_codes={200})

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        handler_spy(context.request.url)

    await crawler.run([str(server_url)])

    # Status 200 is treated as an error here, so the handler must never run.
    handler_spy.assert_not_called()


async def test_launch_with_user_data_dir(tmp_path: Path, server_url: URL) -> None:
    """Check that the persist context is created in the specified folder in `user_data_dir`."""
    profile_dir = tmp_path / 'Default'
    noop_handler = mock.AsyncMock(return_value=None)
    crawler = PlaywrightCrawler(headless=True, user_data_dir=tmp_path, request_handler=noop_handler)

    # The folder must not exist before the crawler runs.
    assert not profile_dir.exists()

    await crawler.run([str(server_url)])

    assert profile_dir.exists()


async def test_launch_with_user_data_dir_and_fingerprint(tmp_path: Path, server_url: URL) -> None:
    """Check that the persist context works with fingerprints."""
    profile_dir = tmp_path / 'Default'
    user_agent_key = 'window.navigator.userAgent'
    fingerprints: dict[str, str] = {}

    crawler = PlaywrightCrawler(
        headless=True,
        user_data_dir=tmp_path,
        request_handler=mock.AsyncMock(return_value=None),
        fingerprint_generator=DefaultFingerprintGenerator(),
    )

    @crawler.pre_navigation_hook
    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        # Read the user agent from the page's JavaScript environment.
        fingerprints[user_agent_key] = await context.page.evaluate('()=>window.navigator.userAgent')

    assert not profile_dir.exists()

    await crawler.run([str(server_url)])

    assert profile_dir.exists()

    reported_user_agent = fingerprints[user_agent_key]
    assert reported_user_agent
    assert 'headless' not in reported_user_agent.lower()


async def test_get_snapshot(server_url: URL) -> None:
    """Check that `get_snapshot` returns both the page HTML and a JPEG screenshot."""
    crawler = PlaywrightCrawler()
    captured_snapshots = []

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        captured_snapshots.append(await context.get_snapshot())

    await crawler.run([str(server_url)])

    # Stays `None` when the handler was never invoked.
    snapshot = captured_snapshots[-1] if captured_snapshots else None

    assert snapshot is not None
    assert snapshot.html is not None
    assert snapshot.screenshot is not None
    # Check at least jpeg start and end expected bytes. Content is not relevant for the test.
    jpeg_start, jpeg_end = b'\xff\xd8', b'\xff\xd9'
    assert snapshot.screenshot.startswith(jpeg_start)
    assert snapshot.screenshot.endswith(jpeg_end)
    assert snapshot.html == HELLO_WORLD.decode('utf-8')


async def test_error_snapshot_through_statistics(server_url: URL) -> None:
    """Test correct use of error snapshotter by the Playwright crawler.

    In this test the crawler will visit 4 pages.
    - 2 x page endpoints will return the same error
    - homepage endpoint will return unique error
    - headers endpoint will return no error

    The snapshots are then read back from the crawler's key-value store and their names and contents are checked.
    """
    max_retries = 2
    crawler = PlaywrightCrawler(
        statistics=Statistics.with_default_state(save_error_snapshots=True), max_request_retries=max_retries
    )

    # NOTE(review): the two errors are raised by separate statements with different messages; how the error
    # tracker groups errors is not visible here, so confirm before restructuring this handler.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        if 'page' in context.request.url:
            raise RuntimeError('page error')
        if 'headers' in context.request.url:
            return
        raise RuntimeError('home error')

    await crawler.run(
        [str(server_url), str(server_url / 'page_1'), str(server_url / 'page_2'), str(server_url / 'headers')]
    )

    kvs = await crawler.get_key_value_store()
    # Maps snapshot key -> stored value, for error snapshot records only.
    kvs_content = {}

    async for key_info in kvs.iterate_keys():
        # Skip any non-error snapshot keys, e.g. __RQ_STATE_.
        if 'ERROR_SNAPSHOT' not in key_info.key:
            continue
        kvs_content[key_info.key] = await kvs.get_value(key_info.key)

        # The key may consist only of characters allowed by the snapshotter.
        assert set(key_info.key).issubset(ErrorSnapshotter.ALLOWED_CHARACTERS)
        if key_info.key.endswith('.jpg'):
            # Check at least jpeg start and end expected bytes. Content is not relevant for the test.
            assert kvs_content[key_info.key].startswith(b'\xff\xd8')
            assert kvs_content[key_info.key].endswith(b'\xff\xd9')
        elif 'page' in key_info.key:
            # Non-jpg snapshot whose key contains 'page': expected to hold the generic response HTML.
            assert kvs_content[key_info.key] == GENERIC_RESPONSE.decode('utf-8')
        else:
            # Remaining non-jpg snapshot: expected to hold the hello-world HTML.
            assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf-8')

    # Three failing requests, each counted `max_retries + 1` times, but only 2 unique errors
    # -> 4 artifacts (2 x (html and jpg)) expected.
    assert crawler.statistics.error_tracker.total == 3 * (max_retries + 1)
    assert crawler.statistics.error_tracker.unique_error_count == 2
    assert len(list(kvs_content.keys())) == 4


async def test_respect_robots_txt(server_url: URL) -> None:
    """Check that the expected pages are still visited when `respect_robots_txt_file` is enabled."""
    crawler = PlaywrightCrawler(respect_robots_txt_file=True)
    visit = mock.Mock()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links()

    await crawler.run([str(server_url / 'start_enqueue')])

    expected_paths = ('start_enqueue', 'sub_index', 'base_page', 'base_subpath/page_5')
    expected_visit_calls = [mock.call(str(server_url / expected_path)) for expected_path in expected_paths]
    visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None:
    """Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt."""
    visited = mock.Mock()
    failed = mock.Mock()
    start_url = str(server_url / 'problematic_links')
    crawler = PlaywrightCrawler(respect_robots_txt_file=True, max_request_retries=0)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        visited(context.request.url)
        await context.enqueue_links(strategy='all')

    @crawler.failed_request_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        failed(context.request.url)

    await crawler.run([start_url])

    # Email must be skipped
    # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
    visited.assert_has_calls(
        [mock.call(start_url), mock.call('https://avatars.githubusercontent.com/apify')],
        any_order=True,
    )

    # The budplaceholder.com does not exist.
    failed.assert_has_calls([mock.call('https://budplaceholder.com/')], any_order=True)


async def test_on_skipped_request(server_url: URL) -> None:
    """Check that the `on_skipped_request` hook is called with the URL of each skipped request."""
    crawler = PlaywrightCrawler(respect_robots_txt_file=True)
    skipped = mock.Mock()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        await context.enqueue_links()

    @crawler.on_skipped_request
    async def skipped_hook(url: str, _reason: SkippedReason) -> None:
        skipped(url)

    await crawler.run([str(server_url / 'start_enqueue')])

    # `page_1` .. `page_4` are the links expected to be reported as skipped.
    expected_skip_calls = [mock.call(str(server_url / f'page_{page_number}')) for page_number in range(1, 5)]
    skipped.assert_has_calls(expected_skip_calls, any_order=True)


async def test_send_request(server_url: URL) -> None:
    """Check that `send_request` reports the same user agent as the browser in hooks and in the handler."""
    check_data: dict[str, Any] = {}
    user_agent_url = str(server_url / 'user-agent')

    crawler = PlaywrightCrawler()

    @crawler.pre_navigation_hook
    async def pre_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        hook_response = await context.send_request(user_agent_url)
        check_data['pre_send_request'] = dict(json.loads(await hook_response.read()))

    @crawler.post_navigation_hook
    async def post_hook(context: PlaywrightPostNavCrawlingContext) -> None:
        hook_response = await context.send_request(user_agent_url)
        check_data['post_send_request'] = dict(json.loads(await hook_response.read()))

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Data obtained by the browser navigation itself.
        check_data['default'] = dict(json.loads(await context.response.text()))
        # Data obtained through `send_request` from inside the handler.
        handler_response = await context.send_request(user_agent_url)
        check_data['send_request'] = dict(json.loads(await handler_response.read()))

    await crawler.run([user_agent_url])

    for source in ('default', 'send_request'):
        assert check_data[source].get('user-agent') is not None

    # All ways of obtaining the data must agree with `send_request` called in the handler.
    for source in ('pre_send_request', 'post_send_request', 'default'):
        assert check_data[source] == check_data['send_request']


async def test_send_request_with_client(server_url: URL) -> None:
    """Check that `send_request` uses the explicitly passed HTTP client and its headers.

    The crawler gets an `ImpitHttpClient` with a fixed `user-agent` header. The data obtained through
    `send_request` must contain that user agent and must differ from the data obtained by the browser navigation.
    """
    check_data: dict[str, Any] = {}

    crawler = PlaywrightCrawler(http_client=ImpitHttpClient(headers={'user-agent': 'My User-Agent'}))

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Data obtained by the browser navigation itself.
        response = await context.response.text()
        check_data['default'] = dict(json.loads(response))
        # Data obtained through `send_request`.
        send_request_response = await context.send_request(str(server_url / 'user-agent'))
        check_data['send_request'] = dict(json.loads(await send_request_response.read()))

    await crawler.run([str(server_url / 'user-agent')])

    assert check_data['default'].get('user-agent') is not None
    assert check_data['send_request']['user-agent'] == 'My User-Agent'

    assert check_data['default'] != check_data['send_request']


async def test_passing_configuration() -> None:
    """Check that the configuration is allowed to be passed to the Playwrightcrawler."""
    global_configuration = Configuration(log_level='INFO')
    service_locator.set_configuration(global_configuration)

    crawler_configuration = Configuration(log_level='WARNING')
    crawler = PlaywrightCrawler(configuration=crawler_configuration)

    # The global configuration stays untouched; only the crawler's own service locator sees the passed one.
    global_log_level = service_locator.get_configuration().log_level
    crawler_log_level = crawler._service_locator.get_configuration().log_level
    assert global_log_level == 'INFO'
    assert crawler_log_level == 'WARNING'


async def test_extract_links(server_url: URL) -> None:
    """Check that `extract_links` returns the page links and honours the `exclude` patterns."""
    crawler = PlaywrightCrawler()
    extracted_links: list[str] = []

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        excluded = [Glob(f'{server_url}sub_index')]
        for link_request in await context.extract_links(exclude=excluded):
            extracted_links.append(link_request.url)

    await crawler.run([str(server_url / 'start_enqueue')])

    # Exactly one link is expected after the exclusion.
    assert extracted_links == [str(server_url / 'page_1')]


async def test_extract_non_href_links(server_url: URL) -> None:
    """Check that `extract_links` can read URLs from a custom attribute (`data-href` of `li` elements)."""
    crawler = PlaywrightCrawler()
    extracted_links: list[str] = []

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        for link_request in await context.extract_links(selector='li', attribute='data-href'):
            extracted_links.append(link_request.url)

    await crawler.run([str(server_url / 'non_href_links')])

    # Exactly one link is expected.
    assert extracted_links == [str(server_url / 'page_2')]


async def test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.LogCaptureFixture) -> None:
    """Check that a failed Playwright navigation is logged as a single-line summary."""
    caplog.set_level(logging.INFO)
    crawler = PlaywrightCrawler(configure_logging=False)
    non_existent_page = 'https://totally-non-existing-site.com/blablablba'

    # Capture all logs from the 'crawlee' logger at INFO level or higher
    with caplog.at_level(logging.INFO, logger='crawlee'):
        await crawler.run([Request.from_url(non_existent_page)])

    expected_summarized_log = (
        f'Retrying request to {non_existent_page} due to: Page.goto: net::ERR_NAME_NOT_RESOLVED at {non_existent_page}'
    )

    # Find the first record carrying the Playwright specific error message.
    matching_record = next(
        (record for record in caplog.records if record.message and expected_summarized_log in record.message),
        None,
    )
    assert matching_record is not None, 'Expected log message about request handler error was not found.'

    # Neither the message nor the attached exception text may span multiple lines.
    full_message = (matching_record.message or '') + (matching_record.exc_text or '')
    assert '\n' not in full_message


@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'by_id'),
    [
        pytest.param('named-queue', None, False, id='with rq_name'),
        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
        pytest.param('id-queue', None, True, id='with rq_id'),
    ],
)
async def test_enqueue_links_with_rq_param(
    server_url: URL, queue_name: str | None, queue_alias: str | None, *, by_id: bool
) -> None:
    """Check that `enqueue_links` stores discovered links in the queue selected by name, alias or ID.

    Only the start page is expected to be visited by the crawler; the links found on it must be present
    in the separately opened queue.
    """
    crawler = PlaywrightCrawler()
    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)

    # In the ID variant the queue is opened by name, but afterwards it is addressed through its ID only.
    queue_id = rq.id if by_id else None
    if by_id:
        queue_name = None

    visit_urls: set[str] = set()

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        visit_urls.add(context.request.url)
        await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)

    start_url = str(server_url / 'start_enqueue')
    await crawler.run([start_url])

    # Drain the target queue and collect the URLs it holds.
    queued_urls: set[str] = set()
    while True:
        queued_request = await rq.fetch_next_request()
        if not queued_request:
            break
        queued_urls.add(queued_request.url)

    assert queued_urls == {str(server_url / 'page_1'), str(server_url / 'sub_index')}
    assert visit_urls == {start_url}

    await rq.drop()


@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'by_id'),
    [
        pytest.param('named-queue', None, False, id='with rq_name'),
        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
        pytest.param('id-queue', None, True, id='with rq_id'),
    ],
)
async def test_enqueue_links_requests_with_rq_param(
    server_url: URL, queue_name: str | None, queue_alias: str | None, *, by_id: bool
) -> None:
    """Check that explicit `requests` passed to `enqueue_links` land in the queue selected by name, alias or ID.

    Only the start page is expected to be visited by the crawler; the explicitly supplied URLs must be
    present in the separately opened queue.
    """
    crawler = PlaywrightCrawler()
    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)

    # In the ID variant the queue is opened by name, but afterwards it is addressed through its ID only.
    queue_id = rq.id if by_id else None
    if by_id:
        queue_name = None

    visit_urls: set[str] = set()
    check_requests: list[str] = [f'https://{prefix}.placeholder.com' for prefix in ('a', 'b', 'c')]

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        visit_urls.add(context.request.url)
        await context.enqueue_links(
            requests=check_requests,
            rq_id=queue_id,
            rq_name=queue_name,
            rq_alias=queue_alias,
            strategy='all',
        )

    start_url = str(server_url / 'start_enqueue')
    await crawler.run([start_url])

    # Drain the target queue and collect the URLs it holds.
    queued_urls: set[str] = set()
    while True:
        queued_request = await rq.fetch_next_request()
        if not queued_request:
            break
        queued_urls.add(queued_request.url)

    assert queued_urls == set(check_requests)
    assert visit_urls == {start_url}

    await rq.drop()


@pytest.mark.parametrize(
    # The argument names follow the column layout of the values below (name, alias, ID), so that every
    # case passes exactly the parameters its `id` announces.
    ('queue_name', 'queue_alias', 'queue_id'),
    [
        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),
        pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'),
        pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),
        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'),
    ],
)
async def test_enqueue_links_error_with_multi_params(
    server_url: URL, queue_id: str | None, queue_name: str | None, queue_alias: str | None
) -> None:
    """Check that `enqueue_links` rejects calls that specify more than one of `rq_id`, `rq_name`, `rq_alias`.

    Args:
        server_url: Base URL of the test HTTP server.
        queue_id: Value passed as `rq_id`, or `None` to leave it out.
        queue_name: Value passed as `rq_name`, or `None` to leave it out.
        queue_alias: Value passed as `rq_alias`, or `None` to leave it out.
    """
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        # The expectation lives inside the handler because `enqueue_links` is only available on the context.
        with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'):
            await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)

    await crawler.run([str(server_url / 'start_enqueue')])


async def test_navigation_timeout_on_slow_page_load(server_url: URL) -> None:
    """Check that a page responding slower than `navigation_timeout` fails with `asyncio.TimeoutError`."""
    crawler = PlaywrightCrawler(
        max_request_retries=0,
        navigation_timeout=timedelta(seconds=1),
    )

    request_handler = AsyncMock()
    crawler.router.default_handler(request_handler)

    failed_request_handler = AsyncMock()
    crawler.failed_request_handler(failed_request_handler)

    # The server delays its response by 2 seconds, which is longer than the 1 second navigation timeout.
    slow_url = (server_url / 'slow').with_query(delay=2)
    result = await crawler.run([str(slow_url)])

    assert result.requests_failed == 1
    assert result.requests_finished == 0

    # The request never reaches the regular handler.
    assert request_handler.call_count == 0

    # The failure handler receives the error as its second positional argument.
    assert failed_request_handler.call_count == 1
    reported_error = failed_request_handler.call_args.args[1]
    assert isinstance(reported_error, asyncio.TimeoutError)


async def test_navigation_timeout_applies_to_hooks(server_url: URL) -> None:
    """Check that time spent in a pre-navigation hook counts toward `navigation_timeout`."""
    crawler = PlaywrightCrawler(
        max_request_retries=0,
        navigation_timeout=timedelta(seconds=0.5),
    )

    handler_mock = AsyncMock()
    crawler.router.default_handler(handler_mock)

    # The hook sleeps for 1 second, which is longer than the 0.5 second navigation timeout.
    crawler.pre_navigation_hook(lambda _: asyncio.sleep(1))

    start_url = str(server_url)
    result = await crawler.run([start_url])

    # The request fails before it can reach the request handler.
    assert result.requests_failed == 1
    assert result.requests_finished == 0
    assert handler_mock.call_count == 0


async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url: URL) -> None:
    """Check that navigation time is not charged against `request_handler_timeout`."""
    crawler = PlaywrightCrawler(
        max_request_retries=0,
        request_handler_timeout=timedelta(seconds=0.5),
    )

    handler_mock = AsyncMock()
    crawler.router.default_handler(handler_mock)

    # The page takes 1 second to load, which is longer than the 0.5 second handler timeout.
    slow_url = (server_url / 'slow').with_query(delay=1)
    result = await crawler.run([str(slow_url)])

    # The request is nevertheless expected to succeed and to be handled exactly once.
    assert result.requests_failed == 0
    assert result.requests_finished == 1
    assert handler_mock.call_count == 1


async def test_request_state(server_url: URL) -> None:
    """Check the `Request.state` values seen at each stage for a succeeding and a failing request."""
    queue = await RequestQueue.open(alias='playwright_request_state')
    crawler = PlaywrightCrawler(request_manager=queue)

    success_request = Request.from_url(str(server_url))
    assert success_request.state == RequestState.UNPROCESSED

    error_request = Request.from_url(str(server_url / 'error'), user_data={'cause_error': True})

    # Maps request unique key -> stage name -> state observed at that stage.
    observed_states: dict[str, dict[str, RequestState]] = {
        success_request.unique_key: {},
        error_request.unique_key: {},
    }

    def remember(stage: str, request: Request) -> None:
        observed_states[request.unique_key][stage] = request.state

    @crawler.pre_navigation_hook
    async def pre_navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        remember('pre_navigation', context.request)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # The failing request raises before its handler state can be recorded.
        if context.request.user_data.get('cause_error'):
            raise ValueError('Caused error as requested')
        remember('request_handler', context.request)

    @crawler.error_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        remember('error_handler', context.request)

    @crawler.failed_request_handler
    async def failed_request_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        remember('failed_request_handler', context.request)

    await crawler.run([success_request, error_request])

    # Successful request: finished as DONE and was seen by the pre-navigation hook and the handler.
    stored_success = await queue.get_request(success_request.unique_key)

    assert stored_success is not None
    assert stored_success.state == RequestState.DONE

    assert observed_states[success_request.unique_key] == {
        'pre_navigation': RequestState.BEFORE_NAV,
        'request_handler': RequestState.REQUEST_HANDLER,
    }

    # Failing request: finished as ERROR and was seen by the hook and both error-related handlers.
    stored_error = await queue.get_request(error_request.unique_key)
    assert stored_error is not None
    assert stored_error.state == RequestState.ERROR

    assert observed_states[error_request.unique_key] == {
        'pre_navigation': RequestState.BEFORE_NAV,
        'error_handler': RequestState.ERROR_HANDLER,
        'failed_request_handler': RequestState.ERROR,
    }

    await queue.drop()


async def test_enqueue_links_with_limit(server_url: URL) -> None:
    """Check that `enqueue_links(limit=1)` enqueues a single link from the start page."""
    crawler = PlaywrightCrawler()
    visit = mock.Mock()
    start_url = str(server_url / 'sub_index')

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(limit=1)

    await crawler.run([start_url])

    # With the limit in place, `page_3` is the one link expected to be followed from `sub_index`.
    visit.assert_has_calls(
        [
            mock.call(start_url),
            mock.call(str(server_url / 'page_3')),
        ],
        any_order=True,
    )


async def test_playwright_crawler_pre_navigation_hook_execution(server_url: URL) -> None:
    """Verify that a registered pre-navigation hook runs, and that it runs before the page is loaded."""
    url_seen_by_hook = AsyncMock()
    crawler = PlaywrightCrawler(request_handler=AsyncMock())

    @crawler.pre_navigation_hook
    async def pre_nav_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        await url_seen_by_hook(context.page.url)

    start_url = str(server_url)
    await crawler.run([start_url])

    # The hook fires before navigation, so the page is still expected to be on 'about:blank'.
    url_seen_by_hook.assert_called_once_with('about:blank')


async def test_playwright_crawler_post_navigation_hook_execution(server_url: URL) -> None:
    """Verify that a registered post-navigation hook runs, and that it runs after the page is loaded."""
    url_seen_by_hook = AsyncMock()
    crawler = PlaywrightCrawler(request_handler=AsyncMock())

    @crawler.post_navigation_hook
    async def post_nav_hook(context: PlaywrightPostNavCrawlingContext) -> None:
        await url_seen_by_hook(context.page.url)

    start_url = str(server_url)
    await crawler.run([start_url])

    # The hook fires after navigation, so the page is expected to already be on the requested URL.
    url_seen_by_hook.assert_called_once_with(start_url)


async def test_playwright_navigation_hooks_order(server_url: URL) -> None:
    """Test that pre-navigation hooks, post-navigation hooks and the request handler run in the expected order."""
    execution_order = []
    record = execution_order.append

    crawler = PlaywrightCrawler()

    # The request handler is registered first on purpose; it is still expected to run last.
    @crawler.router.default_handler
    async def default_request_handler(_context: PlaywrightCrawlingContext) -> None:
        record('final handler')

    # Two pre-navigation hooks, registered in the order they are expected to run.
    @crawler.pre_navigation_hook
    async def pre_nav_hook_1(_context: PlaywrightPreNavCrawlingContext) -> None:
        record('pre-navigation-hook 1')

    @crawler.pre_navigation_hook
    async def pre_nav_hook_2(_context: PlaywrightPreNavCrawlingContext) -> None:
        record('pre-navigation-hook 2')

    # Two post-navigation hooks, registered in the order they are expected to run.
    @crawler.post_navigation_hook
    async def post_nav_hook_1(_context: PlaywrightPostNavCrawlingContext) -> None:
        record('post-navigation-hook 1')

    @crawler.post_navigation_hook
    async def post_nav_hook_2(_context: PlaywrightPostNavCrawlingContext) -> None:
        record('post-navigation-hook 2')

    await crawler.run([str(server_url)])

    expected_order = [
        'pre-navigation-hook 1',
        'pre-navigation-hook 2',
        'post-navigation-hook 1',
        'post-navigation-hook 2',
        'final handler',
    ]
    assert execution_order == expected_order


================================================
FILE: tests/unit/crawlers/_playwright/test_utils.py
================================================
from playwright.async_api import async_playwright
from yarl import URL

from crawlee.crawlers._playwright._utils import block_requests, infinite_scroll


async def test_infinite_scroll_on_dynamic_page(server_url: URL) -> None:
    """Verify that `infinite_scroll` ends up with the same number of items as scrolling the page manually."""
    target_url = str(server_url / 'infinite_scroll')

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        # Reference run: scroll to the bottom by hand a few times and keep the last item snapshot.
        page = await browser.new_page()
        await page.goto(target_url)

        manual_items = []
        for _ in range(4):
            manual_items = await page.query_selector_all('.item')
            await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
            await page.wait_for_timeout(1000)
        manual_count = len(manual_items)

        # Start over with a fresh page.
        await page.close()
        page = await browser.new_page()
        await page.goto(target_url)

        # Run under test: the freshly loaded page shows 10 items, which is not yet the full list.
        before_scroll = await page.query_selector_all('.item')
        initial_count = len(before_scroll)
        assert initial_count != manual_count
        assert initial_count == 10

        await infinite_scroll(page)

        after_scroll = await page.query_selector_all('.item')
        final_count = len(after_scroll)

        assert initial_count < final_count
        assert manual_count == final_count

        await browser.close()


async def test_infinite_scroll_no_page_without_scroll(server_url: URL) -> None:
    """Verify that `infinite_scroll` does not raise on a page that has no infinite scrolling."""
    start_url = str(server_url)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(start_url)

        await infinite_scroll(page)

        # The page must still be usable afterwards.
        page_title = await page.title()
        assert page_title == 'Hello, world!'

        await browser.close()


async def test_double_call_infinite_scroll(server_url: URL) -> None:
    """Verify that a second `infinite_scroll` call on the same page does not load any further items."""
    target_url = str(server_url / 'infinite_scroll')

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(target_url)

        # Count the items after each of two consecutive scroll runs.
        await infinite_scroll(page)
        items_after_first_run = await page.query_selector_all('.item')
        first_count = len(items_after_first_run)

        await infinite_scroll(page)
        items_after_second_run = await page.query_selector_all('.item')
        second_count = len(items_after_second_run)

        assert first_count == second_count

        await browser.close()


async def test_block_requests_default(server_url: URL) -> None:
    """Verify which resources finish loading with and without the default `block_requests` patterns."""
    target_url = str(server_url / 'resource_loading_page')

    # Last path segment of every finished request, collected separately for each run.
    finished_without_blocking = []
    finished_with_blocking = []

    async with async_playwright() as p:
        browser = await p.chromium.launch()

        # First run: nothing is blocked.
        plain_page = await browser.new_page()
        plain_page.on('requestfinished', lambda req: finished_without_blocking.append(req.url.rsplit('/', 1)[-1]))
        await plain_page.goto(target_url)
        await plain_page.wait_for_load_state('networkidle')
        await plain_page.close()

        # Second run: default blocking is installed before navigating.
        blocking_page = await browser.new_page()
        blocking_page.on('requestfinished', lambda req: finished_with_blocking.append(req.url.rsplit('/', 1)[-1]))
        await block_requests(blocking_page)
        await blocking_page.goto(target_url)
        await blocking_page.wait_for_load_state('networkidle')
        await blocking_page.close()

        await browser.close()

    # Without blocking, the page, the script and the image all load.
    assert set(finished_without_blocking) == {'resource_loading_page', 'test.js', 'test.png'}

    # With default blocking, the image is dropped while the page and the script still load.
    assert set(finished_with_blocking) == {'resource_loading_page', 'test.js'}


async def test_block_requests_with_extra_patterns(server_url: URL) -> None:
    """Verify that `extra_url_patterns` are blocked in addition to the default patterns."""
    target_url = str(server_url / 'resource_loading_page')

    # Last path segment of every finished request.
    finished_requests = []

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        page.on('requestfinished', lambda req: finished_requests.append(req.url.rsplit('/', 1)[-1]))
        await block_requests(page, extra_url_patterns=['*.js'])
        await page.goto(target_url)
        await page.wait_for_load_state('networkidle')
        await page.close()

        await browser.close()

        # Both the script and the image are blocked, so only the page itself loads.
        assert set(finished_requests) == {'resource_loading_page'}


async def test_block_requests_with_custom_patterns(server_url: URL) -> None:
    """Verify that `url_patterns` is used for blocking in place of the default patterns."""
    target_url = str(server_url / 'resource_loading_page')

    # Last path segment of every finished request.
    finished_requests = []

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        page.on('requestfinished', lambda req: finished_requests.append(req.url.rsplit('/', 1)[-1]))
        await block_requests(page, url_patterns=['*.js'])
        await page.goto(target_url)
        await page.wait_for_load_state('networkidle')
        await page.close()

        await browser.close()

        # Only the script is blocked, so the page itself and the image load.
        assert set(finished_requests) == {'resource_loading_page', 'test.png'}


================================================
FILE: tests/unit/events/test_event_manager.py
================================================
from __future__ import annotations

import asyncio
import logging
from datetime import timedelta
from functools import update_wrapper
from typing import TYPE_CHECKING, Any
from unittest import mock
from unittest.mock import AsyncMock, MagicMock

import pytest

from crawlee.events import Event, EventManager, EventSystemInfoData

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator


@pytest.fixture
async def event_manager() -> AsyncGenerator[EventManager, None]:
    """Yield an `EventManager` whose context is entered for the duration of the test."""
    manager = EventManager()
    async with manager as active_manager:
        yield active_manager


@pytest.fixture
def event_system_info_data() -> EventSystemInfoData:
    """Provide a mock payload restricted to the `EventSystemInfoData` interface."""
    payload = MagicMock(spec=EventSystemInfoData)
    return payload


@pytest.fixture
def async_listener() -> AsyncMock:
    """Provide an `AsyncMock` that carries the metadata of a one-argument async listener."""

    # Template function whose metadata is copied onto the mock.
    async def async_listener(payload: Any) -> None:
        pass

    # `update_wrapper` returns the wrapper it was given, i.e. the mock itself.
    return update_wrapper(AsyncMock(), async_listener)


@pytest.fixture
def sync_listener() -> MagicMock:
    """Provide a `MagicMock` that carries the metadata of a one-argument sync listener."""

    # Template function whose metadata is copied onto the mock.
    def sync_listener(payload: Any) -> None:
        pass

    # `update_wrapper` returns the wrapper it was given, i.e. the mock itself.
    return update_wrapper(MagicMock(), sync_listener)


async def test_emit_invokes_registered_sync_listener(
    sync_listener: MagicMock,
    event_manager: EventManager,
    event_system_info_data: EventSystemInfoData,
) -> None:
    """Check that a registered sync listener is called once with the emitted payload."""
    event_manager.on(event=Event.SYSTEM_INFO, listener=sync_listener)
    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    # Give the event some time to be processed.
    await asyncio.sleep(0.1)

    assert sync_listener.call_count == 1
    received_args = sync_listener.call_args.args
    assert received_args == (event_system_info_data,)


async def test_emit_invokes_both_sync_and_async_listeners(
    sync_listener: MagicMock,
    async_listener: AsyncMock,
    event_manager: EventManager,
    event_system_info_data: EventSystemInfoData,
) -> None:
    """Check that a sync and an async listener registered for one event are each called once with the payload."""
    event_manager.on(event=Event.SYSTEM_INFO, listener=sync_listener)
    event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)
    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    # Give the event some time to be processed.
    await asyncio.sleep(0.1)

    expected_args = (event_system_info_data,)

    assert async_listener.call_count == 1
    assert async_listener.call_args.args == expected_args

    assert sync_listener.call_count == 1
    assert sync_listener.call_args.args == expected_args


async def test_emit_event_with_no_listeners(
    event_manager: EventManager,
    event_system_info_data: EventSystemInfoData,
    async_listener: AsyncMock,
) -> None:
    """Check that emitting an event without listeners is harmless and does not reach other events' listeners."""
    # The only registered listener belongs to a different event.
    event_manager.on(event=Event.ABORTING, listener=async_listener)

    # Emitting an event nobody listens to must not fail.
    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    # Give the event some time to be processed.
    await asyncio.sleep(0.1)

    # The listener of the unrelated event stays untouched.
    assert async_listener.call_count == 0


async def test_emit_invokes_parameterless_listener(
    event_manager: EventManager,
    event_system_info_data: EventSystemInfoData,
) -> None:
    """Check that sync and async listeners declared without parameters are still invoked on emit."""
    sync_calls = MagicMock()
    async_calls = MagicMock()

    def sync_listener() -> None:
        sync_calls()

    async def async_listener() -> None:
        async_calls()

    event_manager.on(event=Event.SYSTEM_INFO, listener=sync_listener)
    event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)

    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    # Give the event some time to be processed.
    await asyncio.sleep(0.1)

    assert sync_calls.call_count == 1
    assert async_calls.call_count == 1


async def test_remove_nonexistent_listener_does_not_fail(
    async_listener: AsyncMock,
    event_manager: EventManager,
) -> None:
    """Check that `off` tolerates listeners and events that were never registered."""
    # Case 1: a specific listener that was never added.
    event_manager.off(event=Event.SYSTEM_INFO, listener=async_listener)

    # Case 2: all listeners of an event, without naming a listener.
    event_manager.off(event=Event.ABORTING)


async def test_removed_listener_not_invoked_on_emit(
    async_listener: AsyncMock,
    event_manager: EventManager,
    event_system_info_data: EventSystemInfoData,
) -> None:
    """Check that a listener removed with `off` is not called by a later emit."""
    # Register the listener and remove it again right away.
    event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)
    event_manager.off(event=Event.SYSTEM_INFO, listener=async_listener)

    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    # Give the event some time to be processed.
    await asyncio.sleep(0.1)

    assert async_listener.call_count == 0


async def test_close_clears_listeners_and_tasks(async_listener: AsyncMock) -> None:
    """Check that leaving the `EventManager` context empties its listener and task bookkeeping."""
    async with EventManager() as event_manager:
        event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)

    # Nothing was emitted, so the listener was never called.
    assert async_listener.call_count == 0

    # Both internal collections must be empty after the context is left.
    remaining_tasks = len(event_manager._listener_tasks)
    remaining_wrappers = len(event_manager._listeners_to_wrappers)
    assert remaining_tasks == 0
    assert remaining_wrappers == 0


async def test_close_after_emit_processes_event(
    async_listener: AsyncMock,
    event_system_info_data: EventSystemInfoData,
) -> None:
    """Check that an event emitted just before the context is left is still delivered to its listener."""
    async with EventManager() as event_manager:
        event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)
        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    # The event must have been processed by the time the context has been left.
    assert async_listener.call_count == 1
    received_args = async_listener.call_args.args
    assert received_args == (event_system_info_data,)

    # Both internal collections must be empty after the context is left.
    remaining_tasks = len(event_manager._listener_tasks)
    remaining_wrappers = len(event_manager._listeners_to_wrappers)
    assert remaining_tasks == 0
    assert remaining_wrappers == 0


async def test_wait_for_all_listeners_cancelled_error(
    monkeypatch: pytest.MonkeyPatch,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Check that an `asyncio.CancelledError` raised by `asyncio.wait` propagates out of the `EventManager` context.

    `asyncio.wait` is replaced inside the `async with` body, so the replacement is in effect when the
    context is left. Log records are captured at WARNING level, but the test makes no assertion on them.
    """

    # Simulate long-running listener tasks
    async def long_running_listener() -> None:
        await asyncio.sleep(10)

    # Replacement for `asyncio.wait` that raises CancelledError regardless of its arguments
    async def mock_async_wait(*_: Any, **__: Any) -> None:
        raise asyncio.CancelledError

    with pytest.raises(asyncio.CancelledError), caplog.at_level(logging.WARNING):  # noqa: PT012
        async with EventManager(close_timeout=timedelta(milliseconds=10)) as event_manager:
            event_manager.on(event=Event.SYSTEM_INFO, listener=long_running_listener)

            # Use monkeypatch to replace asyncio.wait with mock_async_wait
            monkeypatch.setattr('asyncio.wait', mock_async_wait)


async def test_methods_raise_error_when_not_active(event_system_info_data: EventSystemInfoData) -> None:
    """Check the `EventManager` guards: use outside its context fails, and so does entering it twice."""
    not_active_pattern = r'EventManager is not active.'
    already_active_pattern = r'EventManager is already active.'

    event_manager = EventManager()

    assert event_manager.active is False

    # Outside of the context, both emitting and waiting are rejected.
    with pytest.raises(RuntimeError, match=not_active_pattern):
        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    with pytest.raises(RuntimeError, match=not_active_pattern):
        await event_manager.wait_for_all_listeners_to_complete()

    # Entering the same instance a second time while it is active is rejected.
    with pytest.raises(RuntimeError, match=already_active_pattern):
        async with event_manager, event_manager:
            pass

    # Inside the context, the very same calls work.
    async with event_manager:
        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)
        await event_manager.wait_for_all_listeners_to_complete()

        assert event_manager.active is True


async def test_event_manager_in_context_persistence() -> None:
    """Test that going through the `EventManager` context emits the persist state event at least once."""
    event_manager = EventManager()
    persist_state_mock = AsyncMock()

    # Swap the internal emitter for a mock while the context is entered and left.
    with mock.patch.object(event_manager, '_emit_persist_state_event', persist_state_mock):
        async with event_manager:
            pass

    assert persist_state_mock.call_count >= 1


================================================
FILE: tests/unit/events/test_local_event_manager.py
================================================
from __future__ import annotations

import asyncio
from datetime import timedelta
from typing import Any
from unittest.mock import AsyncMock

from crawlee.events import LocalEventManager
from crawlee.events._types import Event, EventSystemInfoData


async def test_emit_system_info_event() -> None:
    """Check that `LocalEventManager` delivers `SYSTEM_INFO` events carrying `EventSystemInfoData`."""
    call_recorder = AsyncMock()

    async def async_listener(payload: Any) -> None:
        await call_recorder(payload)

    system_info_interval = timedelta(milliseconds=50)

    # Wait for several intervals so that the test tolerates timing jitter.
    test_tolerance_coefficient = 10
    wait_seconds = system_info_interval.total_seconds() * test_tolerance_coefficient

    async with LocalEventManager(system_info_interval=system_info_interval) as event_manager:
        event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)
        await asyncio.sleep(wait_seconds)

    assert call_recorder.call_count >= 1
    last_payload = call_recorder.call_args.args[0]
    assert isinstance(last_payload, EventSystemInfoData)


================================================
FILE: tests/unit/fingerprint_suite/test_adapters.py
================================================
from collections.abc import Iterable

import pytest
from browserforge.headers import Browser

from crawlee.fingerprint_suite import (
    DefaultFingerprintGenerator,
    HeaderGeneratorOptions,
    ScreenOptions,
)
from crawlee.fingerprint_suite._browserforge_adapter import PatchedHeaderGenerator
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD


def test_fingerprint_generator_has_default() -> None:
    """Test that the fingerprint generator produces a fingerprint without being given any options."""
    generator = DefaultFingerprintGenerator()
    fingerprint = generator.generate()
    assert fingerprint


def test_fingerprint_generator_some_options_stress_test() -> None:
    """Test that repeated generation with the same options keeps honoring those options."""
    min_width = 500
    attempts = 20

    fingerprint_generator = DefaultFingerprintGenerator(
        mock_web_rtc=True,
        screen_options=ScreenOptions(min_width=min_width),
        header_options=HeaderGeneratorOptions(strict=True),
    )

    for _ in range(attempts):
        generated = fingerprint_generator.generate()

        assert generated.mockWebRTC is True
        assert generated.screen.availWidth > min_width


def test_fingerprint_generator_all_options() -> None:
    """Test fingerprint generation with every option set, followed by basic checks of the result.

    A generation option might have no effect if there is no matching fingerprint sample in the collected data.
    """
    min_width, max_width = 600, 1800
    min_height, max_height = 400, 1200

    screen_options = ScreenOptions(
        min_width=min_width,
        max_width=max_width,
        min_height=min_height,
        max_height=max_height,
    )
    header_options = HeaderGeneratorOptions(
        strict=True,
        browsers=['firefox'],
        operating_systems=['windows'],
        devices=['mobile'],
        locales=['en'],  #  Does not generate any other values than `en-US` regardless of the input in browserforge
        http_version='2',  # Http1 does not work in browserforge
    )

    generator = DefaultFingerprintGenerator(
        mock_web_rtc=True,
        slim=True,
        screen_options=screen_options,
        header_options=header_options,
    )
    fingerprint = generator.generate()

    # Screen dimensions stay within the requested bounds.
    assert min_width <= fingerprint.screen.availWidth <= max_width
    assert min_height <= fingerprint.screen.availHeight <= max_height

    # Generator-level flags are reflected on the fingerprint.
    assert fingerprint.mockWebRTC is True
    assert fingerprint.slim is True

    # Header options show up in the navigator data.
    navigator = fingerprint.navigator
    assert 'Firefox' in navigator.userAgent
    assert 'Win' in navigator.oscpu
    assert 'en-US' in navigator.languages


@pytest.mark.parametrize(
    'browser',
    [
        'firefox',
        ['firefox'],
        [Browser(name='firefox')],
    ],
)
def test_patched_header_generator_generate(browser: Iterable[str | Browser]) -> None:
    """Test that `PatchedHeaderGenerator` accepts each supported way of specifying the browser."""
    generated_headers = PatchedHeaderGenerator().generate(browser=browser)
    user_agent = generated_headers['User-Agent']

    # At least one of the Firefox keywords has to appear in the generated User-Agent.
    firefox_keywords = BROWSER_TYPE_HEADER_KEYWORD['firefox']
    assert any(keyword in user_agent for keyword in firefox_keywords)


================================================
FILE: tests/unit/fingerprint_suite/test_header_generator.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from crawlee.fingerprint_suite import HeaderGenerator
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
from crawlee.fingerprint_suite._consts import (
    BROWSER_TYPE_HEADER_KEYWORD,
)

if TYPE_CHECKING:
    from crawlee.fingerprint_suite._types import SupportedBrowserType


def test_get_common_headers(header_network: dict) -> None:
    """Test that common headers include a known `Accept` value and an `Accept-Language` entry."""
    common_headers = HeaderGenerator().get_common_headers()

    assert 'Accept' in common_headers
    known_accept_values = get_available_header_values(header_network, {'Accept', 'accept'})
    assert common_headers['Accept'] in known_accept_values
    assert 'Accept-Language' in common_headers


def test_get_random_user_agent_header() -> None:
    """Test that the random User-Agent header is present and not empty."""
    generated = HeaderGenerator().get_random_user_agent_header()

    assert 'User-Agent' in generated
    user_agent = generated['User-Agent']
    assert user_agent


@pytest.mark.parametrize('browser_type', ['chrome', 'firefox', 'edge', 'safari'])
def test_get_user_agent_header_stress_test(browser_type: SupportedBrowserType, header_network: dict) -> None:
    """Test that the User-Agent header is generated correctly on every one of many attempts.

    (Very fast even when stress tested.)
    """
    attempts = 100
    expected_keywords = BROWSER_TYPE_HEADER_KEYWORD[browser_type]

    for _ in range(attempts):
        # A fresh generator is used for every attempt.
        generated = HeaderGenerator().get_user_agent_header(browser_type=browser_type)

        assert 'User-Agent' in generated
        user_agent = generated['User-Agent']
        assert any(keyword in user_agent for keyword in expected_keywords)
        assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'})


def test_get_user_agent_header_invalid_browser_type() -> None:
    """Test that requesting a User-Agent for an unknown browser type raises `ValueError`."""
    generator = HeaderGenerator()
    expected_message = r'Unsupported browser type'

    with pytest.raises(ValueError, match=expected_message):
        generator.get_user_agent_header(browser_type='invalid_browser')  # ty: ignore[invalid-argument-type]


def test_get_sec_ch_ua_headers_chromium(header_network: dict) -> None:
    """Test that every Sec-Ch-Ua header generated for Chrome carries a known value."""
    generated = HeaderGenerator().get_sec_ch_ua_headers(browser_type='chrome')

    # Each header is checked against the values available for that same header name.
    for header_name in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'):
        assert generated.get(header_name) in get_available_header_values(header_network, header_name)


def test_get_sec_ch_ua_headers_firefox() -> None:
    """No sec-ch-ua headers are produced when the browser type is Firefox."""
    firefox_headers = HeaderGenerator().get_sec_ch_ua_headers(browser_type='firefox')

    assert not firefox_headers


def test_get_sec_ch_ua_headers_invalid_browser_type() -> None:
    """Asking for sec-ch-ua headers of an unknown browser type must raise a ValueError."""
    generator = HeaderGenerator()

    with pytest.raises(ValueError, match=r'Unsupported browser type'):
        generator.get_sec_ch_ua_headers(browser_type='invalid_browser')  # ty: ignore[invalid-argument-type]


================================================
FILE: tests/unit/http_clients/test_http_clients.py
================================================
from __future__ import annotations

import os
from typing import TYPE_CHECKING

import pytest
from curl_cffi import CurlHttpVersion

from crawlee import Request
from crawlee.errors import ProxyError
from crawlee.http_clients import CurlImpersonateHttpClient, HttpClient, HttpxHttpClient, ImpitHttpClient
from crawlee.statistics import Statistics
from tests.unit.server_endpoints import HELLO_WORLD

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from _pytest.fixtures import SubRequest
    from yarl import URL

    from crawlee.proxy_configuration import ProxyInfo


@pytest.fixture
async def custom_http_client(request: SubRequest) -> AsyncGenerator[HttpClient]:
    """Provide the parametrized HTTP client inside its active context.

    The fixture parameter is either a ready `HttpClient` instance, which is used as is, or a callable that is
    invoked without arguments to build one. The context is left once the test has finished.
    """
    param = request.param
    client = param if isinstance(param, HttpClient) else param()

    async with client as active_client:
        yield active_client


async def test_http_1(http_client: HttpClient, server_url: URL) -> None:
    """A request sent to the test server reports HTTP/1.1 as the protocol version of the response."""
    target = str(server_url)
    response = await http_client.send_request(target)

    assert response.http_version == 'HTTP/1.1'


@pytest.mark.parametrize(
    'custom_http_client',
    [
        pytest.param(CurlImpersonateHttpClient(http_version=CurlHttpVersion.V2_0), id='curl'),
        pytest.param(HttpxHttpClient(http1=False, http2=True), id='httpx'),
        pytest.param(ImpitHttpClient(), id='impit'),
    ],
    indirect=['custom_http_client'],
)
async def test_http_2(custom_http_client: HttpClient) -> None:
    """Each parametrized client reports HTTP/2 as the protocol version for a request to apify.com."""
    http2_response = await custom_http_client.send_request('https://apify.com/')

    assert http2_response.http_version == 'HTTP/2'


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_crawl_with_proxy(
    http_client: HttpClient,
    proxy: ProxyInfo,
    server_url: URL,
) -> None:
    """Crawling the `status/222` endpoint through the `proxy` fixture yields a response with status 222."""
    request = Request.from_url(str(server_url / 'status/222'))

    async with Statistics.with_default_state() as statistics:
        crawl_result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics)

    # 222 - authentication successful
    assert crawl_result.http_response.status_code == 222


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_crawl_with_proxy_disabled(
    http_client: HttpClient,
    disabled_proxy: ProxyInfo,
) -> None:
    """Crawling through the `disabled_proxy` fixture must fail with a `ProxyError`."""
    request = Request.from_url('https://apify.com/')

    with pytest.raises(ProxyError):
        async with Statistics.with_default_state() as statistics:
            await http_client.crawl(request, proxy_info=disabled_proxy, statistics=statistics)


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_send_request_with_proxy(
    http_client: HttpClient,
    proxy: ProxyInfo,
    server_url: URL,
) -> None:
    """Sending a request to the `status/222` endpoint through the `proxy` fixture yields status 222."""
    status_endpoint = str(server_url / 'status/222')

    proxied_response = await http_client.send_request(status_endpoint, proxy_info=proxy)

    # 222 - authentication successful
    assert proxied_response.status_code == 222


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_send_request_with_proxy_disabled(
    http_client: HttpClient,
    disabled_proxy: ProxyInfo,
) -> None:
    """Sending a request through the `disabled_proxy` fixture must fail with a `ProxyError`."""
    with pytest.raises(ProxyError):
        await http_client.send_request('https://apify.com/', proxy_info=disabled_proxy)


async def test_crawl_allow_redirects_by_default(http_client: HttpClient, server_url: URL) -> None:
    """By default `crawl` follows the redirect: the status is 200 and `loaded_url` is the redirect target."""
    final_url = str(server_url / 'status/200')
    redirect_endpoint = (server_url / 'redirect').update_query(url=final_url)
    request = Request.from_url(str(redirect_endpoint))

    result = await http_client.crawl(request)

    assert result.http_response.status_code == 200
    assert request.loaded_url == final_url


@pytest.mark.parametrize(
    'custom_http_client',
    [
        pytest.param(CurlImpersonateHttpClient(allow_redirects=False), id='curl'),
        pytest.param(HttpxHttpClient(follow_redirects=False), id='httpx'),
        pytest.param(ImpitHttpClient(follow_redirects=False), id='impit'),
    ],
    indirect=['custom_http_client'],
)
async def test_crawl_allow_redirects_false(custom_http_client: HttpClient, server_url: URL) -> None:
    """With redirects disabled, `crawl` returns the 302 response and `loaded_url` stays on the redirecting URL."""
    final_url = str(server_url / 'status/200')
    redirect_endpoint = (server_url / 'redirect').update_query(url=final_url)
    redirecting_url = str(redirect_endpoint)
    request = Request.from_url(redirecting_url)

    result = await custom_http_client.crawl(request)
    redirect_response = result.http_response

    assert redirect_response.status_code == 302
    assert redirect_response.headers['Location'] == final_url
    assert request.loaded_url == redirecting_url


async def test_send_request_allow_redirects_by_default(http_client: HttpClient, server_url: URL) -> None:
    """By default `send_request` follows the redirect and ends up with the 200 response of the target."""
    final_url = str(server_url / 'status/200')
    redirect_endpoint = (server_url / 'redirect').update_query(url=final_url)

    followed_response = await http_client.send_request(str(redirect_endpoint))

    assert followed_response.status_code == 200


@pytest.mark.parametrize(
    'custom_http_client',
    [
        pytest.param(CurlImpersonateHttpClient(allow_redirects=False), id='curl'),
        pytest.param(HttpxHttpClient(follow_redirects=False), id='httpx'),
        pytest.param(ImpitHttpClient(follow_redirects=False), id='impit'),
    ],
    indirect=['custom_http_client'],
)
async def test_send_request_allow_redirects_false(custom_http_client: HttpClient, server_url: URL) -> None:
    """With redirects disabled, `send_request` returns the 302 response pointing at the target URL."""
    final_url = str(server_url / 'status/200')
    redirect_endpoint = (server_url / 'redirect').update_query(url=final_url)

    redirect_response = await custom_http_client.send_request(str(redirect_endpoint))

    assert redirect_response.status_code == 302
    assert redirect_response.headers['Location'] == final_url


async def test_stream(http_client: HttpClient, server_url: URL) -> None:
    """Chunks read from a streamed response join up to the expected body."""
    async with http_client.stream(str(server_url)) as response:
        assert response.status_code == 200
        received_chunks = [chunk async for chunk in response.read_stream()]

    assert b''.join(received_chunks) == HELLO_WORLD


async def test_stream_error_double_read_stream(http_client: HttpClient, server_url: URL) -> None:
    """Reading the stream a second time raises a `RuntimeError`; the first read still delivers the body."""
    async with http_client.stream(str(server_url)) as response:
        assert response.status_code == 200
        first_pass = [chunk async for chunk in response.read_stream()]

        # The stream was already consumed above, so another pass over it has to fail.
        with pytest.raises(RuntimeError):
            [chunk async for chunk in response.read_stream()]

    assert b''.join(first_pass) == HELLO_WORLD


async def test_stream_error_for_read(http_client: HttpClient, server_url: URL) -> None:
    """Calling `read()` on a response obtained via `stream()` raises a `RuntimeError`."""
    target = str(server_url)

    async with http_client.stream(target) as streamed_response:
        assert streamed_response.status_code == 200

        with pytest.raises(RuntimeError):
            await streamed_response.read()


async def test_send_request_error_for_read_stream(http_client: HttpClient, server_url: URL) -> None:
    """Calling `read_stream()` on a response obtained via `send_request()` raises a `RuntimeError`."""
    plain_response = await http_client.send_request(str(server_url))
    assert plain_response.status_code == 200

    with pytest.raises(RuntimeError):
        [chunk async for chunk in plain_response.read_stream()]


async def test_send_crawl_error_for_read_stream(http_client: HttpClient, server_url: URL) -> None:
    """Calling `read_stream()` on the HTTP response of a `crawl()` result raises a `RuntimeError`."""
    request = Request.from_url(str(server_url))
    crawl_result = await http_client.crawl(request)
    crawled_response = crawl_result.http_response
    assert crawled_response.status_code == 200

    with pytest.raises(RuntimeError):
        [chunk async for chunk in crawled_response.read_stream()]


@pytest.mark.parametrize(
    'custom_http_client',
    [
        pytest.param(CurlImpersonateHttpClient(), id='curl'),
        pytest.param(HttpxHttpClient(), id='httpx'),
        pytest.param(ImpitHttpClient(), id='impit'),
    ],
)
async def test_reuse_context_manager(custom_http_client: HttpClient, server_url: URL) -> None:
    """The same client instance can be entered as a context manager twice in a row.

    The parametrization is not `indirect`, and the test enters and leaves the client context by itself.
    """
    target = str(server_url)

    # Second round: reusing the context manager should not raise an error.
    for _ in range(2):
        async with custom_http_client:
            response = await custom_http_client.send_request(target)
            assert response.status_code == 200


async def test_work_after_cleanup(http_client: HttpClient, server_url: URL) -> None:
    """A client keeps serving requests after `cleanup()` has been called on it."""
    target = str(server_url)

    before_cleanup = await http_client.send_request(target)
    assert before_cleanup.status_code == 200

    await http_client.cleanup()

    # The cleaned-up client is expected to work again.
    after_cleanup = await http_client.send_request(target)
    assert after_cleanup.status_code == 200


async def test_compressed_chunked_stream(http_client: HttpClient, server_url: URL) -> None:
    """Chunks streamed from the `get_compressed` endpoint join up to `HELLO_WORLD` repeated 1000 times."""
    compressed_endpoint = str(server_url / 'get_compressed')

    async with http_client.stream(compressed_endpoint) as response:
        assert response.status_code == 200
        received_chunks = [chunk async for chunk in response.read_stream()]

    assert b''.join(received_chunks) == HELLO_WORLD * 1000


================================================
FILE: tests/unit/http_clients/test_httpx.py
================================================
from __future__ import annotations

import json
from typing import TYPE_CHECKING

import pytest

from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
from crawlee.fingerprint_suite._consts import COMMON_ACCEPT_LANGUAGE
from crawlee.http_clients import HttpxHttpClient

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from yarl import URL

    from crawlee.http_clients import HttpClient


@pytest.fixture
async def http_client() -> AsyncGenerator[HttpClient]:
    """Yield an `HttpxHttpClient` created with `http2=False`, kept in its active context during the test."""
    httpx_client = HttpxHttpClient(http2=False)

    async with httpx_client as active_client:
        yield active_client


async def test_common_headers_and_user_agent(server_url: URL, header_network: dict) -> None:
    """Test that the relevant headers use header values from header generator instead of default Httpx headers.

    Httpx uses own headers by default which is not desired as it could increase blocking chances.
    """
    # Run the client as a context manager so it is cleaned up when the test ends, even if an assertion fails.
    async with HttpxHttpClient() as client:
        response = await client.send_request(str(server_url / 'headers'))
        # The body of the `headers` endpoint is parsed as a JSON mapping of header names to values.
        response_headers = json.loads((await response.read()).decode())

    assert 'accept' in response_headers
    assert response_headers['accept'] in get_available_header_values(header_network, {'Accept', 'accept'})

    assert 'accept-language' in response_headers
    assert response_headers['accept-language'] == COMMON_ACCEPT_LANGUAGE

    # By default, HTTPX uses its own User-Agent, which should be replaced by the one from the header generator.
    assert 'user-agent' in response_headers
    assert 'python-httpx' not in response_headers['user-agent']
    assert response_headers['user-agent'] in get_available_header_values(header_network, {'User-Agent', 'user-agent'})


================================================
FILE: tests/unit/otel/test_crawler_instrumentor.py
================================================
import io
import json
import re
from unittest import mock

from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor
from opentelemetry.trace import set_tracer_provider
from yarl import URL

from crawlee import ConcurrencySettings
from crawlee.crawlers import ParselCrawler
from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor
from crawlee.storages import Dataset


async def test_crawler_instrumentor_capability(server_url: URL) -> None:
    """Test OpenTelemetry instrumentation capability of the crawler.

    Instrument the crawler and one additional class (`Dataset`) and check that telemetry data is generated correctly.
    Telemetry data is redirected to an in-memory text buffer for testing purposes.
    """

    resource = Resource.create(
        {
            'service.name': 'ExampleCrawler',
            'service.version': '1.0.0',
            'environment': 'development',
        }
    )
    # Set up the OpenTelemetry tracer provider and an exporter that writes spans as JSON into an in-memory buffer.
    provider = TracerProvider(resource=resource)
    in_memory_sink_for_telemetry = io.StringIO(newline='\n')
    exporter = ConsoleSpanExporter(out=in_memory_sink_for_telemetry)
    provider.add_span_processor(SimpleSpanProcessor(exporter))
    set_tracer_provider(provider)
    # Instrument the crawler with OpenTelemetry
    instrumentor = CrawlerInstrumentor(instrument_classes=[Dataset])
    instrumentor.instrument()

    # Generate first telemetry data from `Dataset` public methods.
    # `Dataset` is in the `instrument_classes` argument, and thus its public methods are instrumented.
    dataset = await Dataset.open(name='test-dataset')
    await dataset.drop()

    # Other traces will be from crawler run.
    crawler = ParselCrawler(
        max_requests_per_crawl=1,
        request_handler=mock.AsyncMock(),
        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
    )

    # Run crawler and generate more telemetry data.
    await crawler.run([str(server_url)])

    # Telemetry jsons are packed together in one string. Split the string at every boundary between a closing `}`
    # and the following opening `{` (separated only by whitespace) and load each part as a json object.
    telemetry_strings = in_memory_sink_for_telemetry.getvalue()
    telemetry_data = [
        json.loads(telemetry_string) for telemetry_string in re.split(r'(?<=\})\s*(?=\{)', telemetry_strings)
    ]

    # Do some basic checks on the telemetry data.
    # The point of this test is not to check completeness of the data, but telemetry capability.

    # Extra `instrument_classes` telemetry - the `Dataset.__init__` span is exported before the `Dataset.open` span
    # (presumably because `__init__` runs as a child span inside `open` - the parent link itself is not asserted).
    assert telemetry_data[0]['name'] == '__init__'
    assert telemetry_data[0]['attributes']['code.function.name'] == 'Dataset.__init__'
    assert telemetry_data[0]['resource']['attributes'] == dict(resource.attributes)

    assert telemetry_data[1]['name'] == 'open'
    assert telemetry_data[1]['attributes']['code.function.name'] == 'Dataset.open'
    assert telemetry_data[1]['resource']['attributes'] == dict(resource.attributes)

    # The `Dataset.__init__` and `Dataset.open` spans belong to the same trace.
    assert telemetry_data[0]['context']['trace_id'] == telemetry_data[1]['context']['trace_id']

    assert telemetry_data[2]['name'] == 'drop'
    assert telemetry_data[2]['attributes']['code.function.name'] == 'Dataset.drop'
    assert telemetry_data[2]['resource']['attributes'] == dict(resource.attributes)

    # Dropping the Dataset is recorded in a different trace than opening it.
    assert telemetry_data[2]['context']['trace_id'] != telemetry_data[1]['context']['trace_id']

    # Crawler telemetry - all crawler spans will be in one trace as there is only one request in this test.
    assert telemetry_data[3]['name'] == '_execute_pre_navigation_hooks, action'
    assert telemetry_data[3]['attributes']['code.function.name'] == 'AbstractHttpCrawler._execute_pre_navigation_hooks'
    assert telemetry_data[3]['attributes']['url.full'] == str(server_url)
    assert telemetry_data[3]['resource']['attributes'] == dict(resource.attributes)

    assert telemetry_data[-1]['name'] == '__run_task_function'
    assert telemetry_data[-1]['attributes']['code.function.name'] == 'BasicCrawler.__run_task_function'
    assert telemetry_data[-1]['resource']['attributes'] == dict(resource.attributes)

    # Processing of the request is in the same trace.
    assert telemetry_data[3]['context']['trace_id'] == telemetry_data[-1]['context']['trace_id']

    # Check that trace_ids of unrelated traces are not the same.
    assert telemetry_data[0]['context']['trace_id'] != telemetry_data[-1]['context']['trace_id']


================================================
FILE: tests/unit/proxy_configuration/test_new_proxy_info.py
================================================
from __future__ import annotations

from itertools import cycle

import pytest

from crawlee import Request
from crawlee.proxy_configuration import ProxyConfiguration


async def test_returns_proxy_info() -> None:
    """Test that `proxy_urls` can contain both strings and `None`."""
    proxy_config = ProxyConfiguration(proxy_urls=[None, 'http://proxy.com:1111'])

    # The first entry is `None`, so no proxy info is produced for the first call.
    first = await proxy_config.new_proxy_info(None, None, None)
    assert first is None

    # The second call hands out the proxy built from the URL string.
    second = await proxy_config.new_proxy_info(None, None, None)
    assert second is not None
    assert second.url == 'http://proxy.com:1111'
    assert second.hostname == 'proxy.com'
    assert second.username == ''
    assert second.password == ''
    assert second.port == 1111


async def test_throws_on_invalid_new_url_function() -> None:
    """A `new_url_function` that returns a malformed URL makes `new_proxy_info` raise a ValueError."""
    proxy_config = ProxyConfiguration(
        new_url_function=lambda session_id=None, request=None: 'http://proxy.com:1111*invalid_url'  # noqa: ARG005
    )

    with pytest.raises(ValueError):  # noqa: PT011
        await proxy_config.new_proxy_info(None, None, None)


async def test_returns_proxy_info_with_new_url_function() -> None:
    """Test that `new_url_function` may return either `None` or a URL string."""
    url_source = cycle([None, 'http://proxy.com:1111'])

    proxy_config = ProxyConfiguration(new_url_function=lambda session_id=None, request=None: next(url_source))  # noqa: ARG005

    # The function returns `None` first, so no proxy info is produced.
    first = await proxy_config.new_proxy_info(None, None, None)
    assert first is None

    # On the next call the function returns the URL string.
    second = await proxy_config.new_proxy_info(None, None, None)
    assert second is not None
    assert second.url == 'http://proxy.com:1111'
    assert second.hostname == 'proxy.com'
    assert second.username == ''
    assert second.password == ''
    assert second.port == 1111


async def test_returns_proxy_info_with_new_url_function_async() -> None:
    """A coroutine function is accepted as `new_url_function` and its URL is turned into proxy info."""

    async def provide_url(session_id: str | None = None, request: Request | None = None) -> str:  # noqa: ARG001
        return 'http://proxy.com:1111'

    proxy_config = ProxyConfiguration(new_url_function=provide_url)
    resolved = await proxy_config.new_proxy_info(None, None, None)

    assert resolved is not None
    assert resolved.url == 'http://proxy.com:1111'
    assert resolved.hostname == 'proxy.com'
    assert resolved.username == ''
    assert resolved.password == ''
    assert resolved.port == 1111


async def test_rotates_proxies() -> None:
    """Three consecutive calls without a session hand out the three proxy URLs in their listed order."""
    proxy_urls: list[str | None] = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333']
    proxy_config = ProxyConfiguration(proxy_urls=proxy_urls)

    for expected_url in proxy_urls:
        info = await proxy_config.new_proxy_info(None, None, None)
        assert info is not None
        assert info.url == expected_url


async def test_rotates_proxies_with_sessions() -> None:
    """Check proxy selection for repeated sessions, distinct sessions and calls without any session.

    Every step is a `(session_id, request, expected index into proxy_urls)` triple, executed in order against
    one shared `ProxyConfiguration`.
    """
    proxy_urls: list[str | None] = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333']
    request = Request(url='http://some.domain/abc', unique_key='1')
    sessions = [f'session_{i}' for i in range(6)]

    config = ProxyConfiguration(proxy_urls=proxy_urls)

    steps: list[tuple[str | None, Request | None, int]] = [
        # A single session should always receive the same proxy
        (sessions[0], None, 0),
        (sessions[0], None, 0),
        (sessions[0], None, 0),
        (sessions[0], request, 0),
        (sessions[0], request, 0),
        # Different sessions should get rotated proxies
        (sessions[1], None, 1),
        (sessions[2], request, 2),
        (sessions[3], None, 0),
        (sessions[4], None, 1),
        (sessions[5], request, 2),
        # Without sessions should get rotated proxies
        (None, None, 0),
        (None, request, 1),
        (None, None, 2),
        (None, None, 0),
        (None, request, 1),
    ]

    for session_id, step_request, expected_index in steps:
        info = await config.new_proxy_info(session_id, step_request, None)
        assert info is not None
        assert info.url == proxy_urls[expected_index]


@pytest.mark.parametrize(
    ('url', 'expected_port'),
    [
        # Default ports based on the URL scheme
        ('http://proxy.com', 80),
        ('https://proxy.com', 443),
        # Explicit ports specified in the URL
        ('http://proxy.com:80', 80),
        ('http://proxy.com:1234', 1234),
    ],
)
async def test_sets_port(url: str, expected_port: int) -> None:
    """Test that the `port` of the produced proxy info matches the expectation for the given URL.

    When the URL carries no explicit port, the expected value is the default port of its scheme.
    """
    proxy_config = ProxyConfiguration(proxy_urls=[url])
    proxy_info = await proxy_config.new_proxy_info(None, None, None)

    assert proxy_info is not None
    assert proxy_info.port == expected_port


================================================
FILE: tests/unit/proxy_configuration/test_tiers.py
================================================
from __future__ import annotations

from crawlee import Request
from crawlee.proxy_configuration import ProxyConfiguration


async def test_rotates_proxies_uniformly_with_no_request() -> None:
    """Without a request, proxies are handed out tier by tier in listed order and then start over."""
    tiered_proxy_urls: list[list[str | None]] = [
        ['http://proxy:1111', 'http://proxy:2222'],
        ['http://proxy:3333', 'http://proxy:4444'],
    ]

    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

    # All URLs in tier order, followed by the very first one again to cover the wrap-around.
    all_urls = [url for tier in tiered_proxy_urls for url in tier]
    expected_sequence = [*all_urls, all_urls[0]]

    for expected_url in expected_sequence:
        info = await config.new_proxy_info(None, None, None)
        assert info is not None
        assert info.url == expected_url


async def test_retrying_request_makes_tier_go_up() -> None:
    """Asking for a proxy repeatedly with the same request moves to a higher tier on every call."""
    tiered_proxy_urls: list[list[str | None]] = [
        ['http://proxy:1111'],
        ['http://proxy:2222'],
        ['http://proxy:3333'],
        ['http://proxy:4444'],
    ]

    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

    # Calling `new_proxy_info` with the same request most probably means it's being retried
    retried_request = Request(url='http://some.domain/abc', unique_key='1')

    for expected_tier in range(3):
        info = await config.new_proxy_info(None, retried_request, None)
        assert info is not None
        assert info.url == tiered_proxy_urls[expected_tier][0]

    # Subsequent requests with the same domain should use the same tier
    same_domain_request = Request(url='http://some.domain/xyz', unique_key='2')

    info = await config.new_proxy_info(None, same_domain_request, None)
    assert info is not None
    assert info.url == tiered_proxy_urls[2][0]


async def test_retrying_request_makes_tier_go_up_with_sessions() -> None:
    """A session keeps its proxy across retries of one request, while a new session gets the current tier."""
    tiered_proxy_urls: list[list[str | None]] = [
        ['http://proxy:1111'],
        ['http://proxy:2222'],
        ['http://proxy:3333'],
        ['http://proxy:4444'],
    ]

    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

    request = Request(url='http://some.domain/abc', unique_key='1')

    # Calling `new_proxy_info` with the same request likely means that it is being retried.
    # However, a single session should always receive the same proxy
    for _ in range(3):
        info = await config.new_proxy_info('session_id', request, None)
        assert info is not None
        assert info.url == tiered_proxy_urls[0][0]

    # For a new session, we will get a proxy from the corresponding tier
    for _ in range(2):
        info = await config.new_proxy_info('session_id2', request, None)
        assert info is not None
        assert info.url == tiered_proxy_urls[3][0]


async def test_successful_request_makes_tier_go_down() -> None:
    """Check that the proxy tier climbs on retries and later falls back to the lowest tier.

    Asking for a proxy repeatedly with one and the same request is treated by ProxyConfiguration as retrying,
    which moves the tier up. Asking afterwards with many different requests to the same domain brings the tier
    back down.
    """
    tiered_proxy_urls: list[list[str | None]] = [
        ['http://proxy:1111'],
        ['http://proxy:2222'],
        ['http://proxy:3333'],
        ['http://proxy:4444'],
    ]

    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

    retried_request = Request(url='http://some.domain/abc', unique_key='1')

    # One call per tier with the same request: each call is expected to land one tier higher.
    info = None
    for tier_urls in tiered_proxy_urls:
        info = await config.new_proxy_info(None, retried_request, None)
        assert info is not None
        assert info.url == tier_urls[0]

    # A hundred distinct requests to the same domain; only the proxy of the last call is checked below.
    for index in range(100):
        fresh_request = Request(url=f'http://some.domain/{index}', unique_key=str(index))
        info = await config.new_proxy_info(None, fresh_request, None)

    assert info is not None
    assert info.url == tiered_proxy_urls[0][0]


async def test_none_proxy_retrying_request_makes_tier_go_up() -> None:
    """A `None` (no proxy) tier is left for the next tier when the same request asks for a proxy again."""
    tiered_proxy_urls: list[list[str | None]] = [
        [None],
        ['http://proxy:1111'],
    ]

    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

    # Calling `new_proxy_info` with the same request most probably means it's being retried
    request_1 = Request(url='http://some.domain/abc', unique_key='1')

    # No proxy used.
    info = await config.new_proxy_info(None, request_1, None)
    assert info is None, 'First entry in tiered_proxy_urls is None. config.new_proxy_info is expected to generate None.'

    # Proxy should go up one tier for same request that was already sent before.
    info = await config.new_proxy_info(None, request_1, None)
    assert info is not None, (
        'config.new_proxy_info is expected to generate non-none proxy info from non-none tiered_proxy_urls.'
    )
    assert info.url == tiered_proxy_urls[1][0]


async def test_none_proxy_rotates_proxies_uniformly_with_no_request() -> None:
    """Within a single tier, a `None` (no proxy) entry takes part in the rotation like any proxy URL."""
    # Annotated like the other tiered tests in this module.
    tiered_proxy_urls: list[list[str | None]] = [
        [None, 'http://proxy:1111'],
    ]

    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

    # No proxy used.
    info = await config.new_proxy_info(None, None, None)
    assert info is None, 'First entry in tiered_proxy_urls is None. config.new_proxy_info is expected to generate None.'

    # Proxy should be rotated on the same proxy tier for a new request.
    info = await config.new_proxy_info(None, None, None)
    assert info is not None, (
        'config.new_proxy_info is expected to generate non-none proxy info from non-none tiered_proxy_urls.'
    )
    assert info.url == tiered_proxy_urls[0][1]

    # Proxy rotation starts from the beginning of the proxy list after last proxy in tier was used. No proxy used again.
    info = await config.new_proxy_info(None, None, None)
    assert info is None, 'First entry in tiered_proxy_urls is None. config.new_proxy_info is expected to generate None.'


================================================
FILE: tests/unit/request_loaders/test_request_list.py
================================================
from collections.abc import AsyncGenerator

from crawlee.request_loaders._request_list import RequestList
from crawlee.storages import KeyValueStore


async def test_sync_traversal() -> None:
    """Drain a `RequestList` built from a plain list of URLs, marking every fetched request as handled."""
    urls = ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']
    request_list = RequestList(urls)

    while True:
        if await request_list.is_finished():
            break

        request = await request_list.fetch_next_request()
        assert request is not None

        await request_list.mark_request_as_handled(request)

    assert await request_list.is_empty()


async def test_async_traversal() -> None:
    """Drain a `RequestList` fed by an async generator of URLs, marking every fetched request as handled."""

    async def url_source() -> AsyncGenerator[str]:
        yield 'https://a.placeholder.com'
        yield 'https://b.placeholder.com'
        yield 'https://c.placeholder.com'

    request_list = RequestList(url_source())

    while True:
        if await request_list.is_finished():
            break

        request = await request_list.fetch_next_request()
        assert request is not None

        await request_list.mark_request_as_handled(request)

    assert await request_list.is_empty()


async def test_is_empty_does_not_depend_on_fetch_next_request() -> None:
    """The list reports empty once everything was fetched, but finished only after all requests are handled."""
    request_list = RequestList(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])

    # Fetch all three requests without handling any of them; the list must not be finished in the meantime.
    in_progress = []
    for _ in range(3):
        fetched = await request_list.fetch_next_request()
        assert fetched is not None
        assert not await request_list.is_finished()
        in_progress.append(fetched)

    assert await request_list.is_empty()
    assert not await request_list.is_finished()

    for fetched in in_progress:
        await request_list.mark_request_as_handled(fetched)

    assert await request_list.is_empty()
    assert await request_list.is_finished()


async def test_persist_requests_key_with_sync_iterable() -> None:
    """Test that persist_requests_key persists request data from a sync iterable."""
    key = 'test_requests_persist_sync'
    source_urls = ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']

    # Persistence is enabled by passing the key to the constructor.
    request_list = RequestList(source_urls, persist_requests_key=key)

    # The first fetch triggers initialization of the list.
    fetched = await request_list.fetch_next_request()
    assert fetched is not None
    assert fetched.url == 'https://a.placeholder.com'

    # Something must now be stored under the key in the default key-value store.
    store = await KeyValueStore.open()
    stored = await store.get_value(key)
    assert stored is not None


async def test_persist_requests_key_with_empty_iterator() -> None:
    """Test behavior when persist_requests_key is provided but the iterator is empty."""
    key = 'test_empty_iterator'

    # No requests at all are supplied.
    request_list = RequestList([], persist_requests_key=key)

    # The list reports both empty and finished right away.
    assert await request_list.is_empty()
    assert await request_list.is_finished()

    # Even the empty request set must have been written under the key.
    store = await KeyValueStore.open()
    stored = await store.get_value(key)
    assert stored is not None


async def test_requests_restoration_without_state() -> None:
    """Test that persisted request data is properly restored on subsequent RequestList creation."""
    requests_key = 'test_requests_restoration'
    original_urls = ['https://restore1.placeholder.com', 'https://restore2.placeholder.com']

    # First instance: fetch and handle a single request.
    first_list = RequestList(original_urls, persist_requests_key=requests_key)
    handled = await first_list.fetch_next_request()
    assert handled is not None
    assert handled.url == 'https://restore1.placeholder.com'
    await first_list.mark_request_as_handled(handled)

    # Second instance shares the requests key (simulating a restart). No processing state is
    # persisted here, so it starts again from the beginning of the persisted data.
    spy = iter(['1', '2', '3'])
    second_list = RequestList(spy, persist_requests_key=requests_key)

    refetched = await second_list.fetch_next_request()
    assert refetched is not None
    assert refetched.url == 'https://restore1.placeholder.com'
    await second_list.mark_request_as_handled(refetched)

    # The input iterator of the second instance must be untouched.
    untouched = list(spy)
    assert len(untouched) == 3


async def test_state_restoration() -> None:
    """Test that persisted processing state is properly restored on subsequent RequestList creation."""
    state_key = 'test_state_restoration'
    urls = [
        'https://restore1.placeholder.com',
        'https://restore2.placeholder.com',
        'https://restore3.placeholder.com',
        'https://restore4.placeholder.com',
    ]

    # First instance: handle one request, then persist the processing state explicitly.
    first_list = RequestList(urls, persist_state_key=state_key)

    handled = await first_list.fetch_next_request()
    assert handled is not None
    assert handled.url == 'https://restore1.placeholder.com'
    await first_list.mark_request_as_handled(handled)
    await first_list._state.persist_state()

    # Second instance shares the state key (simulating a restart).
    second_list = RequestList(urls, persist_state_key=state_key)

    # It must carry on with the requests the first instance did not handle, in order.
    for expected_url in (
        'https://restore2.placeholder.com',
        'https://restore3.placeholder.com',
        'https://restore4.placeholder.com',
    ):
        resumed = await second_list.fetch_next_request()
        assert resumed is not None
        assert resumed.url == expected_url
        await second_list.mark_request_as_handled(resumed)


async def test_requests_and_state_restoration() -> None:
    """Test that persisted request data and processing state is properly restored on subsequent RequestList creation."""
    requests_key = 'test_requests_restoration'
    state_key = 'test_state_restoration'
    urls = [
        'https://restore1.placeholder.com',
        'https://restore2.placeholder.com',
        'https://restore3.placeholder.com',
    ]

    # First instance: handle one request, then persist the processing state explicitly.
    first_list = RequestList(urls, persist_requests_key=requests_key, persist_state_key=state_key)

    handled = await first_list.fetch_next_request()
    assert handled is not None
    assert handled.url == 'https://restore1.placeholder.com'
    await first_list.mark_request_as_handled(handled)
    await first_list._state.persist_state()

    # Second instance shares both keys (simulating a restart) but gets a different input iterator.
    spy = iter(['1', '2', '3'])
    second_list = RequestList(spy, persist_requests_key=requests_key, persist_state_key=state_key)

    # Requests come from the persisted data, continuing after the already handled one.
    for expected_url in ('https://restore2.placeholder.com', 'https://restore3.placeholder.com'):
        resumed = await second_list.fetch_next_request()
        assert resumed is not None
        assert resumed.url == expected_url
        await second_list.mark_request_as_handled(resumed)

    # The input iterator of the second instance must be untouched.
    untouched = list(spy)
    assert len(untouched) == 3


async def test_persist_requests_key_only_persists_once() -> None:
    """Test that requests are only persisted once, even with multiple RequestList instances."""
    key = 'test_requests_once'
    original_urls = ['https://once1.placeholder.com', 'https://once2.placeholder.com']

    # The first fetch on the first instance triggers persistence.
    first_list = RequestList(original_urls, persist_requests_key=key)
    await first_list.fetch_next_request()

    # Snapshot what was written under the key.
    store = await KeyValueStore.open()
    snapshot = await store.get_value(key)
    assert snapshot is not None

    # A second instance gets different input but the same key; it should read the persisted data.
    second_list = RequestList(['https://different.placeholder.com'], persist_requests_key=key)
    await second_list.fetch_next_request()

    # The stored value must be unchanged.
    assert await store.get_value(key) == snapshot

    # The next request comes from the original persisted data, not from the new input.
    second_request = await second_list.fetch_next_request()
    assert second_request is not None
    assert second_request.url == 'https://once2.placeholder.com'


================================================
FILE: tests/unit/request_loaders/test_sitemap_request_loader.py
================================================
import asyncio
import base64
import gzip
from typing import TYPE_CHECKING

from yarl import URL

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients._base import HttpClient
from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader
from crawlee.storages import KeyValueStore

if TYPE_CHECKING:
    from crawlee._types import JsonSerializable

# Sitemap XML fixture with five `<url>` entries. Query strings inside `<loc>` use the `&amp;` entity,
# so the URLs the tests expect contain a plain `&`. The trailing `.strip()` removes the newlines
# surrounding the literal so the document starts directly with the XML declaration.
BASIC_SITEMAP = """
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://not-exists.com/</loc>
<lastmod>2005-02-03</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>http://not-exists.com/catalog?item=12&amp;desc=vacation_hawaii</loc>
<changefreq>weekly</changefreq>
</url>
<url>
<loc>http://not-exists.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>
<lastmod>2004-12-23</lastmod>
<changefreq>weekly</changefreq>
</url>
<url>
<loc>http://not-exists.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>
<lastmod>2004-12-23T18:00:15+00:00</lastmod>
<priority>0.3</priority>
</url>
<url>
<loc>http://not-exists.com/catalog?item=83&amp;desc=vacation_usa</loc>
<lastmod>2004-11-23</lastmod>
</url>
</urlset>
""".strip()


def compress_gzip(data: str) -> bytes:
    """Return the gzip-compressed form of `data`, encoded with the default (UTF-8) codec first."""
    raw = data.encode()
    return gzip.compress(raw)


def encode_base64(data: bytes) -> str:
    """Return `data` as base64 text."""
    encoded = base64.b64encode(data)
    return str(encoded, 'utf-8')


async def test_sitemap_traversal(server_url: URL, http_client: HttpClient) -> None:
    """Drain a sitemap loader and check its emptiness, finished flag and both counters."""
    encoded_sitemap = encode_base64(BASIC_SITEMAP.encode())
    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_sitemap)
    loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client)

    while not await loader.is_finished():
        request = await loader.fetch_next_request()
        if not request:
            continue

        await loader.mark_request_as_handled(request)

    assert await loader.is_empty()
    assert await loader.is_finished()
    assert await loader.get_total_count() == 5
    assert await loader.get_handled_count() == 5


async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL, http_client: HttpClient) -> None:
    """Check that the loader is empty once all five URLs are fetched, but finished only after they are handled."""
    encoded_sitemap = encode_base64(BASIC_SITEMAP.encode())
    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_sitemap)
    loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client)

    fetched = []
    remaining = 5

    while remaining > 0:
        request = await loader.fetch_next_request()
        assert request is not None
        assert not await loader.is_finished()
        fetched.append(request)
        remaining -= 1

    assert await loader.is_empty()
    assert not await loader.is_finished()

    for request in fetched:
        await loader.mark_request_as_handled(request)

    assert await loader.is_empty()

    # A short pause precedes the final check of the finished flag.
    await asyncio.sleep(0.1)

    assert await loader.is_finished()


async def test_abort_sitemap_loading(server_url: URL, http_client: HttpClient) -> None:
    """Check that a loader aborted part-way reports finished after one more request is handled."""
    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
    # NOTE(review): `max_buffer_size=2` presumably caps how many URLs are buffered ahead of the
    # consumer, so loading is still in progress when it gets aborted below - confirm against the loader.
    sitemap_loader = SitemapRequestLoader([str(sitemap_url)], max_buffer_size=2, http_client=http_client)

    # Handle the first request; the loader must still have work left afterwards.
    item = await sitemap_loader.fetch_next_request()
    assert item is not None
    await sitemap_loader.mark_request_as_handled(item)

    assert not await sitemap_loader.is_empty()
    assert not await sitemap_loader.is_finished()

    await sitemap_loader.abort_loading()

    # After the abort one more request is still available; handling it finishes the loader.
    item = await sitemap_loader.fetch_next_request()
    assert item is not None
    await sitemap_loader.mark_request_as_handled(item)

    assert await sitemap_loader.is_finished()


async def test_create_persist_state_for_sitemap_loading(
    server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore
) -> None:
    """Check that closing a loader writes a state record with `handledCount` of zero under the persist key."""
    state_key = 'create_persist_state'
    encoded_sitemap = encode_base64(BASIC_SITEMAP.encode())
    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_sitemap)

    loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=state_key)
    finished = await loader.is_finished()
    assert finished is False

    await loader.close()

    persisted = await key_value_store.get_value(state_key)

    assert persisted is not None
    assert persisted['handledCount'] == 0


async def test_data_persistence_for_sitemap_loading(
    server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore
) -> None:
    """Check that the state persisted on close holds all five URLs, none of them handled."""
    state_key = 'data_persist_state'
    encoded_sitemap = encode_base64(BASIC_SITEMAP.encode())
    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_sitemap)
    loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=state_key)

    async def wait_until_loaded() -> None:
        # Poll until the loader has something to offer or reports that it is finished.
        while await loader.is_empty() and not await loader.is_finished():  # noqa: ASYNC110
            await asyncio.sleep(0.1)

    # Allow at most two seconds for the sitemap to load.
    await asyncio.wait_for(wait_until_loaded(), timeout=2)

    await loader.close()

    persisted = await key_value_store.get_value(state_key)

    assert persisted is not None
    assert persisted['handledCount'] == 0
    assert persisted['totalCount'] == 5
    assert len(persisted['urlQueue']) == 5


async def test_recovery_data_persistence_for_sitemap_loading(
    server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore
) -> None:
    """Check that a loader re-created with the same persist key resumes at the head of the persisted `urlQueue`."""
    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
    persist_key = 'recovery_persist_state'
    sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=persist_key)

    # Handle one request with the first loader, then close it.
    item = await sitemap_loader.fetch_next_request()

    assert item is not None
    await sitemap_loader.mark_request_as_handled(item)

    await sitemap_loader.close()

    # Read the state stored under the persist key after the close.
    state_data = await key_value_store.get_value(persist_key)

    assert state_data is not None
    # First entry of the persisted queue - the URL the next loader is expected to return.
    next_item_in_kvs = state_data['urlQueue'][0]

    # Second loader with the same persist key.
    sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=persist_key)

    item = await sitemap_loader.fetch_next_request()

    assert item is not None
    assert item.url == next_item_in_kvs


async def test_transform_request_function(server_url: URL, http_client: HttpClient) -> None:
    """Check that `transform_request_function` is applied to every request produced from the sitemap."""
    encoded_sitemap = encode_base64(BASIC_SITEMAP.encode())
    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_sitemap)

    def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
        # Tag each request so the loop below can tell that the transform ran.
        user_data: dict[str, JsonSerializable] = {'transformed': True}
        request_options['user_data'] = user_data
        return request_options

    loader = SitemapRequestLoader(
        [str(sitemap_url)],
        http_client=http_client,
        transform_request_function=transform_request,
    )

    seen_urls = set()

    while not await loader.is_finished():
        request = await loader.fetch_next_request()
        if not request:
            continue

        assert request.user_data.get('transformed') is True
        seen_urls.add(request.url)
        await loader.mark_request_as_handled(request)

    assert len(seen_urls) == 5
    assert seen_urls == {
        'http://not-exists.com/',
        'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
        'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
        'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
        'http://not-exists.com/catalog?item=83&desc=vacation_usa',
    }


================================================
FILE: tests/unit/server.py
================================================
from __future__ import annotations

import asyncio
import base64
import gzip
import json
import sys
import threading
import time
from collections.abc import Awaitable, Callable, Coroutine, Iterator
from typing import TYPE_CHECKING, Any
from urllib.parse import parse_qs

from uvicorn.server import Server
from yarl import URL

from tests.unit.server_endpoints import (
    BASE_INDEX,
    GENERIC_RESPONSE,
    HELLO_WORLD,
    INCAPSULA,
    INFINITE_SCROLL,
    NON_HREF_LINKS,
    PROBLEMATIC_LINKS,
    RESOURCE_LOADING_PAGE,
    ROBOTS_TXT,
    SECONDARY_INDEX,
    START_ENQUEUE,
    START_ENQUEUE_NON_HREF,
)

if TYPE_CHECKING:
    from socket import socket

# ASGI `receive` callable: takes no arguments and is awaited to obtain the next event dict.
Receive = Callable[[], Awaitable[dict[str, Any]]]
# ASGI `send` callable: takes a single event dict and is awaited to emit it.
Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]
# Signature shared by the endpoint handlers in this module: `(scope, receive, send)`.
PathHandler = Callable[[dict[str, Any], Receive, Send], Coroutine[None, None, None]]


def get_headers_dict(scope: dict[str, Any]) -> dict[str, str]:
    """Decode the `headers` pairs of an ASGI scope into a plain `str -> str` mapping.

    A scope without a `headers` key yields an empty dict. When a header name repeats,
    the last occurrence wins.
    """
    return {name.decode(): value.decode() for name, value in scope.get('headers', [])}


def get_query_params(query_string: bytes) -> dict[str, str]:
    """Parse a raw query string into a mapping of each key to its first value.

    Blank values are kept, so `b'x='` yields `{'x': ''}`.
    """
    parsed = parse_qs(query_string.decode(), keep_blank_values=True)
    return {name: candidates[0] for name, candidates in parsed.items() if candidates}


def get_cookies_from_headers(headers: dict[str, Any]) -> dict[str, str]:
    """Extract cookies from request headers.

    Args:
        headers: Request headers keyed by lower-case name; only the `cookie` entry is read.

    Returns:
        A mapping of cookie name to cookie value. Empty when there is no `cookie` header.
    """
    cookies: dict[str, str] = {}
    cookie_header: str = headers.get('cookie', '')

    for pair in cookie_header.split(';'):
        pair = pair.strip()
        if not pair:
            # Tolerate empty segments such as the one produced by a trailing ';'.
            continue

        # Split on the first '=' only: cookie values may themselves contain '='
        # (for example base64 padding), which a plain `split('=')` cannot unpack.
        name, _, value = pair.partition('=')
        cookies[name] = value

    return cookies


async def send_json_response(send: Send, data: Any, status: int = 200) -> None:
    """Emit a complete HTTP response whose body is `data` serialized as indented JSON.

    Args:
        send: The ASGI send function.
        data: Any JSON-serializable value.
        status: HTTP status code of the response.
    """
    start_event = {
        'type': 'http.response.start',
        'status': status,
        'headers': [[b'content-type', b'application/json']],
    }
    await send(start_event)

    payload = json.dumps(data, indent=2).encode()
    await send({'type': 'http.response.body', 'body': payload})


async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:
    """Emit a complete HTTP response carrying `html_content` as UTF-8 `text/html`.

    Args:
        send: The ASGI send function.
        html_content: Already encoded response body.
        status: HTTP status code of the response.
    """
    start_event = {
        'type': 'http.response.start',
        'status': status,
        'headers': [[b'content-type', b'text/html; charset=utf-8']],
    }
    body_event = {'type': 'http.response.body', 'body': html_content}

    await send(start_event)
    await send(body_event)


async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
    """ASGI entry point: dispatch on the first path segment, falling back to `hello_world`.

    Args:
        scope: The ASGI connection scope.
        receive: The ASGI receive function.
        send: The ASGI send function.
    """
    assert scope['type'] == 'http'
    paths: dict[str, PathHandler] = {
        'start_enqueue': start_enqueue_endpoint,
        'start_enqueue_non_href': start_enqueue_non_href_endpoint,
        'sub_index': secondary_index_endpoint,
        'incapsula': incapsula_endpoint,
        'page_1': generic_response_endpoint,
        'page_2': generic_response_endpoint,
        'page_3': generic_response_endpoint,
        'base_page': base_index_endpoint,
        'problematic_links': problematic_links_endpoint,
        'non_href_links': non_href_links_endpoint,
        'set_cookies': set_cookies,
        'set_complex_cookies': set_complex_cookies,
        'cookies': get_cookies,
        'status': echo_status,
        'headers': echo_headers,
        'user-agent': echo_user_agent,
        'echo_content': echo_content,
        'sitemap.txt': echo_content,
        'sitemap.xml': echo_content,
        'sitemap.xml.gz': echo_content,
        'get': get_echo,
        'post': post_echo,
        'redirect': redirect_to_url,
        'json': hello_world_json,
        'xml': hello_world_xml,
        'robots.txt': robots_txt,
        'get_compressed': get_compressed,
        'slow': slow_response,
        'infinite_scroll': infinite_scroll_endpoint,
        'resource_loading_page': resource_loading_endpoint,
    }

    # Only the first path segment selects the handler; unknown segments get the default page.
    first_segment = URL(scope['path']).parts[1]
    handler = paths.get(first_segment, hello_world)
    await handler(scope, receive, send)


async def get_cookies(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with a JSON object listing the cookies the client sent."""
    request_cookies = get_cookies_from_headers(get_headers_dict(scope))
    await send_json_response(send, {'cookies': request_cookies})


async def set_cookies(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Handle requests to set cookies from query parameters and redirect.

    Every query parameter with a non-empty value becomes a `Set-Cookie` header with `Path=/`.
    The response is a 302 redirect to `/cookies`.
    """
    query_params = get_query_params(scope.get('query_string', b''))

    headers = [
        [b'content-type', b'text/plain; charset=utf-8'],
        [b'location', b'/cookies'],  # Redirect header
    ]

    # `get_query_params` already reduces each key to a single string, so the value is used
    # as a whole; indexing it with `[0]` would keep only its first character.
    for name, value in query_params.items():
        if value:  # Skip parameters with an empty value
            headers.append([b'set-cookie', f'{name}={value}; Path=/'.encode()])

    await send(
        {
            'type': 'http.response.start',
            'status': 302,  # 302 Found for redirect
            'headers': headers,
        }
    )
    await send({'type': 'http.response.body', 'body': b'Redirecting to get_cookies...'})


async def hello_world(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Default handler: respond with the `HELLO_WORLD` HTML page."""
    await send_html_response(send, HELLO_WORLD)


async def hello_world_json(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with a fixed one-key JSON document."""
    payload = {'hello': 'world'}
    await send_json_response(send, payload)


async def hello_world_xml(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with a fixed XML document, sent through the HTML response helper."""
    xml_document = b"""<?xml version="1.0"?>
            <hello>world</hello>"""
    await send_html_response(send, xml_document)


async def post_echo(scope: dict[str, Any], receive: Receive, send: Send) -> None:
    """Echo back POST request details similar to httpbin.org/post.

    The JSON response contains the query arguments, the raw body text (empty when the body was
    parsed as a form), the parsed form or JSON payload, the request headers and the request URL.

    Args:
        scope: The ASGI connection scope.
        receive: The ASGI receive function, used to read the request body.
        send: The ASGI send function.
    """
    # Extract basic request info
    path = scope.get('path', '')
    args = get_query_params(scope.get('query_string', b''))
    headers = get_headers_dict(scope)

    # Read the request body, which may arrive in several chunks.
    chunks: list[bytes] = []
    while True:
        message = await receive()
        message_type = message['type']

        if message_type == 'http.disconnect':
            # The client went away: no further body will arrive and there is nobody to answer.
            # Without this exit the loop would keep receiving disconnect events forever.
            return

        if message_type != 'http.request':
            continue

        chunks.append(message.get('body', b''))
        if not message.get('more_body', False):
            break

    body = b''.join(chunks)

    # Parse body based on content type
    content_type = headers.get('content-type', '').lower()
    form: dict[str, Any] = {}
    json_data = None

    if body and 'application/json' in content_type:
        json_data = json.loads(body.decode())

    if body and 'application/x-www-form-urlencoded' in content_type:
        for key, values in parse_qs(body.decode()).items():
            # Single values are unwrapped, repeated keys keep the whole list.
            form[key] = values[0] if len(values) == 1 else values

    body_text = '' if form else body.decode('utf-8', errors='replace')

    # A missing Host header yields an empty host instead of a KeyError, matching `origin`.
    host = headers.get('host', '')

    # Prepare response
    response = {
        'args': args,
        'data': body_text,
        'files': {},  # Not handling multipart file uploads
        'form': form,
        'headers': headers,
        'json': json_data,
        'origin': host,
        'url': f'http://{host}{path}',
    }

    await send_json_response(send, response)


async def echo_status(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with an empty body and the status code taken from the path (`/status/<code>`)."""
    code_text = scope['path'].replace('/status/', '')
    status_code = int(code_text)

    start_event = {
        'type': 'http.response.start',
        'status': status_code,
        'headers': [[b'content-type', b'text/plain']],
    }
    await send(start_event)
    await send({'type': 'http.response.body', 'body': b''})


async def echo_headers(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the request headers serialized as a JSON object."""
    await send_json_response(send, get_headers_dict(scope))


async def start_enqueue_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `START_ENQUEUE` HTML page."""
    await send_html_response(send, START_ENQUEUE)


async def secondary_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `SECONDARY_INDEX` HTML page."""
    await send_html_response(send, SECONDARY_INDEX)


async def incapsula_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `INCAPSULA` HTML page."""
    await send_html_response(send, INCAPSULA)


async def generic_response_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `GENERIC_RESPONSE` HTML page."""
    await send_html_response(send, GENERIC_RESPONSE)


async def problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `PROBLEMATIC_LINKS` HTML page."""
    await send_html_response(send, PROBLEMATIC_LINKS)


async def non_href_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `NON_HREF_LINKS` HTML page."""
    await send_html_response(send, NON_HREF_LINKS)


async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Redirect to the URL given in the `url` query parameter.

    The status code comes from the `status` query parameter and defaults to 302; the target
    defaults to `http://example.com`.
    """
    params = get_query_params(scope.get('query_string', b''))

    destination = params.get('url', 'http://example.com')
    redirect_status = int(params.get('status', 302))

    response_headers = [
        [b'content-type', b'text/plain; charset=utf-8'],
        [b'location', destination.encode()],
    ]
    await send({'type': 'http.response.start', 'status': redirect_status, 'headers': response_headers})

    notice = f'Redirecting to {destination}...'
    await send({'type': 'http.response.body', 'body': notice.encode()})


async def echo_user_agent(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the request's `user-agent` header as JSON, or `'Not provided'` when absent."""
    request_headers = get_headers_dict(scope)
    payload = {'user-agent': request_headers.get('user-agent', 'Not provided')}
    await send_json_response(send, payload)


async def get_echo(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Echo back GET request details similar to httpbin.org/get.

    The JSON response contains the parsed query arguments, the request headers, the client
    address and the reconstructed request URL.

    Args:
        scope: The ASGI connection scope.
        _receive: The ASGI receive function (unused).
        send: The ASGI send function.
    """
    path = scope.get('path', '')
    query_string = scope.get('query_string', b'')
    args = get_query_params(query_string)
    headers = get_headers_dict(scope)

    # The `client` entry may be present but set to None, so fall back in that case as well.
    client = scope.get('client') or ('unknown', 0)
    origin = client[0]

    host = headers.get('host', 'localhost')
    scheme = headers.get('x-forwarded-proto', 'http')
    url = f'{scheme}://{host}{path}'
    if query_string:
        # `query_string` is bytes; decode it so the URL does not end in a `b'...'` literal.
        url += f'?{query_string.decode()}'

    response = {
        'args': args,
        'headers': headers,
        'origin': origin,
        'url': url,
    }

    await send_json_response(send, response)


async def set_complex_cookies(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with a fixed set of `Set-Cookie` headers covering a variety of cookie attributes."""
    cookie_definitions = (
        b'basic=1; Path=/; HttpOnly; SameSite=Lax',
        b'withpath=2; Path=/html; SameSite=None',
        b'strict=3; Path=/; SameSite=Strict',
        b'secure=4; Path=/; HttpOnly; Secure; SameSite=Strict; Partitioned',
        b'short=5; Path=/;',
        b'domain=6; Path=/; Domain=.127.0.0.1;',
    )

    headers = [[b'content-type', b'text/plain; charset=utf-8']]
    headers.extend([b'set-cookie', definition] for definition in cookie_definitions)

    await send({'type': 'http.response.start', 'status': 200, 'headers': headers})
    await send({'type': 'http.response.body', 'body': b'Cookies have been set!'})


async def echo_content(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with content supplied in the query string.

    The body comes from the `base64` parameter (decoded) when it is non-empty, otherwise from
    the `content` parameter. The `c_type` parameter sets the content type and defaults to HTML.
    """
    params = get_query_params(scope.get('query_string', b''))

    encoded = params.get('base64', '')
    if encoded:
        payload = base64.b64decode(encoded)
    else:
        payload = params.get('content', '').encode()

    content_type = params.get('c_type', 'text/html; charset=utf-8')

    await send(
        {
            'type': 'http.response.start',
            'status': 200,
            'headers': [[b'content-type', content_type.encode()]],
        }
    )
    await send({'type': 'http.response.body', 'body': payload})


async def robots_txt(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `ROBOTS_TXT` content, sent through the HTML response helper."""
    await send_html_response(send, html_content=ROBOTS_TXT)


async def get_compressed(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with `HELLO_WORLD` repeated 1000 times, gzip-compressed and flagged via `content-encoding`."""
    start_event = {
        'type': 'http.response.start',
        'status': 200,
        'headers': [[b'content-encoding', b'gzip']],
    }
    await send(start_event)

    compressed = gzip.compress(HELLO_WORLD * 1000)
    await send({'type': 'http.response.body', 'body': compressed})


async def slow_response(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Sleep for `delay` seconds (query parameter, default 5) before answering with `HELLO_WORLD`."""
    params = get_query_params(scope.get('query_string', b''))
    seconds = float(params.get('delay', '5'))

    await asyncio.sleep(seconds)
    await send_html_response(send, HELLO_WORLD)


async def infinite_scroll_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `INFINITE_SCROLL` HTML page."""
    await send_html_response(send, INFINITE_SCROLL)


async def resource_loading_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `RESOURCE_LOADING_PAGE` HTML page."""
    await send_html_response(send, RESOURCE_LOADING_PAGE)


async def base_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `BASE_INDEX` template, filled in with this server's `http://<host>` base."""
    request_host = get_headers_dict(_scope).get('host', 'localhost')
    page = BASE_INDEX.format(host=f'http://{request_host}')
    await send_html_response(send, page.encode())


async def start_enqueue_non_href_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `START_ENQUEUE_NON_HREF` template, filled in with this server's `http://<host>` base."""
    request_host = get_headers_dict(_scope).get('host', 'localhost')
    page = START_ENQUEUE_NON_HREF.format(host=f'http://{request_host}')
    await send_html_response(send, page.encode())


class TestServer(Server):
    """A test HTTP server implementation based on Uvicorn Server."""

    @property
    def url(self) -> URL:
        """Get the base URL of the server.

        Returns:
            A URL instance with the server's base URL.
        """
        protocol = 'https' if self.config.is_ssl else 'http'
        return URL(f'{protocol}://{self.config.host}:{self.config.port}/')

    async def serve(self, sockets: list[socket] | None = None) -> None:
        """Run the server and set up restart capability.

        Runs the parent `serve` and `watch_restarts` as two tasks and waits until both have finished.

        Args:
            sockets: Optional list of sockets to bind to.
        """
        self.restart_requested = asyncio.Event()

        # We are inside a coroutine, so a loop is guaranteed to be running; `get_running_loop()` is the
        # recommended accessor here and avoids the policy-dependent behavior of `get_event_loop()`.
        loop = asyncio.get_running_loop()
        tasks = {
            loop.create_task(super().serve(sockets=sockets)),
            loop.create_task(self.watch_restarts()),
        }
        await asyncio.wait(tasks)

    async def restart(self) -> None:
        """Request server restart and wait for it to complete.

        This method can be called from a different thread than the one the server
        is running on, and from a different async environment.
        """
        self.started = False
        # NOTE: `asyncio.Event` is documented as not thread-safe. Setting it from another thread still gets
        # noticed here because `watch_restarts` re-checks the event at least every 0.1 seconds.
        self.restart_requested.set()
        # Poll until `started` becomes truthy again after the restart.
        while not self.started:  # noqa: ASYNC110
            await asyncio.sleep(0.2)

    async def watch_restarts(self) -> None:
        """Watch for and handle restart requests.

        Returns once `should_exit` is set. Otherwise waits for `restart_requested` in 0.1 second slices and,
        when it is set, clears it and runs `shutdown()` followed by `startup()`.
        """
        while True:
            if self.should_exit:
                return

            try:
                # The short timeout bounds how long it takes to notice `should_exit`.
                await asyncio.wait_for(self.restart_requested.wait(), timeout=0.1)
            except asyncio.TimeoutError:
                continue

            self.restart_requested.clear()
            await self.shutdown()
            await self.startup()

    def run(self, sockets: list[socket] | None = None) -> None:
        """Run the server.

        Args:
            sockets: Optional list of sockets to bind to.
        """
        # Set the event loop policy in thread with server for Windows and Python 3.12+.
        # This is necessary because there are problems with closing connections when using `ProactorEventLoop`
        if sys.version_info >= (3, 12) and sys.platform == 'win32':
            return asyncio.run(self.serve(sockets=sockets), loop_factory=asyncio.SelectorEventLoop)
        super().run(sockets=sockets)
        return None


def serve_in_thread(server: TestServer) -> Iterator[TestServer]:
    """Run a server in a background thread and yield it once it has started.

    Args:
        server: The server whose `run` method is executed in the background thread.

    Yields:
        The same server instance, after its `started` flag became truthy.

    Raises:
        RuntimeError: If the server thread exits before the server reports that it has started.
    """
    thread = threading.Thread(target=server.run)
    thread.start()
    try:
        while not server.started:
            # If the thread has already finished, `started` can no longer change. Re-check it after
            # `is_alive()` so a server that started and exited in between is not reported as failed.
            if not thread.is_alive() and not server.started:
                raise RuntimeError('Test server thread exited before the server finished starting.')
            time.sleep(1e-3)
        yield server
    finally:
        # Ask the server to stop and wait for its thread, even when startup failed.
        server.should_exit = True
        thread.join()


================================================
FILE: tests/unit/server_endpoints.py
================================================
# Static response bodies served by the unit-test HTTP server.

# Minimal HTML document: a "Hello, world!" title and an empty body.
HELLO_WORLD = b"""\
<html><head>
    <title>Hello, world!</title>
</head>
<body>
</body></html>"""

# Page with two root-relative links (one carrying the CSS class `foo`) and one `mailto:` link.
START_ENQUEUE = b"""\
<html><head>
    <title>Hello</title>
</head>
<body>
    <a href="/sub_index" class="foo">Link 1</a>
    <a href="/page_1">Link 2</a>
    <a href="mailto:test@test.com">test@test.com</a>
</body></html>"""

# Text template (not bytes): `{host}` must be filled via `str.format` before encoding.
# Contains a `<base>` tag, two anchors, and two `<img src>` elements (one relative, one root-relative).
START_ENQUEUE_NON_HREF = """\
<html><head>
    <base href="{host}/base_subpath/">
    <title>Hello</title>
</head>
<body>
    <a href="/page_3">Link A</a>
    <a href="/page_2">Link B</a>
    <img src="image_1"/>
    <img src="/image_2"/>
</body></html>"""

# Page with three root-relative links: `/page_3`, `/page_2` and `/base_page`.
SECONDARY_INDEX = b"""\
<html><head>
    <title>Hello</title>
</head>
<body>
    <a href="/page_3">Link 3</a>
    <a href="/page_2">Link 4</a>
    <a href="/base_page">Base Page</a>
</body></html>"""

# Text template (not bytes): `{host}` must be filled via `str.format` before encoding.
# Declares two `<base>` tags and contains one relative link (`page_5`) and one root-relative link (`/page_4`).
# NOTE(review): the duplicate `<base>` tag looks intentional (exercising base-URL resolution) - confirm.
BASE_INDEX = """\
<html><head>
    <base href="{host}/base_subpath/">
    <base href="{host}/sub_index/">
    <title>Hello</title>
</head>
<body>
    <a href="page_5">Link 5</a>
    <a href="/page_4">Link 6</a>
</body></html>"""

# Page containing an iframe whose `src` is `Test_Incapsula_Resource`.
# NOTE(review): presumably used to exercise blocked-page detection - confirm against the tests that use it.
INCAPSULA = b"""\
<html><head>
    <title>Hello</title>
</head>
<body>
    <iframe src=Test_Incapsula_Resource>
    </iframe>
</body></html>"""

# Page with an absolute external link, a `mailto:` link, and an anchor with an unquoted `href`
# followed by a malformed closing tag (`/a>`).
# NOTE(review): the malformed markup looks intentional for this fixture - do not "fix" it without checking.
PROBLEMATIC_LINKS = b"""\
<html><head>
    <title>Hello</title>
</head>
<body>
    <a href="https://budplaceholder.com/">Placeholder</a>
    <a href="mailto:test@test.com">test@test.com</a>
    <a href=https://avatars.githubusercontent.com/apify>Apify avatar/a>
</body></html>"""

# Page with one regular `<a href>` link and one `<li>` that carries its target in a `data-href` attribute.
# Unlike most pages in this module, the body ends with a trailing newline.
NON_HREF_LINKS = b"""\
<html><head>
    <title>Hello</title>
</head>
<body>
    <a href="/page_1"></a>
    <li data-href="/page_2"></li>
</body></html>
"""

# Plain page with a title and a short text body; contains no links.
GENERIC_RESPONSE = b"""\
<html><head>
    <title>Hello</title>
</head>
<body>
    Insightful content
</body></html>"""


# robots.txt body with three user-agent groups (`*`, `Googlebot`, `Mozilla`), each with its own
# crawl-delay, plus two sitemap entries pointing at `not-exists.com`.
ROBOTS_TXT = b"""\
User-agent: *
Disallow: *deny_all/
Disallow: /page_
crawl-delay: 10

User-agent: Googlebot
Disallow: *deny_googlebot/
crawl-delay: 1

user-agent: Mozilla
crawl-delay: 2

sitemap: http://not-exists.com/sitemap_1.xml
sitemap: http://not-exists.com/sitemap_2.xml"""


# Page whose inline script renders 10 `.item` divs (200px high each) up front and appends 10 more
# each time the user scrolls to within 100px of the bottom. `loadMore` stops after 3 extra batches
# and waits 100 ms before appending, so the page ends with at most 40 items.
INFINITE_SCROLL = b"""\
<!DOCTYPE html>
<html>
<body>
    <div id="content"></div>

    <script>
        let page = 0;
        let loading = false;

        for (let i = 0; i < 10; i++) {
            const div = document.createElement('div');
            div.className = 'item';
            div.style.height = '200px';
            div.textContent = 'Item ' + (i + 1);
            document.getElementById('content').appendChild(div);
        }

        async function loadMore() {
            if (loading || page >= 3) return;
            loading = true;
            page++;

            await new Promise(resolve => setTimeout(resolve, 100));

            for (let i = 0; i < 10; i++) {
                const div = document.createElement('div');
                div.className = 'item';
                div.style.height = '200px';
                div.textContent = 'Item ' + (page * 10 + i + 1);
                document.getElementById('content').appendChild(div);
            }

            loading = false;
        }

        window.addEventListener('scroll', () => {
            if (window.innerHeight + window.scrollY >= document.body.offsetHeight - 100) {
                loadMore();
            }
        });
    </script>
</body>
</html>
"""

# Page that references two sub-resources under `/server_static/`: a script (`test.js`) and an image (`test.png`).
RESOURCE_LOADING_PAGE = b"""\
<!DOCTYPE html>
<html>
  <head>
    <script src="/server_static/test.js"></script>
  </head>
  <body>
    <img src="/server_static/test.png" />
  </body>
</html>
"""


================================================
FILE: tests/unit/server_static/test.js
================================================


================================================
FILE: tests/unit/sessions/test_cookies.py
================================================
from __future__ import annotations

import pytest

from crawlee.sessions._cookies import CookieParam, PlaywrightCookieParam, SessionCookies


@pytest.fixture
def cookie_dict() -> CookieParam:
    """Provide a cookie definition with every supported attribute populated."""
    attributes = {
        'name': 'test_cookie',
        'value': 'test_value',
        'domain': 'example.com',
        'path': '/test',
        'expires': 1735689600,
        'http_only': True,
        'secure': True,
        'same_site': 'Strict',
    }
    return CookieParam(attributes)


@pytest.fixture
def session_cookies(cookie_dict: CookieParam) -> SessionCookies:
    """Provide a `SessionCookies` container holding exactly the cookie from `cookie_dict`."""
    jar_wrapper = SessionCookies()
    jar_wrapper.set(**cookie_dict)
    return jar_wrapper


def test_set_basic_cookie() -> None:
    """Test setting a basic cookie with minimal attributes.

    Only the name and value are provided; the test checks the resulting path and that neither the
    `secure` flag nor the `HttpOnly` attribute is set.
    """
    session_cookies = SessionCookies()
    session_cookies.set('test', 'value')
    cookies = list(session_cookies.jar)

    assert len(cookies) == 1
    cookie = cookies[0]
    assert cookie.name == 'test'
    assert cookie.value == 'value'
    assert cookie.path == '/'
    assert not cookie.secure
    # The attribute name must match the one checked for cookies created with `http_only=True`;
    # a misspelled name would make this negative assertion pass unconditionally.
    assert not cookie.has_nonstandard_attr('HttpOnly')


def test_set_cookie_with_all_attributes(session_cookies: SessionCookies, cookie_dict: CookieParam) -> None:
    """Verify that every attribute passed to `set` ends up on the stored cookie."""
    stored = list(session_cookies.jar)
    assert len(stored) == 1

    stored_cookie = stored[0]

    # Standard attributes are compared against the values from the fixture.
    assert stored_cookie.name == cookie_dict.get('name')
    assert stored_cookie.value == cookie_dict.get('value')
    assert stored_cookie.path == cookie_dict.get('path')
    assert stored_cookie.domain == cookie_dict.get('domain')
    assert stored_cookie.expires == cookie_dict.get('expires')

    # Flags and non-standard attributes.
    assert stored_cookie.has_nonstandard_attr('HttpOnly')
    assert stored_cookie.secure
    assert stored_cookie.get_nonstandard_attr('SameSite') == 'Strict'


def test_convert_cookie_to_dict(session_cookies: SessionCookies, cookie_dict: CookieParam) -> None:
    """Check that a stored cookie converts back to the dictionary it was created from."""
    stored = list(session_cookies.jar)
    assert len(stored) == 1

    round_tripped = session_cookies._convert_cookie_to_dict(stored[0])
    assert round_tripped == cookie_dict


def test_convert_dict_format(session_cookies: SessionCookies) -> None:
    """Check key renaming when converting cookies between the internal and Playwright formats."""
    # Internal (snake_case) -> Playwright (camelCase).
    snake_case_cookie = CookieParam({'name': 'test', 'value': 'value', 'http_only': True, 'same_site': 'Lax'})
    as_playwright = session_cookies._to_playwright(snake_case_cookie)
    assert 'httpOnly' in as_playwright
    assert 'sameSite' in as_playwright
    assert 'http_only' not in as_playwright
    assert 'same_site' not in as_playwright

    # Playwright (camelCase) -> internal (snake_case).
    camel_case_cookie = PlaywrightCookieParam({'name': 'test', 'value': 'value', 'httpOnly': True, 'sameSite': 'Lax'})
    as_internal = session_cookies._from_playwright(camel_case_cookie)
    assert 'http_only' in as_internal
    assert 'same_site' in as_internal
    assert 'httpOnly' not in as_internal
    assert 'sameSite' not in as_internal


def test_get_cookies_as_browser_format(session_cookies: SessionCookies, cookie_dict: CookieParam) -> None:
    """Check that exported Playwright-format cookies use camelCase keys with the original values."""
    exported = session_cookies.get_cookies_as_playwright_format()
    assert len(exported) == 1

    exported_cookie = exported[0]
    assert 'httpOnly' in exported_cookie
    assert 'sameSite' in exported_cookie

    expected_http_only = cookie_dict.get('http_only')
    expected_same_site = cookie_dict.get('same_site')
    assert exported_cookie['httpOnly'] == expected_http_only
    assert exported_cookie['sameSite'] == expected_same_site


def test_get_cookies_as_dicts(session_cookies: SessionCookies, cookie_dict: CookieParam) -> None:
    """Check that `get_cookies_as_dicts` returns a one-element list equal to the fixture cookie."""
    exported = session_cookies.get_cookies_as_dicts()
    expected = [cookie_dict]

    assert expected == exported


def test_store_cookie(session_cookies: SessionCookies) -> None:
    """Check that storing an existing `Cookie` object in an empty container reproduces the source container."""
    source_cookie = list(session_cookies.jar)[0]

    copy_target = SessionCookies()
    copy_target.store_cookie(source_cookie)

    assert copy_target == session_cookies


def test_store_multidomain_cookies() -> None:
    """Check that cookies sharing a name but set for different domains are stored separately."""
    container = SessionCookies()
    container.set(name='a', value='1', domain='test.io')
    container.set(name='a', value='2', domain='notest.io')

    # Index the exported cookies by domain so each one can be checked individually.
    by_domain = {}
    for exported in container.get_cookies_as_dicts():
        by_domain[exported.get('domain')] = (exported['name'], exported['value'])

    assert len(by_domain) == 2

    assert by_domain['test.io'] == ('a', '1')
    assert by_domain['notest.io'] == ('a', '2')


================================================
FILE: tests/unit/sessions/test_models.py
================================================
from __future__ import annotations

from datetime import datetime, timedelta, timezone

import pytest

from crawlee.sessions._cookies import CookieParam
from crawlee.sessions._models import SessionModel

# Single timestamp captured at import time and shared by all fixtures below, so models built
# from different inputs can be compared for equality.
SESSION_CREATED_AT = datetime.now(timezone.utc)


@pytest.fixture
def session_direct() -> SessionModel:
    """Provide a `SessionModel` built from native Python values passed as snake_case keyword arguments."""
    cookie = CookieParam({'name': 'cookie_key', 'value': 'cookie_value'})
    half_hour = timedelta(minutes=30)

    return SessionModel(
        id='test_session',
        created_at=SESSION_CREATED_AT,
        max_age=half_hour,
        user_data={'user_key': 'user_value'},
        cookies=[cookie],
        blocked_status_codes=[401, 403, 429],
        usage_count=0,
        max_usage_count=10,
        error_score=0.0,
        max_error_score=3.0,
        error_score_decrement=0.5,
    )


@pytest.fixture
def session_args_camel() -> dict:
    """Provide session constructor arguments keyed in camelCase, with `maxAge` given as a string."""
    cookie = CookieParam({'name': 'cookie_key', 'value': 'cookie_value'})
    camel_args = {
        'id': 'test_session',
        'maxAge': '00:30:00',
        'userData': {'user_key': 'user_value'},
        'maxErrorScore': 3.0,
        'errorScoreDecrement': 0.5,
        'createdAt': SESSION_CREATED_AT,
        'usageCount': 0,
        'maxUsageCount': 10,
        'errorScore': 0.0,
        'cookies': [cookie],
        'blockedStatusCodes': [401, 403, 429],
    }
    return camel_args


@pytest.fixture
def session_args_snake() -> dict:
    """Provide session constructor arguments keyed in snake_case, with `max_age` given as a string."""
    cookie = CookieParam({'name': 'cookie_key', 'value': 'cookie_value'})
    snake_args = {
        'id': 'test_session',
        'max_age': '00:30:00',
        'user_data': {'user_key': 'user_value'},
        'max_error_score': 3.0,
        'error_score_decrement': 0.5,
        'created_at': SESSION_CREATED_AT,
        'usage_count': 0,
        'max_usage_count': 10,
        'error_score': 0.0,
        'cookies': [cookie],
        'blocked_status_codes': [401, 403, 429],
    }
    return snake_args


def test_session_model(
    session_direct: SessionModel,
    session_args_camel: dict,
    session_args_snake: dict,
) -> None:
    """Check that models built directly, from camelCase kwargs and from snake_case kwargs are equivalent."""
    from_camel = SessionModel(**session_args_camel)
    from_snake = SessionModel(**session_args_snake)

    # Whole-model equality across all three construction paths.
    assert session_direct == from_camel
    assert from_camel == from_snake

    # The identifier is preserved on every path.
    assert session_direct.id == from_camel.id
    assert from_camel.id == from_snake.id
    assert from_snake.id == 'test_session'

    # The '00:30:00' string inputs must have been parsed into the same timedelta as the direct value.
    expected_max_age = timedelta(minutes=30)
    assert session_direct.max_age == from_camel.max_age
    assert from_camel.max_age == from_snake.max_age
    assert from_snake.max_age == expected_max_age


================================================
FILE: tests/unit/sessions/test_session.py
================================================
from __future__ import annotations

from datetime import datetime, timedelta, timezone

import pytest

from crawlee.sessions._cookies import SessionCookies
from crawlee.sessions._session import Session


@pytest.fixture
def session() -> Session:
    """Provide a fresh, unused session with a 30 minute lifetime and a maximum error score of 3."""
    created_now = datetime.now(timezone.utc)
    return Session(
        id='test_session',
        created_at=created_now,
        max_age=timedelta(minutes=30),
        user_data={'user_key': 'user_value'},
        cookies={'cookie_key': 'cookie_value'},
        blocked_status_codes=[401, 403, 429],
        usage_count=0,
        max_usage_count=10,
        error_score=0.0,
        max_error_score=3.0,
        error_score_decrement=0.5,
    )


def test_session_init(session: Session) -> None:
    """Check the properties of a freshly created session: identity, data, cookies and usability flags."""
    expected_cookies = SessionCookies({'cookie_key': 'cookie_value'})

    assert session.id == 'test_session'
    assert session.user_data == {'user_key': 'user_value'}
    assert session.cookies == expected_cookies
    assert session.expires_at >= datetime.now(timezone.utc)

    # A new session must not be flagged by any of the retirement conditions.
    assert not session.is_blocked
    assert not session.is_expired
    assert not session.is_max_usage_count_reached
    assert session.is_usable


def test_session_get_state(session: Session) -> None:
    """Check that session state is available as a dict and as a model, and that the model can rebuild a session."""
    state_as_dict = session.get_state(as_dict=True)
    assert state_as_dict['id'] == 'test_session'

    state_as_model = session.get_state(as_dict=False)
    assert state_as_model.id == 'test_session'

    rebuilt = Session.from_model(state_as_model)
    assert rebuilt.id == 'test_session'


def test_mark_good(session: Session) -> None:
    """Check that `mark_good` increments the usage count and leaves the error score at zero."""
    usage_before = session.usage_count

    session.mark_good()

    assert session.usage_count == usage_before + 1
    assert session.error_score == 0


def test_mark_bad(session: Session) -> None:
    """Check that `mark_bad` raises the error score by one."""
    score_before = session.error_score

    session.mark_bad()

    assert session.error_score == score_before + 1


def test_multiple_marks(session: Session) -> None:
    """Test the mark_good and mark_bad methods in sequence.

    The error score is tracked relative to its initial value: two bad marks add 2, two good marks bring
    the net change down to 1, and a further two bad marks followed by one good mark leave the session
    blocked and unusable.
    """
    # The baseline must come from the error score, since that is the quantity asserted below.
    initial_error_score = session.error_score
    session.mark_bad()
    session.mark_bad()
    assert session.error_score == initial_error_score + 2
    session.mark_good()
    session.mark_good()
    assert session.error_score == initial_error_score + 1
    session.mark_bad()
    session.mark_bad()
    session.mark_good()
    assert session.is_blocked
    assert not session.is_usable


def test_retire_method(session: Session) -> None:
    """Check that `retire` makes the session unusable and leaves its error score at 3.0."""
    session.retire()

    retired_score = session.error_score
    assert not session.is_usable
    assert retired_score == 3.0


def test_retire_on_blocked_status_code(session: Session) -> None:
    """Check that a status code listed in the session's blocked status codes is reported as blocked."""
    blocked_code = 403
    assert session.is_blocked_status_code(status_code=blocked_code) is True


def test_not_retire_on_not_block_status_code(session: Session) -> None:
    """Check that a status code absent from the session's blocked status codes is not reported as blocked."""
    ok_code = 200
    assert session.is_blocked_status_code(status_code=ok_code) is False


def test_session_expiration() -> None:
    """Check that a session created one hour ago, with default settings, is reported as expired."""
    one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1)
    stale_session = Session(created_at=one_hour_ago)

    assert stale_session.is_expired


================================================
FILE: tests/unit/sessions/test_session_pool.py
================================================
from __future__ import annotations

import logging
from datetime import datetime, timezone
from typing import TYPE_CHECKING

import pytest

from crawlee import service_locator
from crawlee.events import EventManager
from crawlee.events._types import Event, EventPersistStateData
from crawlee.sessions import Session, SessionPool
from crawlee.sessions._models import SessionPoolModel
from crawlee.storages import KeyValueStore

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

# Pool size passed as `max_pool_size` to every SessionPool created in this module.
MAX_POOL_SIZE = 3
# Name of the key-value store used by the persistence tests.
KVS_NAME = 'test-session-pool'
# Key under which the session pool state is persisted in the key-value store.
PERSIST_STATE_KEY = 'crawlee_session_pool_state'


@pytest.fixture
async def kvs() -> AsyncGenerator[KeyValueStore, None]:
    """Open the named test key-value store and drop it again once the test finishes."""
    store = await KeyValueStore.open(name=KVS_NAME)
    yield store
    await store.drop()


@pytest.fixture
async def event_manager() -> AsyncGenerator[EventManager, None]:
    """Provide an `EventManager` that stays entered as a context manager for the duration of the test."""
    async with EventManager() as manager:
        yield manager


@pytest.fixture
async def session_pool() -> AsyncGenerator[SessionPool, None]:
    """Provide an entered `SessionPool` of `MAX_POOL_SIZE` with persistence disabled."""
    pool = SessionPool(max_pool_size=MAX_POOL_SIZE, persistence_enabled=False)
    async with pool as active_pool:
        yield active_pool


async def test_session_pool_init(session_pool: SessionPool) -> None:
    """Check that a newly entered pool is filled to `MAX_POOL_SIZE` with usable sessions only."""
    counts = (
        session_pool.session_count,
        session_pool.usable_session_count,
        session_pool.retired_session_count,
    )
    assert counts[0] == MAX_POOL_SIZE
    assert counts[1] == MAX_POOL_SIZE
    assert counts[2] == 0


async def test_add_session(session_pool: SessionPool) -> None:
    """Check that adding two sessions with distinct IDs raises the total and usable counts by two."""
    first = Session(id='test_session_01')
    second = Session(id='test_session_02')

    for new_session in (first, second):
        session_pool.add_session(session=new_session)

    expected_total = MAX_POOL_SIZE + 2
    assert session_pool.session_count == expected_total
    assert session_pool.usable_session_count == expected_total
    assert session_pool.retired_session_count == 0


async def test_add_session_duplicate(caplog: pytest.LogCaptureFixture, session_pool: SessionPool) -> None:
    """Check that adding a second session with an already used ID does not increase the session count."""
    original = Session(id='test_session_01')
    duplicate = Session(id='test_session_01')
    expected_total = MAX_POOL_SIZE + 1

    session_pool.add_session(session=original)
    assert session_pool.session_count == expected_total

    # The duplicate is added while capturing WARNING-level logs.
    with caplog.at_level(logging.WARNING):
        session_pool.add_session(session=duplicate)

    assert session_pool.session_count == expected_total


async def test_get_session(session_pool: SessionPool) -> None:
    """Check that the pool hands out a session that is present, unexpired and usable."""
    picked = await session_pool.get_session()

    assert picked is not None
    assert picked.expires_at >= datetime.now(timezone.utc)

    # None of the retirement conditions may apply to a freshly obtained session.
    assert not picked.is_blocked
    assert not picked.is_expired
    assert not picked.is_max_usage_count_reached
    assert picked.is_usable


async def test_get_session_no_usable(caplog: pytest.LogCaptureFixture, session_pool: SessionPool) -> None:
    """Check that looking up an unknown ID or a retired session by ID returns None."""
    missing = await session_pool.get_session_by_id('non_existent')
    assert missing is None

    # Add a session that is already retired, so it is in the pool but not usable.
    retired = Session(id='test_session_not_usable')
    retired.retire()
    assert not retired.is_usable
    session_pool.add_session(session=retired)
    assert session_pool.session_count == MAX_POOL_SIZE + 1

    with caplog.at_level(logging.WARNING):
        looked_up = await session_pool.get_session_by_id('test_session_not_usable')
        assert looked_up is None


async def test_create_session_function() -> None:
    """Check that sessions produced by a custom `create_session_function` carry the data it sets."""
    marker = {'created_by': 'test_create_session_function'}

    def _make_session() -> Session:
        return Session(user_data=marker)

    pool = SessionPool(
        max_pool_size=MAX_POOL_SIZE,
        persistence_enabled=False,
        create_session_function=_make_session,
    )
    async with pool as active_pool:
        picked = await active_pool.get_session()
        assert picked is not None
        assert picked.user_data == marker


@pytest.mark.parametrize('kvs_name', [KVS_NAME, None])
async def test_session_pool_persist(event_manager: EventManager, kvs_name: str | None) -> None:
    """Check that a persist-state event writes the pool state to the key-value store and that it matches the pool.

    Runs once against the named store and once with `kvs_name=None`.
    """
    service_locator.set_event_manager(event_manager)

    async with SessionPool(
        max_pool_size=MAX_POOL_SIZE,
        persistence_enabled=True,
        persist_state_kvs_name=kvs_name,
        persist_state_key=PERSIST_STATE_KEY,
    ) as pool:
        # Trigger persistence and wait until every listener has handled the event.
        persist_event_data = EventPersistStateData(is_migrating=False)
        event_manager.emit(event=Event.PERSIST_STATE, event_data=persist_event_data)
        await event_manager.wait_for_all_listeners_to_complete()

        # Read back what was stored under the persistence key.
        store = await KeyValueStore.open(name=kvs_name)
        stored_state = await store.get_value(key=PERSIST_STATE_KEY)
        assert isinstance(stored_state, dict)
        stored_model = SessionPoolModel(**stored_state)

        # Aggregate counters must match the live pool.
        assert stored_model.session_count == pool.session_count
        assert stored_model.usable_session_count == pool.usable_session_count
        assert stored_model.retired_session_count == pool.retired_session_count

        # Every persisted session must equal the session the pool returns for the same ID.
        for stored_session in stored_model.sessions.values():
            live_session = await pool.get_session_by_id(stored_session.id)
            assert stored_session == live_session


async def test_session_pool_persist_and_restore(event_manager: EventManager, kvs: KeyValueStore) -> None:
    """Check that a second pool using the same persistence key can reset the store, leaving no stored state."""
    service_locator.set_event_manager(event_manager)

    # First pool: persist its state to the key-value store.
    async with SessionPool(
        max_pool_size=MAX_POOL_SIZE,
        persistence_enabled=True,
        persist_state_kvs_name=KVS_NAME,
        persist_state_key=PERSIST_STATE_KEY,
    ):
        persist_event_data = EventPersistStateData(is_migrating=False)
        event_manager.emit(event=Event.PERSIST_STATE, event_data=persist_event_data)
        await event_manager.wait_for_all_listeners_to_complete()

    # Second pool: same store and key as the first one.
    async with SessionPool(
        max_pool_size=MAX_POOL_SIZE,
        persistence_enabled=True,
        persist_state_kvs_name=KVS_NAME,
        persist_state_key=PERSIST_STATE_KEY,
    ) as second_pool:
        # Resetting the store must remove the value under the persistence key.
        await second_pool.reset_store()
        state_after_reset = await kvs.get_value(key=PERSIST_STATE_KEY)
        assert state_after_reset is None


async def test_methods_raise_error_when_not_active() -> None:
    """Check that pool methods raise while the pool is inactive, and that it cannot be entered twice."""
    spare_session = Session()
    pool = SessionPool()

    assert pool.active is False

    # Each of these calls must fail before the pool has been entered.
    with pytest.raises(RuntimeError, match=r'SessionPool is not active.'):
        pool.get_state(as_dict=True)

    with pytest.raises(RuntimeError, match=r'SessionPool is not active.'):
        pool.add_session(spare_session)

    with pytest.raises(RuntimeError, match=r'SessionPool is not active.'):
        await pool.get_session()

    with pytest.raises(RuntimeError, match=r'SessionPool is not active.'):
        await pool.get_session_by_id(spare_session.id)

    # `reset_store` is called on the inactive pool outside of any `pytest.raises` block.
    await pool.reset_store()

    # Entering the same pool twice must be rejected.
    with pytest.raises(RuntimeError, match=r'SessionPool is already active.'):
        async with pool, pool:
            pass

    async with pool:
        assert pool.active is True


================================================
FILE: tests/unit/storage_clients/_file_system/test_fs_dataset_client.py
================================================
from __future__ import annotations

import asyncio
import json
from pathlib import Path
from typing import TYPE_CHECKING

import pytest

from crawlee._consts import METADATA_FILENAME
from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from crawlee.storage_clients._file_system import FileSystemDatasetClient


@pytest.fixture
def configuration(tmp_path: Path) -> Configuration:
    """Provide a `Configuration` whose storage directory is the per-test temporary path."""
    storage_root = str(tmp_path)
    return Configuration(storage_dir=storage_root)


@pytest.fixture
async def dataset_client(configuration: Configuration) -> AsyncGenerator[FileSystemDatasetClient, None]:
    """Create a file system dataset client named 'test-dataset' and drop it after the test."""
    storage_client = FileSystemStorageClient()
    fs_dataset = await storage_client.create_dataset_client(name='test-dataset', configuration=configuration)
    yield fs_dataset
    await fs_dataset.drop()


async def test_file_and_directory_creation(configuration: Configuration) -> None:
    """Check that creating a dataset client creates the dataset directory and a valid metadata file."""
    storage_client = FileSystemStorageClient()
    fs_dataset = await storage_client.create_dataset_client(name='new-dataset', configuration=configuration)

    # Both the dataset directory and its metadata file must exist.
    assert fs_dataset.path_to_dataset.exists()
    assert fs_dataset.path_to_metadata.exists()

    # Compare the metadata stored on disk with what the client reports.
    with fs_dataset.path_to_metadata.open() as metadata_file:
        on_disk = json.load(metadata_file)
        reported = await fs_dataset.get_metadata()
        assert on_disk['id'] == reported.id
        assert on_disk['name'] == 'new-dataset'
        assert on_disk['item_count'] == 0

    await fs_dataset.drop()


async def test_file_persistence_and_content_verification(dataset_client: FileSystemDatasetClient) -> None:
    """Check that each pushed item is written to its own JSON file with the expected content."""
    single_item = {'key': 'value', 'number': 42}
    await dataset_client.push_data(single_item)

    # After one push the directory holds the item file plus the metadata file.
    json_files = list(dataset_client.path_to_dataset.glob('*.json'))
    assert len(json_files) == 2  # 1 data file + 1 metadata file

    item_files = [path for path in json_files if path.name != METADATA_FILENAME]
    assert len(item_files) == 1

    # The stored file must contain exactly the pushed item.
    with Path(item_files[0]).open() as item_file:
        stored_item = json.load(item_file)
        assert stored_item == single_item

    # Pushing a list must create one file per list element.
    batch = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}]
    await dataset_client.push_data(batch)

    json_files = list(dataset_client.path_to_dataset.glob('*.json'))
    assert len(json_files) == 5  # 4 data files + 1 metadata file

    item_files = [path for path in json_files if path.name != METADATA_FILENAME]
    assert len(item_files) == 4  # Original item + 3 new items


async def test_drop_removes_files_from_disk(dataset_client: FileSystemDatasetClient) -> None:
    """Check that dropping a dataset removes its directory from disk."""
    await dataset_client.push_data({'test': 'data'})
    dataset_dir = dataset_client.path_to_dataset
    assert dataset_dir.exists()

    await dataset_client.drop()

    # The path is read from the client again after the drop, as the original check does.
    assert not dataset_client.path_to_dataset.exists()


async def test_metadata_file_updates(dataset_client: FileSystemDatasetClient) -> None:
    """Check that reads bump `accessed_at`, writes bump `modified_at`, and `created_at` never changes."""
    # Baseline timestamps before any operation.
    baseline = await dataset_client.get_metadata()
    created_at_start = baseline.created_at
    accessed_at_start = baseline.accessed_at
    modified_at_start = baseline.modified_at

    # Small pause so that later timestamps can differ from the baseline.
    await asyncio.sleep(0.01)

    # A read is expected to touch only `accessed_at`.
    await dataset_client.get_data()

    after_read = await dataset_client.get_metadata()
    assert after_read.created_at == created_at_start
    assert after_read.accessed_at > accessed_at_start
    assert after_read.modified_at == modified_at_start

    accessed_at_after_read = after_read.accessed_at

    # Small pause again before the write.
    await asyncio.sleep(0.01)

    # A write is expected to move both `modified_at` and `accessed_at` forward.
    await dataset_client.push_data({'new': 'item'})

    after_write = await dataset_client.get_metadata()
    assert after_write.created_at == created_at_start
    assert after_write.modified_at > modified_at_start
    assert after_write.accessed_at > accessed_at_after_read

    # The metadata file on disk must reflect the pushed item.
    with dataset_client.path_to_metadata.open() as metadata_file:
        on_disk = json.load(metadata_file)
        assert on_disk['item_count'] == 1


async def test_data_persistence_across_reopens() -> None:
    """Check that an item pushed through one client is readable through a client reopened by dataset ID."""
    fs_storage = FileSystemStorageClient()

    # Create the dataset by name and store one item in it.
    first_client = await fs_storage.create_dataset_client(name='persistence-test')
    pushed_item = {'test_item': 'test_value', 'id': 123}
    await first_client.push_data(pushed_item)

    first_metadata = await first_client.get_metadata()
    dataset_id = first_metadata.id

    # Open the same dataset again, this time by its ID.
    second_client = await fs_storage.create_dataset_client(id=dataset_id)

    listing = await second_client.get_data()
    assert len(listing.items) == 1
    assert listing.items[0] == pushed_item

    await second_client.drop()


================================================
FILE: tests/unit/storage_clients/_file_system/test_fs_kvs_client.py
================================================
from __future__ import annotations

import asyncio
import json
from typing import TYPE_CHECKING

import pytest

from crawlee._consts import METADATA_FILENAME
from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from pathlib import Path

    from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient


@pytest.fixture
def configuration(tmp_path: Path) -> Configuration:
    """Provide a `Configuration` whose storage directory is the per-test temporary path."""
    storage_root = str(tmp_path)
    return Configuration(storage_dir=storage_root)


@pytest.fixture
async def kvs_client(configuration: Configuration) -> AsyncGenerator[FileSystemKeyValueStoreClient, None]:
    """Yield a file system key-value store client named 'test-kvs' and drop it after the test."""
    fs_storage = FileSystemStorageClient()
    store = await fs_storage.create_kvs_client(name='test-kvs', configuration=configuration)
    yield store
    await store.drop()


async def test_file_and_directory_creation(configuration: Configuration) -> None:
    """Check that a new file system KVS has its directory and a metadata file with matching ID and name."""
    store = await FileSystemStorageClient().create_kvs_client(name='new-kvs', configuration=configuration)

    # Both the store directory and its metadata file must be on disk.
    assert store.path_to_kvs.exists()
    assert store.path_to_metadata.exists()

    # The metadata file must describe the same store that the client reports.
    with store.path_to_metadata.open() as metadata_file:
        stored_metadata = json.load(metadata_file)

    live_metadata = await store.get_metadata()
    assert stored_metadata['id'] == live_metadata.id
    assert stored_metadata['name'] == 'new-kvs'

    await store.drop()


async def test_value_file_creation_and_content(kvs_client: FileSystemKeyValueStoreClient) -> None:
    """Check that a stored string is written to a value file plus a metadata file with the expected fields."""
    key, text = 'test-key', 'Hello, world!'
    await kvs_client.set_value(key=key, value=text)

    # The value file and its metadata file sit side by side in the store directory.
    store_dir = kvs_client.path_to_kvs
    value_file = store_dir / key
    sidecar_file = store_dir / f'{key}.{METADATA_FILENAME}'
    assert value_file.exists()
    assert sidecar_file.exists()

    # The value file holds the raw text.
    assert value_file.read_text(encoding='utf-8') == text

    # The metadata file records the key, the content type and the encoded size.
    with sidecar_file.open() as sidecar:
        record_info = json.load(sidecar)

    expected_size = len(text.encode('utf-8'))
    assert record_info['key'] == key
    assert record_info['content_type'] == 'text/plain; charset=utf-8'
    assert record_info['size'] == expected_size


async def test_binary_data_persistence(kvs_client: FileSystemKeyValueStoreClient) -> None:
    """Check that raw bytes are written to disk unchanged and read back as an octet-stream record."""
    key = 'test-binary'
    payload = b'\x00\x01\x02\x03\x04'
    await kvs_client.set_value(key=key, value=payload)

    # The value file must exist and contain exactly the bytes that were stored.
    value_file = kvs_client.path_to_kvs / key
    assert value_file.exists()
    assert value_file.read_bytes() == payload

    # Reading through the client returns the same bytes and the binary content type.
    stored = await kvs_client.get_value(key=key)
    assert stored is not None
    assert stored.value == payload
    assert stored.content_type == 'application/octet-stream'


async def test_json_serialization_to_file(kvs_client: FileSystemKeyValueStoreClient) -> None:
    """Check that a dict value is written to its value file as JSON that decodes back to the same dict."""
    key = 'test-json'
    document = {'name': 'John', 'age': 30, 'items': [1, 2, 3]}
    await kvs_client.set_value(key=key, value=document)

    # Decode the value file directly, bypassing the client.
    value_file = kvs_client.path_to_kvs / key
    with value_file.open() as stream:
        decoded = json.load(stream)

    assert decoded == document


async def test_file_deletion_on_value_delete(kvs_client: FileSystemKeyValueStoreClient) -> None:
    """Check that `delete_value` removes both the value file and its metadata file."""
    key = 'test-delete'
    await kvs_client.set_value(key=key, value='Delete me')

    # The two files that back a single record.
    store_dir = kvs_client.path_to_kvs
    tracked_files = (store_dir / key, store_dir / f'{key}.{METADATA_FILENAME}')

    # Both are present once the value has been written.
    assert all(path.exists() for path in tracked_files)

    await kvs_client.delete_value(key=key)

    # Neither may remain after the delete.
    assert not any(path.exists() for path in tracked_files)


async def test_drop_removes_directory(kvs_client: FileSystemKeyValueStoreClient) -> None:
    """Check that `drop` deletes the store directory from disk."""
    await kvs_client.set_value(key='test', value='test-value')

    # Capture the directory path up front so it can be checked before and after the drop.
    store_dir = kvs_client.path_to_kvs
    assert store_dir.exists()

    await kvs_client.drop()

    assert not store_dir.exists()


async def test_metadata_file_updates(kvs_client: FileSystemKeyValueStoreClient) -> None:
    """Check how a read and then a write move the store's `accessed_at` and `modified_at` timestamps."""
    # Snapshot the starting timestamps into locals right away.
    baseline = await kvs_client.get_metadata()
    created_at = baseline.created_at
    accessed_at_start = baseline.accessed_at
    modified_at_start = baseline.modified_at

    # Pause briefly so a refreshed timestamp can differ from the snapshot.
    await asyncio.sleep(0.01)

    # Read step: looking up a key (here a missing one) is expected to move only `accessed_at`.
    await kvs_client.get_value(key='nonexistent')

    after_read = await kvs_client.get_metadata()
    assert after_read.created_at == created_at
    assert after_read.accessed_at > accessed_at_start
    assert after_read.modified_at == modified_at_start

    accessed_at_after_read = after_read.accessed_at

    # Pause again before the write step.
    await asyncio.sleep(0.01)

    # Write step: storing a value is expected to move both `modified_at` and `accessed_at`.
    await kvs_client.set_value(key='test', value='test-value')

    after_write = await kvs_client.get_metadata()
    assert after_write.created_at == created_at
    assert after_write.modified_at > modified_at_start
    assert after_write.accessed_at > accessed_at_after_read


async def test_data_persistence_across_reopens(configuration: Configuration) -> None:
    """Test that data persists correctly when reopening the same KVS.

    The store is created by name, a value is written, and the store is then reopened by its ID. Both opens
    use the same `configuration` so that they resolve to the same storage directory.
    """
    storage_client = FileSystemStorageClient()

    # Create KVS and add data
    original_client = await storage_client.create_kvs_client(name='persistence-test', configuration=configuration)

    test_key = 'persistent-key'
    test_value = 'persistent-value'
    await original_client.set_value(key=test_key, value=test_value)

    kvs_id = (await original_client.get_metadata()).id

    # Reopen by ID and verify data persists. The configuration is passed explicitly here as well; without it
    # the lookup would depend on whatever the global configuration points at instead of this test's tmp path.
    reopened_client = await storage_client.create_kvs_client(
        id=kvs_id,
        configuration=configuration,
    )

    record = await reopened_client.get_value(key=test_key)
    assert record is not None
    assert record.value == test_value

    await reopened_client.drop()


================================================
FILE: tests/unit/storage_clients/_file_system/test_fs_rq_client.py
================================================
from __future__ import annotations

import asyncio
import json
from typing import TYPE_CHECKING

import pytest

from crawlee import Request, service_locator
from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from pathlib import Path

    from crawlee.storage_clients._file_system import FileSystemRequestQueueClient


@pytest.fixture
def configuration(tmp_path: Path) -> Configuration:
    """Build a `Configuration` whose storage directory is the per-test temporary path."""
    storage_root = str(tmp_path)
    return Configuration(storage_dir=storage_root)


@pytest.fixture
async def rq_client() -> AsyncGenerator[FileSystemRequestQueueClient, None]:
    """Yield a file system request queue client named 'test-request-queue' and drop it after the test."""
    fs_storage = FileSystemStorageClient()
    queue = await fs_storage.create_rq_client(name='test-request-queue')
    yield queue
    await queue.drop()


async def test_file_and_directory_creation() -> None:
    """Check that a new file system RQ has its directory and a metadata file with matching ID and name."""
    queue = await FileSystemStorageClient().create_rq_client(name='new-request-queue')

    # Both the queue directory and its metadata file must be on disk.
    assert queue.path_to_rq.exists()
    assert queue.path_to_metadata.exists()

    # The metadata file must describe the same queue that the client reports.
    with queue.path_to_metadata.open() as metadata_file:
        stored_metadata = json.load(metadata_file)

    live_metadata = await queue.get_metadata()
    assert stored_metadata['id'] == live_metadata.id
    assert stored_metadata['name'] == 'new-request-queue'

    await queue.drop()


async def test_request_file_persistence(rq_client: FileSystemRequestQueueClient) -> None:
    """Check that each added request is written to its own JSON file next to the queue metadata file."""
    urls = ('https://example.com/1', 'https://example.com/2', 'https://example.com/3')
    batch = [Request.from_url(url) for url in urls]
    await rq_client.add_batch_of_requests(batch)

    # The queue directory should hold one JSON file per request plus the metadata file.
    json_files = list(rq_client.path_to_rq.glob('*.json'))
    assert len(json_files) == 4
    assert rq_client.path_to_metadata in json_files

    # Everything except the metadata file is a serialized request.
    serialized_requests = [path for path in json_files if path != rq_client.path_to_metadata]
    assert len(serialized_requests) == 3

    for request_path in serialized_requests:
        with request_path.open() as stream:
            payload = json.load(stream)
        assert 'url' in payload
        assert payload['url'].startswith('https://example.com/')


async def test_opening_rq_does_not_have_side_effect_on_service_locator(configuration: Configuration) -> None:
    """Opening a request queue client must not set the storage client in the global service locator."""
    await FileSystemStorageClient().create_rq_client(name='test_request_queue', configuration=configuration)

    # Set some specific storage client in the service locator. There should be no `ServiceConflictError`;
    # the test has no explicit assert and passes simply by this call completing without raising.
    service_locator.set_storage_client(MemoryStorageClient())


async def test_drop_removes_directory(rq_client: FileSystemRequestQueueClient) -> None:
    """Check that `drop` deletes the request queue directory from disk."""
    single_request = Request.from_url('https://example.com')
    await rq_client.add_batch_of_requests([single_request])

    # Capture the directory path up front so it can be checked before and after the drop.
    queue_dir = rq_client.path_to_rq
    assert queue_dir.exists()

    await rq_client.drop()

    assert not queue_dir.exists()


async def test_metadata_file_updates(rq_client: FileSystemRequestQueueClient) -> None:
    """Check how a read and then a write move the queue's timestamps and the on-disk request count."""
    # Snapshot the starting timestamps into locals right away.
    baseline = await rq_client.get_metadata()
    created_at = baseline.created_at
    accessed_at_start = baseline.accessed_at
    modified_at_start = baseline.modified_at

    # Pause briefly so a refreshed timestamp can differ from the snapshot.
    await asyncio.sleep(0.01)

    # Read step: an emptiness check is expected to move only `accessed_at`.
    await rq_client.is_empty()

    after_read = await rq_client.get_metadata()
    assert after_read.created_at == created_at
    assert after_read.accessed_at > accessed_at_start
    assert after_read.modified_at == modified_at_start

    accessed_at_after_read = after_read.accessed_at

    # Pause again before the write step.
    await asyncio.sleep(0.01)

    # Write step: adding a request is expected to move both `modified_at` and `accessed_at`.
    new_request = Request.from_url('https://example.com')
    await rq_client.add_batch_of_requests([new_request])

    after_write = await rq_client.get_metadata()
    assert after_write.created_at == created_at
    assert after_write.modified_at > modified_at_start
    assert after_write.accessed_at > accessed_at_after_read

    # The metadata file on disk must show the added request in its total count.
    with rq_client.path_to_metadata.open() as metadata_file:
        stored_metadata = json.load(metadata_file)

    assert stored_metadata['total_request_count'] == 1


async def test_data_persistence_across_reopens() -> None:
    """Check that requests added to a named RQ can still be fetched after reopening the queue by ID."""
    fs_storage = FileSystemStorageClient()
    expected_urls = {'https://example.com/1', 'https://example.com/2'}

    # First handle: create the queue by name and enqueue two requests.
    first_handle = await fs_storage.create_rq_client(name='persistence-test')
    await first_handle.add_batch_of_requests(
        [
            Request.from_url('https://example.com/1'),
            Request.from_url('https://example.com/2'),
        ]
    )

    # Remember the ID so the queue can be opened again without using its name.
    first_metadata = await first_handle.get_metadata()

    # Second handle: open the same queue through its ID.
    second_handle = await fs_storage.create_rq_client(id=first_metadata.id)

    reopened_metadata = await second_handle.get_metadata()
    assert reopened_metadata.total_request_count == 2

    # Both requests must still be fetchable; only the set of URLs is compared, not their order.
    first_fetched = await second_handle.fetch_next_request()
    second_fetched = await second_handle.fetch_next_request()

    assert first_fetched is not None
    assert second_fetched is not None
    assert {first_fetched.url, second_fetched.url} == expected_urls

    await second_handle.drop()


async def test_get_request_does_not_mark_in_progress(rq_client: FileSystemRequestQueueClient) -> None:
    """Check that looking a request up with `get_request` still leaves it available to `fetch_next_request`."""
    queued = Request.from_url('https://example.com/blocked')
    await rq_client.add_batch_of_requests([queued])
    expected_key = queued.unique_key

    # Direct lookup by unique key returns the request.
    looked_up = await rq_client.get_request(expected_key)
    assert looked_up is not None
    assert looked_up.unique_key == expected_key

    # The same request must still be handed out by the regular fetch.
    dequeued = await rq_client.fetch_next_request()
    assert dequeued is not None
    assert dequeued.unique_key == expected_key


================================================
FILE: tests/unit/storage_clients/_memory/test_memory_dataset_client.py
================================================
from __future__ import annotations

import asyncio
from typing import TYPE_CHECKING

import pytest

from crawlee.storage_clients import MemoryStorageClient

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from crawlee.storage_clients._memory import MemoryDatasetClient


@pytest.fixture
async def dataset_client() -> AsyncGenerator[MemoryDatasetClient, None]:
    """Yield a memory dataset client named 'test-dataset' and drop it after the test."""
    memory_storage = MemoryStorageClient()
    dataset = await memory_storage.create_dataset_client(name='test-dataset')
    yield dataset
    await dataset.drop()


async def test_memory_specific_purge_behavior() -> None:
    """Check that a memory dataset opened again under the same name starts out empty."""
    dataset_name = 'test-purge-dataset'

    # First client: push one item and confirm it is visible.
    first_client = await MemoryStorageClient().create_dataset_client(name=dataset_name)
    await first_client.push_data({'item': 'initial data'})

    first_listing = await first_client.get_data()
    assert len(first_listing.items) == 1

    # Second client: created through a new `MemoryStorageClient` using the same dataset name.
    second_client = await MemoryStorageClient().create_dataset_client(name=dataset_name)

    # The earlier item must not be visible through the second client.
    second_listing = await second_client.get_data()
    assert len(second_listing.items) == 0


async def test_memory_metadata_updates(dataset_client: MemoryDatasetClient) -> None:
    """Check how a read and then a write move the memory dataset's `accessed_at` and `modified_at`."""
    # Snapshot the starting timestamps into locals right away.
    baseline = await dataset_client.get_metadata()
    created_at = baseline.created_at
    accessed_at_start = baseline.accessed_at
    modified_at_start = baseline.modified_at

    # Pause briefly so a refreshed timestamp can differ from the snapshot.
    await asyncio.sleep(0.01)

    # Read step: listing data is expected to move only `accessed_at`.
    await dataset_client.get_data()

    after_read = await dataset_client.get_metadata()
    assert after_read.created_at == created_at
    assert after_read.accessed_at > accessed_at_start
    assert after_read.modified_at == modified_at_start

    accessed_at_after_read = after_read.accessed_at

    # Pause again before the write step.
    await asyncio.sleep(0.01)

    # Write step: pushing an item is expected to move both `modified_at` and `accessed_at`.
    await dataset_client.push_data({'new': 'item'})

    after_write = await dataset_client.get_metadata()
    assert after_write.created_at == created_at
    assert after_write.modified_at > modified_at_start
    assert after_write.accessed_at > accessed_at_after_read


================================================
FILE: tests/unit/storage_clients/_memory/test_memory_kvs_client.py
================================================
from __future__ import annotations

import asyncio
from typing import TYPE_CHECKING

import pytest

from crawlee.storage_clients import MemoryStorageClient

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from crawlee.storage_clients._memory import MemoryKeyValueStoreClient


@pytest.fixture
async def kvs_client() -> AsyncGenerator[MemoryKeyValueStoreClient, None]:
    """Yield a memory key-value store client named 'test-kvs' and drop it after the test."""
    memory_storage = MemoryStorageClient()
    store = await memory_storage.create_kvs_client(name='test-kvs')
    yield store
    await store.drop()


async def test_memory_specific_purge_behavior() -> None:
    """Check that a memory KVS opened again under the same name no longer exposes earlier values."""
    store_name = 'test-purge-kvs'

    # First client: store a value and confirm it reads back.
    first_client = await MemoryStorageClient().create_kvs_client(name=store_name)
    await first_client.set_value(key='test-key', value='initial value')

    stored = await first_client.get_value(key='test-key')
    assert stored is not None
    assert stored.value == 'initial value'

    # Second client: created through a new `MemoryStorageClient` using the same store name.
    second_client = await MemoryStorageClient().create_kvs_client(name=store_name)

    # The earlier value must not be visible through the second client.
    missing = await second_client.get_value(key='test-key')
    assert missing is None


async def test_memory_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> None:
    """Check how a read and then a write move the memory KVS's `accessed_at` and `modified_at`."""
    # Snapshot the starting timestamps into locals right away.
    baseline = await kvs_client.get_metadata()
    created_at = baseline.created_at
    accessed_at_start = baseline.accessed_at
    modified_at_start = baseline.modified_at

    # Pause briefly so a refreshed timestamp can differ from the snapshot.
    await asyncio.sleep(0.01)

    # Read step: looking up a key (here a missing one) is expected to move only `accessed_at`.
    await kvs_client.get_value(key='nonexistent')

    after_read = await kvs_client.get_metadata()
    assert after_read.created_at == created_at
    assert after_read.accessed_at > accessed_at_start
    assert after_read.modified_at == modified_at_start

    accessed_at_after_read = after_read.accessed_at

    # Pause again before the write step.
    await asyncio.sleep(0.01)

    # Write step: storing a value is expected to move both `modified_at` and `accessed_at`.
    await kvs_client.set_value(key='test', value='test-value')

    after_write = await kvs_client.get_metadata()
    assert after_write.created_at == created_at
    assert after_write.modified_at > modified_at_start
    assert after_write.accessed_at > accessed_at_after_read


================================================
FILE: tests/unit/storage_clients/_memory/test_memory_rq_client.py
================================================
from __future__ import annotations

import asyncio
from typing import TYPE_CHECKING

import pytest

from crawlee import Request
from crawlee.storage_clients import MemoryStorageClient

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from crawlee.storage_clients._memory import MemoryRequestQueueClient


@pytest.fixture
async def rq_client() -> AsyncGenerator[MemoryRequestQueueClient, None]:
    """Yield a memory request queue client named 'test-rq' and drop it after the test."""
    memory_storage = MemoryStorageClient()
    queue = await memory_storage.create_rq_client(name='test-rq')
    yield queue
    await queue.drop()


async def test_memory_specific_purge_behavior() -> None:
    """Check that a memory RQ opened again under the same name starts out empty."""
    queue_name = 'test-purge-rq'

    # First client: enqueue one request and confirm the queue reports it.
    first_client = await MemoryStorageClient().create_rq_client(name=queue_name)
    initial_request = Request.from_url(url='https://example.com/initial')
    await first_client.add_batch_of_requests([initial_request])

    first_is_empty = await first_client.is_empty()
    assert first_is_empty is False

    # Second client: created through a new `MemoryStorageClient` using the same queue name.
    second_client = await MemoryStorageClient().create_rq_client(name=queue_name)

    # The earlier request must not be visible through the second client.
    second_is_empty = await second_client.is_empty()
    assert second_is_empty is True


async def test_memory_metadata_updates(rq_client: MemoryRequestQueueClient) -> None:
    """Check how a read and then a write move the memory RQ's `accessed_at` and `modified_at`."""
    # Snapshot the starting timestamps into locals right away.
    baseline = await rq_client.get_metadata()
    created_at = baseline.created_at
    accessed_at_start = baseline.accessed_at
    modified_at_start = baseline.modified_at

    # Pause briefly so a refreshed timestamp can differ from the snapshot.
    await asyncio.sleep(0.01)

    # Read step: an emptiness check is expected to move only `accessed_at`.
    await rq_client.is_empty()

    after_read = await rq_client.get_metadata()
    assert after_read.created_at == created_at
    assert after_read.accessed_at > accessed_at_start
    assert after_read.modified_at == modified_at_start

    accessed_at_after_read = after_read.accessed_at

    # Pause again before the write step.
    await asyncio.sleep(0.01)

    # Write step: adding a request is expected to move both `modified_at` and `accessed_at`.
    new_request = Request.from_url('https://example.com')
    await rq_client.add_batch_of_requests([new_request])

    after_write = await rq_client.get_metadata()
    assert after_write.created_at == created_at
    assert after_write.modified_at > modified_at_start
    assert after_write.accessed_at > accessed_at_after_read


================================================
FILE: tests/unit/storage_clients/_redis/test_redis_dataset_client.py
================================================
from __future__ import annotations

import asyncio
from typing import TYPE_CHECKING

import pytest

from crawlee.storage_clients import RedisStorageClient
from crawlee.storage_clients._redis._utils import await_redis_response

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from fakeredis import FakeAsyncRedis

    from crawlee.storage_clients._redis import RedisDatasetClient


@pytest.fixture
async def dataset_client(
    redis_client: FakeAsyncRedis,
    suppress_user_warning: None,  # noqa: ARG001
) -> AsyncGenerator[RedisDatasetClient, None]:
    """Yield a Redis dataset client named 'test_dataset' and drop it after the test."""
    redis_storage = RedisStorageClient(redis=redis_client)
    dataset = await redis_storage.create_dataset_client(name='test_dataset')
    yield dataset
    await dataset.drop()


async def test_base_keys_creation(dataset_client: RedisDatasetClient) -> None:
    """Check that a new Redis dataset has its name/ID index entries, an empty item list and a metadata record."""

    def as_text(raw: object) -> object:
        # A reply may arrive as bytes or str; decode bytes so the comparison is always against str.
        return raw.decode() if isinstance(raw, bytes) else raw

    metadata = await dataset_client.get_metadata()

    # ID -> name index entry.
    stored_name = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id))
    assert stored_name is not None
    assert as_text(stored_name) == 'test_dataset'

    # name -> ID index entry.
    stored_id = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset'))
    assert stored_id is not None
    assert as_text(stored_id) == metadata.id

    # The items document exists and is empty.
    stored_items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$'))
    assert stored_items is not None
    assert len(stored_items) == 0

    # The metadata document is a JSON object carrying the same ID.
    stored_metadata = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:metadata'))
    assert isinstance(stored_metadata, dict)
    assert stored_metadata['id'] == metadata.id


async def test_record_and_content_verification(dataset_client: RedisDatasetClient) -> None:
    """Check that pushed items land in the Redis items document and that metadata reflects the item count."""
    single_item = {'key': 'value', 'number': 42}
    await dataset_client.push_data(single_item)

    # Metadata counts the item and carries all three timestamps.
    metadata = await dataset_client.get_metadata()
    assert metadata.item_count == 1
    for timestamp in (metadata.created_at, metadata.modified_at, metadata.accessed_at):
        assert timestamp is not None

    # The items document in Redis holds exactly the pushed item.
    stored = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$'))
    assert stored is not None
    assert len(stored) == 1
    assert stored[0] == single_item

    # Pushing a list of three more items brings the stored total to four.
    batch = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}]
    await dataset_client.push_data(batch)

    stored = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$'))
    assert stored is not None
    assert len(stored) == 4


async def test_drop_removes_records(dataset_client: RedisDatasetClient) -> None:
    """Check that `drop` removes the dataset's index entries and its items document from Redis."""

    def as_text(raw: object) -> object:
        # A reply may arrive as bytes or str; decode bytes so the comparison is always against str.
        return raw.decode() if isinstance(raw, bytes) else raw

    await dataset_client.push_data({'test': 'data'})

    # Before the drop: both index entries and the one pushed item are present.
    metadata = await dataset_client.get_metadata()
    name_before = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id))
    id_before = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset'))
    items_before = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$'))

    assert name_before is not None
    assert as_text(name_before) == 'test_dataset'
    assert id_before is not None
    assert as_text(id_before) == metadata.id
    assert items_before is not None
    assert len(items_before) == 1

    await dataset_client.drop()

    # After the drop: the same three lookups all come back empty.
    name_after = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id))
    id_after = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset'))
    items_after = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$'))

    assert name_after is None
    assert id_after is None
    assert items_after is None


async def test_metadata_record_updates(dataset_client: RedisDatasetClient) -> None:
    """Check how a read and then a write move the Redis dataset's `accessed_at` and `modified_at`."""
    # Snapshot the starting timestamps into locals right away.
    baseline = await dataset_client.get_metadata()
    created_at = baseline.created_at
    accessed_at_start = baseline.accessed_at
    modified_at_start = baseline.modified_at

    # Pause briefly so a refreshed timestamp can differ from the snapshot.
    await asyncio.sleep(0.01)

    # Read step: listing data is expected to move only `accessed_at`.
    await dataset_client.get_data()

    after_read = await dataset_client.get_metadata()
    assert after_read.created_at == created_at
    assert after_read.accessed_at > accessed_at_start
    assert after_read.modified_at == modified_at_start

    accessed_at_after_read = after_read.accessed_at

    # Pause again before the write step.
    await asyncio.sleep(0.01)

    # Write step: pushing an item is expected to move both `modified_at` and `accessed_at`.
    await dataset_client.push_data({'new': 'item'})

    after_write = await dataset_client.get_metadata()
    assert after_write.created_at == created_at
    assert after_write.modified_at > modified_at_start
    assert after_write.accessed_at > accessed_at_after_read


================================================
FILE: tests/unit/storage_clients/_redis/test_redis_kvs_client.py
================================================
from __future__ import annotations

import asyncio
import json
from typing import TYPE_CHECKING

import pytest

from crawlee.storage_clients import RedisStorageClient
from crawlee.storage_clients._redis._utils import await_redis_response

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from fakeredis import FakeAsyncRedis

    from crawlee.storage_clients._redis import RedisKeyValueStoreClient


@pytest.fixture
async def kvs_client(
    redis_client: FakeAsyncRedis,
    suppress_user_warning: None,  # noqa: ARG001
) -> AsyncGenerator[RedisKeyValueStoreClient, None]:
    """Yield a Redis key-value store client named 'test_kvs' and drop it after the test."""
    redis_storage = RedisStorageClient(redis=redis_client)
    store = await redis_storage.create_kvs_client(name='test_kvs')
    yield store
    await store.drop()


async def test_base_keys_creation(kvs_client: RedisKeyValueStoreClient) -> None:
    """Check that a new Redis KVS has its name/ID index entries and a metadata record."""

    def as_text(raw: object) -> object:
        # A reply may arrive as bytes or str; decode bytes so the comparison is always against str.
        return raw.decode() if isinstance(raw, bytes) else raw

    metadata = await kvs_client.get_metadata()

    # ID -> name index entry.
    stored_name = await await_redis_response(kvs_client.redis.hget('key_value_stores:id_to_name', metadata.id))
    assert stored_name is not None
    assert as_text(stored_name) == 'test_kvs'

    # name -> ID index entry.
    stored_id = await await_redis_response(kvs_client.redis.hget('key_value_stores:name_to_id', 'test_kvs'))
    assert stored_id is not None
    assert as_text(stored_id) == metadata.id

    # The metadata document is a JSON object carrying the same ID.
    stored_metadata = await await_redis_response(kvs_client.redis.json().get('key_value_stores:test_kvs:metadata'))
    assert isinstance(stored_metadata, dict)
    assert stored_metadata['id'] == metadata.id


async def test_value_record_creation_and_content(kvs_client: RedisKeyValueStoreClient) -> None:
    """Check that a stored string appears in the items hash and the per-record metadata hash."""
    key, text = 'test-key', 'Hello, world!'
    items_hash = 'key_value_stores:test_kvs:items'
    metadata_hash = 'key_value_stores:test_kvs:metadata_items'

    await kvs_client.set_value(key=key, value=text)

    # Both hashes must contain a field for the key.
    value_present = await await_redis_response(kvs_client.redis.hexists(items_hash, key))
    metadata_present = await await_redis_response(kvs_client.redis.hexists(metadata_hash, key))
    assert value_present is True
    assert metadata_present is True

    # The stored value equals the original text (decoded first when it comes back as bytes).
    raw_value = await await_redis_response(kvs_client.redis.hget(items_hash, key))
    if isinstance(raw_value, bytes):
        raw_value = raw_value.decode()
    assert raw_value == text

    # The per-record metadata is a JSON string describing the record.
    raw_metadata = await await_redis_response(kvs_client.redis.hget(metadata_hash, key))
    assert raw_metadata is not None
    assert isinstance(raw_metadata, (str, bytes))
    record_info = json.loads(raw_metadata)

    expected_size = len(text.encode('utf-8'))
    assert record_info['key'] == key
    assert record_info['content_type'] == 'text/plain; charset=utf-8'
    assert record_info['size'] == expected_size

    # Reading through the client returns the same text.
    fetched = await kvs_client.get_value(key=key)
    assert fetched is not None
    assert fetched.value == text


async def test_binary_data_persistence(kvs_client: RedisKeyValueStoreClient) -> None:
    """Check that raw bytes are stored in Redis unchanged and read back as an octet-stream record."""
    key = 'test-binary'
    payload = b'\x00\x01\x02\x03\x04'
    items_hash = 'key_value_stores:test_kvs:items'
    metadata_hash = 'key_value_stores:test_kvs:metadata_items'

    await kvs_client.set_value(key=key, value=payload)

    # Both hashes must contain a field for the key.
    value_present = await await_redis_response(kvs_client.redis.hexists(items_hash, key))
    metadata_present = await await_redis_response(kvs_client.redis.hexists(metadata_hash, key))
    assert value_present is True
    assert metadata_present is True

    # The stored field holds exactly the bytes that were written.
    raw_value = await await_redis_response(kvs_client.redis.hget(items_hash, key))
    assert raw_value == payload

    # Reading through the client returns the same bytes and the binary content type.
    fetched = await kvs_client.get_value(key=key)
    assert fetched is not None
    assert fetched.value == payload
    assert fetched.content_type == 'application/octet-stream'


async def test_json_serialization_to_record(kvs_client: RedisKeyValueStoreClient) -> None:
    """Check that a dict value is stored in Redis as JSON that decodes back to the same dict."""
    key = 'test-json'
    payload = {'name': 'John', 'age': 30, 'items': [1, 2, 3]}
    await kvs_client.set_value(key=key, value=payload)

    # Read the raw entry straight from the items hash and decode it as JSON.
    raw = await await_redis_response(kvs_client.redis.hget('key_value_stores:test_kvs:items', key))
    assert raw is not None
    assert isinstance(raw, (str, bytes))
    decoded = json.loads(raw)
    assert decoded == payload


async def test_records_deletion_on_value_delete(kvs_client: RedisKeyValueStoreClient) -> None:
    """Check that `delete_value` removes both the value and its metadata entry from Redis."""
    key = 'test-delete'
    # Value hash first, per-record metadata hash second.
    hash_names = ('key_value_stores:test_kvs:items', 'key_value_stores:test_kvs:metadata_items')

    await kvs_client.set_value(key=key, value='Delete me')

    # After the write, the key is present in both hashes.
    present = [await await_redis_response(kvs_client.redis.hexists(name, key)) for name in hash_names]
    assert present[0] is True
    assert present[1] is True

    await kvs_client.delete_value(key=key)

    # After the delete, the key is gone from both hashes.
    present = [await await_redis_response(kvs_client.redis.hexists(name, key)) for name in hash_names]
    assert present[0] is False
    assert present[1] is False


async def test_drop_removes_keys(kvs_client: RedisKeyValueStoreClient) -> None:
    """Check that `drop` removes the store's index entries and record hashes from Redis."""
    await kvs_client.set_value(key='test', value='test-value')
    store_metadata = await kvs_client.get_metadata()

    async def read_store_state() -> tuple[object, object, object, object]:
        # Read the id->name entry, the name->id entry, the items hash and the items metadata hash, in that order.
        redis = kvs_client.redis
        stored_name = await await_redis_response(redis.hget('key_value_stores:id_to_name', store_metadata.id))
        stored_id = await await_redis_response(redis.hget('key_value_stores:name_to_id', 'test_kvs'))
        stored_items = await await_redis_response(redis.hgetall('key_value_stores:test_kvs:items'))
        stored_items_metadata = await await_redis_response(redis.hgetall('key_value_stores:test_kvs:metadata_items'))
        return stored_name, stored_id, stored_items, stored_items_metadata

    # Before the drop every key is populated.
    name, kvs_id, items, metadata_items = await read_store_state()

    assert name is not None
    assert (name.decode() if isinstance(name, bytes) else name) == 'test_kvs'
    assert kvs_id is not None
    assert (kvs_id.decode() if isinstance(kvs_id, bytes) else kvs_id) == store_metadata.id
    assert items is not None
    assert items != {}
    assert metadata_items is not None
    assert metadata_items != {}

    await kvs_client.drop()

    # After the drop the index entries are missing and both hashes read back empty.
    name, kvs_id, items, metadata_items = await read_store_state()
    assert name is None
    assert kvs_id is None
    assert items == {}
    assert metadata_items == {}


async def test_metadata_record_updates(kvs_client: RedisKeyValueStoreClient) -> None:
    """Check that reads bump `accessed_at` and writes bump `modified_at` in the store metadata."""
    # Copy the baseline timestamps into locals so later metadata reads cannot alias them.
    baseline = await kvs_client.get_metadata()
    created_0, accessed_0, modified_0 = baseline.created_at, baseline.accessed_at, baseline.modified_at

    # Give the clock a chance to advance before the read.
    await asyncio.sleep(0.01)
    await kvs_client.get_value(key='nonexistent')

    # A read only moves the accessed timestamp.
    after_read = await kvs_client.get_metadata()
    assert after_read.created_at == created_0
    assert after_read.accessed_at > accessed_0
    assert after_read.modified_at == modified_0

    accessed_1 = after_read.accessed_at

    # Give the clock a chance to advance before the write.
    await asyncio.sleep(0.01)
    await kvs_client.set_value(key='test', value='test-value')

    # A write moves both the modified and the accessed timestamps.
    after_write = await kvs_client.get_metadata()
    assert after_write.created_at == created_0
    assert after_write.modified_at > modified_0
    assert after_write.accessed_at > accessed_1


================================================
FILE: tests/unit/storage_clients/_redis/test_redis_rq_client.py
================================================
from __future__ import annotations

import asyncio
import json
from typing import TYPE_CHECKING

import pytest

from crawlee import Request
from crawlee.storage_clients import RedisStorageClient
from crawlee.storage_clients._redis._utils import await_redis_response

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from fakeredis import FakeAsyncRedis

    from crawlee.storage_clients._redis import RedisRequestQueueClient


@pytest.fixture(params=['default', 'bloom'])
async def rq_client(
    redis_client: FakeAsyncRedis,
    request: pytest.FixtureRequest,
    suppress_user_warning: None,  # noqa: ARG001
) -> AsyncGenerator[RedisRequestQueueClient, None]:
    """Yield a Redis request queue client, once per dedup strategy, and drop it afterwards."""
    storage_client = RedisStorageClient(redis=redis_client, queue_dedup_strategy=request.param)
    queue_client = await storage_client.create_rq_client(name='test_request_queue')
    yield queue_client
    await queue_client.drop()


async def test_base_keys_creation(rq_client: RedisRequestQueueClient) -> None:
    """Check that creating the request queue client populates the expected Redis keys."""
    queue_metadata = await rq_client.get_metadata()
    redis = rq_client.redis

    # The id -> name index maps the queue id to its name.
    stored_name = await await_redis_response(redis.hget('request_queues:id_to_name', queue_metadata.id))
    assert stored_name is not None
    name_text = stored_name.decode() if isinstance(stored_name, bytes) else stored_name
    assert name_text == 'test_request_queue'

    # The name -> id index maps the queue name back to its id.
    stored_id = await await_redis_response(redis.hget('request_queues:name_to_id', 'test_request_queue'))
    assert stored_id is not None
    id_text = stored_id.decode() if isinstance(stored_id, bytes) else stored_id
    assert id_text == queue_metadata.id

    # With the bloom strategy both bloom filter keys exist right after creation.
    if rq_client._dedup_strategy == 'bloom':
        added_filter_count = await await_redis_response(
            redis.exists('request_queues:test_request_queue:added_bloom_filter')
        )
        assert added_filter_count == 1

        handled_filter_count = await await_redis_response(
            redis.exists('request_queues:test_request_queue:handled_bloom_filter')
        )
        assert handled_filter_count == 1

    # The metadata document is stored as JSON and carries the queue id.
    metadata_document = await await_redis_response(redis.json().get('request_queues:test_request_queue:metadata'))

    assert isinstance(metadata_document, dict)
    assert metadata_document['id'] == queue_metadata.id


async def test_request_records_persistence(rq_client: RedisRequestQueueClient) -> None:
    """Check that a batch of added requests lands in the Redis queue list and the data hash."""
    urls = ('https://example.com/1', 'https://example.com/2', 'https://example.com/3')
    await rq_client.add_batch_of_requests([Request.from_url(url) for url in urls])

    # Pop up to ten entries from the left of the queue list; element 1 of the reply holds the popped keys.
    popped = await await_redis_response(
        rq_client.redis.lmpop(1, 'request_queues:test_request_queue:queue', direction='left', count=10)
    )
    assert popped is not None
    assert isinstance(popped, list)
    queued_keys = popped[1]
    assert isinstance(queued_keys, list)
    assert len(queued_keys) == 3

    # Every queued key must have a JSON payload in the data hash.
    stored_requests = await await_redis_response(
        rq_client.redis.hgetall('request_queues:test_request_queue:data')
    )
    assert isinstance(stored_requests, dict)

    for queued_key in queued_keys:
        payload = json.loads(stored_requests[queued_key])  # ty: ignore[invalid-argument-type]
        assert 'url' in payload
        assert payload['url'].startswith('https://example.com/')


async def test_drop_removes_records(rq_client: RedisRequestQueueClient) -> None:
    """Test that drop removes all request records from Redis.

    The test adds one request, checks that the index entries, the queue list, the data hash, the
    metadata document and the strategy-specific dedup keys exist, drops the queue, and then checks
    that none of those keys remain.
    """
    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])

    rq_queue = 'request_queues:test_request_queue:queue'
    rq_data = 'request_queues:test_request_queue:data'
    added_bf = 'request_queues:test_request_queue:added_bloom_filter'
    handled_bf = 'request_queues:test_request_queue:handled_bloom_filter'
    pending_set = 'request_queues:test_request_queue:pending_set'
    handled_set = 'request_queues:test_request_queue:handled_set'
    metadata_key = 'request_queues:test_request_queue:metadata'

    metadata = await rq_client.get_metadata()
    name = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id))

    assert name is not None
    assert (name.decode() if isinstance(name, bytes) else name) == 'test_request_queue'

    rq_id = await await_redis_response(rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue'))
    assert rq_id is not None
    # Compare the stored id with the queue id; asserting the bare value would only check truthiness.
    assert (rq_id.decode() if isinstance(rq_id, bytes) else rq_id) == metadata.id

    rq_queue_exists = await await_redis_response(rq_client.redis.exists(rq_queue))
    rq_data_exists = await await_redis_response(rq_client.redis.exists(rq_data))
    metadata_exists = await await_redis_response(rq_client.redis.exists(metadata_key))
    assert rq_queue_exists == 1
    assert rq_data_exists == 1
    assert metadata_exists == 1

    # The dedup bookkeeping keys depend on the strategy the fixture was parametrized with.
    if rq_client._dedup_strategy == 'bloom':
        added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf))
        handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf))
        assert added_bf_exists == 1
        assert handled_bf_exists == 1
    elif rq_client._dedup_strategy == 'default':
        pending_set_exists = await await_redis_response(rq_client.redis.exists(pending_set))
        handled_set_exists = await await_redis_response(rq_client.redis.exists(handled_set))
        assert pending_set_exists == 1
        # No requests marked as handled
        assert handled_set_exists == 0

    # Drop the request queue
    await rq_client.drop()

    # Verify removal of all records
    name_after_drop = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id))
    rq_id_after_drop = await await_redis_response(
        rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue')
    )
    rq_queue_exists = await await_redis_response(rq_client.redis.exists(rq_queue))
    rq_data_exists = await await_redis_response(rq_client.redis.exists(rq_data))
    metadata_exists = await await_redis_response(rq_client.redis.exists(metadata_key))
    assert name_after_drop is None
    assert rq_id_after_drop is None
    assert rq_queue_exists == 0
    assert rq_data_exists == 0
    assert metadata_exists == 0

    if rq_client._dedup_strategy == 'bloom':
        added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf))
        handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf))
        assert added_bf_exists == 0
        assert handled_bf_exists == 0
    elif rq_client._dedup_strategy == 'default':
        pending_set_exists = await await_redis_response(rq_client.redis.exists(pending_set))
        handled_set_exists = await await_redis_response(rq_client.redis.exists(handled_set))
        assert pending_set_exists == 0
        assert handled_set_exists == 0


async def test_metadata_file_updates(rq_client: RedisRequestQueueClient) -> None:
    """Check that queue reads bump `accessed_at` and queue writes bump `modified_at`."""
    # Copy the baseline timestamps into locals so later metadata reads cannot alias them.
    baseline = await rq_client.get_metadata()
    created_0, accessed_0, modified_0 = baseline.created_at, baseline.accessed_at, baseline.modified_at

    # Give the clock a chance to advance before the read.
    await asyncio.sleep(0.01)
    await rq_client.is_empty()

    # A read only moves the accessed timestamp.
    after_read = await rq_client.get_metadata()
    assert after_read.created_at == created_0
    assert after_read.accessed_at > accessed_0
    assert after_read.modified_at == modified_0

    accessed_1 = after_read.accessed_at

    # Give the clock a chance to advance before the write.
    await asyncio.sleep(0.01)
    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])

    # A write moves both the modified and the accessed timestamps.
    after_write = await rq_client.get_metadata()
    assert after_write.created_at == created_0
    assert after_write.modified_at > modified_0
    assert after_write.accessed_at > accessed_1


async def test_get_request(rq_client: RedisRequestQueueClient) -> None:
    """Check that `get_request` returns stored requests by unique key and None for unknown keys."""
    urls = ('https://example.com/1', 'https://example.com/2', 'https://example.com/3')
    submitted = [Request.from_url(url) for url in urls]

    batch_result = await rq_client.add_batch_of_requests(submitted)
    assert len(batch_result.processed_requests) == 3

    # Each submitted request can be looked up again through its unique key.
    for original in submitted:
        found = await rq_client.get_request(original.unique_key)
        assert found is not None
        assert found.unique_key == original.unique_key
        assert found.url == original.url

    # A key that was never added resolves to None.
    missing = await rq_client.get_request('non-existent-id')
    assert missing is None


async def test_deduplication(rq_client: RedisRequestQueueClient) -> None:
    """Check that a request whose URL repeats within a batch is only queued once."""
    first = Request.from_url('https://example.com/1')
    duplicate = Request.from_url('https://example.com/1')
    last = Request.from_url('https://example.com/3')

    await rq_client.add_batch_of_requests([first, duplicate, last])

    # Only the two distinct requests are counted.
    queue_metadata = await rq_client.get_metadata()
    assert queue_metadata.pending_request_count == 2
    assert queue_metadata.total_request_count == 2

    # The queue hands out the first and the third request, in that order, skipping the duplicate.
    for expected in (first, last):
        fetched = await rq_client.fetch_next_request()
        assert fetched is not None
        assert fetched == expected

    # Nothing is left after the two unique requests.
    leftover = await rq_client.fetch_next_request()
    assert leftover is None


================================================
FILE: tests/unit/storage_clients/_sql/test_sql_dataset_client.py
================================================
from __future__ import annotations

import asyncio
from typing import TYPE_CHECKING

import pytest
from sqlalchemy import inspect, select
from sqlalchemy.ext.asyncio import create_async_engine

from crawlee.configuration import Configuration
from crawlee.storage_clients import SqlStorageClient
from crawlee.storage_clients._sql._db_models import DatasetItemDb, DatasetMetadataDb

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from pathlib import Path

    from sqlalchemy import Connection

    from crawlee.storage_clients._sql import SqlDatasetClient


@pytest.fixture
def configuration(tmp_path: Path) -> Configuration:
    """Build a configuration whose storage directory is the per-test temporary path."""
    storage_dir = str(tmp_path)
    return Configuration(storage_dir=storage_dir)


# Synchronous helper so the inspector can be used with an asynchronous engine via `conn.run_sync`.
def get_tables(sync_conn: Connection) -> list[str]:
    """Return the table names reported by the inspector for the given connection."""
    return inspect(sync_conn).get_table_names()


@pytest.fixture
async def dataset_client(
    configuration: Configuration,
) -> AsyncGenerator[SqlDatasetClient, None]:
    """Yield a SQL dataset client named 'test-dataset' and drop it after the test."""
    async with SqlStorageClient() as storage_client:
        sql_dataset_client = await storage_client.create_dataset_client(
            name='test-dataset', configuration=configuration
        )
        yield sql_dataset_client
        await sql_dataset_client.drop()


async def test_create_tables_with_connection_string(configuration: Configuration, tmp_path: Path) -> None:
    """Check that the dataset tables are created when the storage client gets a connection string."""
    db_file = tmp_path / 'test_table.db'
    connection_string = f'sqlite+aiosqlite:///{db_file}'

    async with SqlStorageClient(connection_string=connection_string) as storage_client:
        await storage_client.create_dataset_client(name='new-dataset', configuration=configuration)

        # Inspect the schema through the storage client's own engine.
        async with storage_client.engine.begin() as conn:
            table_names = await conn.run_sync(get_tables)
            for expected_table in ('dataset_records', 'datasets'):
                assert expected_table in table_names


async def test_create_tables_with_engine(configuration: Configuration, tmp_path: Path) -> None:
    """Check that the dataset tables are created when the storage client gets a ready-made engine."""
    db_file = tmp_path / 'test_table.db'
    preconfigured_engine = create_async_engine(f'sqlite+aiosqlite:///{db_file}', future=True, echo=False)

    async with SqlStorageClient(engine=preconfigured_engine) as storage_client:
        await storage_client.create_dataset_client(name='new-dataset', configuration=configuration)

        # Inspect the schema through the engine that was handed to the storage client.
        async with preconfigured_engine.begin() as conn:
            table_names = await conn.run_sync(get_tables)
            for expected_table in ('dataset_records', 'datasets'):
                assert expected_table in table_names


async def test_tables_and_metadata_record(configuration: Configuration) -> None:
    """Check that creating a dataset yields the expected tables and a matching metadata row."""
    dataset_name = 'new-dataset'

    async with SqlStorageClient() as storage_client:
        client = await storage_client.create_dataset_client(name=dataset_name, configuration=configuration)
        expected = await client.get_metadata()

        # Both dataset tables must exist in the schema.
        async with storage_client.engine.begin() as conn:
            table_names = await conn.run_sync(get_tables)
            for expected_table in ('dataset_records', 'datasets'):
                assert expected_table in table_names

        # The metadata row found by name matches what the client reports, with no items yet.
        async with client.get_session() as session:
            query = select(DatasetMetadataDb).where(DatasetMetadataDb.name == 'new-dataset')
            row = (await session.execute(query)).scalar_one_or_none()
            assert row is not None
            assert row.id == expected.id
            assert row.name == 'new-dataset'
            assert row.item_count == 0

        await client.drop()


async def test_record_and_content_verification(dataset_client: SqlDatasetClient) -> None:
    """Check that pushed items are stored as rows with the original content."""
    single_item = {'key': 'value', 'number': 42}
    await dataset_client.push_data(single_item)

    # The metadata reflects the single pushed item and carries all three timestamps.
    dataset_metadata = await dataset_client.get_metadata()
    assert dataset_metadata.item_count == 1
    assert dataset_metadata.created_at is not None
    assert dataset_metadata.modified_at is not None
    assert dataset_metadata.accessed_at is not None

    # Exactly one row exists for this dataset and its payload equals the pushed item.
    async with dataset_client.get_session() as session:
        query = select(DatasetItemDb).where(DatasetItemDb.dataset_id == dataset_metadata.id)
        rows = (await session.execute(query)).scalars().all()
        assert len(rows) == 1
        stored_item = rows[0].data
        assert stored_item == single_item

    # Pushing a list of three more items brings the row count to four.
    more_items = [
        {'id': 1, 'name': 'Item 1'},
        {'id': 2, 'name': 'Item 2'},
        {'id': 3, 'name': 'Item 3'},
    ]
    await dataset_client.push_data(more_items)

    async with dataset_client.get_session() as session:
        query = select(DatasetItemDb).where(DatasetItemDb.dataset_id == dataset_metadata.id)
        rows = (await session.execute(query)).scalars().all()
        assert len(rows) == 4


async def test_drop_removes_records(dataset_client: SqlDatasetClient) -> None:
    """Check that `drop` deletes the dataset's item rows and its metadata row."""
    await dataset_client.push_data({'test': 'data'})
    dataset_metadata = await dataset_client.get_metadata()

    # One item row exists before the drop.
    async with dataset_client.get_session() as session:
        query = select(DatasetItemDb).where(DatasetItemDb.dataset_id == dataset_metadata.id)
        rows = (await session.execute(query)).scalars().all()
        assert len(rows) == 1

    await dataset_client.drop()

    # After the drop neither item rows nor the metadata row remain.
    async with dataset_client.get_session() as session:
        query = select(DatasetItemDb).where(DatasetItemDb.dataset_id == dataset_metadata.id)
        rows = (await session.execute(query)).scalars().all()
        assert len(rows) == 0
        metadata_row = await session.get(DatasetMetadataDb, dataset_metadata.id)
        assert metadata_row is None


async def test_metadata_record_updates(dataset_client: SqlDatasetClient) -> None:
    """Test that metadata record is updated correctly after operations.

    A read (`get_data`) must advance only `accessed_at`; a write (`push_data`) must advance both
    `modified_at` and `accessed_at`. Finally the metadata row stored in the database must agree
    with what the client reports, including the item count after the single push.
    """
    # Record initial timestamps
    metadata = await dataset_client.get_metadata()
    initial_created = metadata.created_at
    initial_accessed = metadata.accessed_at
    initial_modified = metadata.modified_at

    # Wait a moment to ensure timestamps can change
    await asyncio.sleep(0.01)

    # Perform an operation that updates accessed_at
    await dataset_client.get_data()

    # Verify timestamps
    metadata = await dataset_client.get_metadata()
    assert metadata.created_at == initial_created
    assert metadata.accessed_at > initial_accessed
    assert metadata.modified_at == initial_modified

    accessed_after_get = metadata.accessed_at

    # Wait a moment to ensure timestamps can change
    await asyncio.sleep(0.01)

    # Perform an operation that updates modified_at
    await dataset_client.push_data({'new': 'item'})

    # Verify timestamps again
    metadata = await dataset_client.get_metadata()
    assert metadata.created_at == initial_created
    assert metadata.modified_at > initial_modified
    assert metadata.accessed_at > accessed_after_get

    # Verify metadata record is updated in db
    async with dataset_client.get_session() as session:
        orm_metadata = await session.get(DatasetMetadataDb, metadata.id)
        assert orm_metadata is not None
        # Exactly one item was pushed above; assert the stored count instead of overwriting it.
        assert orm_metadata.item_count == 1
        assert orm_metadata.created_at == initial_created
        assert orm_metadata.accessed_at == metadata.accessed_at
        assert orm_metadata.modified_at == metadata.modified_at


async def test_data_persistence_across_reopens(configuration: Configuration) -> None:
    """Check that a dataset reopened by id still exposes the item pushed through the first client."""
    async with SqlStorageClient() as storage_client:
        first_client = await storage_client.create_dataset_client(
            name='persistence-test', configuration=configuration
        )

        pushed_item = {'test_item': 'test_value', 'id': 123}
        await first_client.push_data(pushed_item)

        first_metadata = await first_client.get_metadata()
        dataset_id = first_metadata.id

        # Open a second client for the same dataset, this time addressed by id.
        second_client = await storage_client.create_dataset_client(id=dataset_id, configuration=configuration)

        page = await second_client.get_data()
        assert len(page.items) == 1
        assert page.items[0] == pushed_item

        await second_client.drop()


================================================
FILE: tests/unit/storage_clients/_sql/test_sql_kvs_client.py
================================================
from __future__ import annotations

import asyncio
import json
from typing import TYPE_CHECKING

import pytest
from sqlalchemy import inspect, select
from sqlalchemy.ext.asyncio import create_async_engine

from crawlee.configuration import Configuration
from crawlee.storage_clients import SqlStorageClient
from crawlee.storage_clients._sql._db_models import KeyValueStoreMetadataDb, KeyValueStoreRecordDb
from crawlee.storage_clients.models import KeyValueStoreMetadata

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from pathlib import Path

    from sqlalchemy import Connection

    from crawlee.storage_clients._sql import SqlKeyValueStoreClient


@pytest.fixture
def configuration(tmp_path: Path) -> Configuration:
    """Build a configuration whose storage directory is the per-test temporary path."""
    storage_dir = str(tmp_path)
    return Configuration(storage_dir=storage_dir)


@pytest.fixture
async def kvs_client(
    configuration: Configuration,
) -> AsyncGenerator[SqlKeyValueStoreClient, None]:
    """Yield a SQL key-value store client named 'test-kvs' and drop it after the test."""
    async with SqlStorageClient() as storage_client:
        sql_kvs_client = await storage_client.create_kvs_client(name='test-kvs', configuration=configuration)
        yield sql_kvs_client
        await sql_kvs_client.drop()


# Synchronous helper so the inspector can be used with an asynchronous engine via `conn.run_sync`.
def get_tables(sync_conn: Connection) -> list[str]:
    """Return the table names reported by the inspector for the given connection."""
    return inspect(sync_conn).get_table_names()


async def test_create_tables_with_connection_string(configuration: Configuration, tmp_path: Path) -> None:
    """Check that the key-value store tables are created when a connection string is supplied."""
    db_file = tmp_path / 'test_table.db'
    connection_string = f'sqlite+aiosqlite:///{db_file}'

    async with SqlStorageClient(connection_string=connection_string) as storage_client:
        await storage_client.create_kvs_client(name='new-kvs', configuration=configuration)

        # Inspect the schema through the storage client's own engine.
        async with storage_client.engine.begin() as conn:
            table_names = await conn.run_sync(get_tables)
            for expected_table in ('key_value_stores', 'key_value_store_records'):
                assert expected_table in table_names


async def test_create_tables_with_engine(configuration: Configuration, tmp_path: Path) -> None:
    """Check that the key-value store tables are created when a ready-made engine is supplied."""
    db_file = tmp_path / 'test_table.db'
    preconfigured_engine = create_async_engine(f'sqlite+aiosqlite:///{db_file}', future=True, echo=False)

    async with SqlStorageClient(engine=preconfigured_engine) as storage_client:
        await storage_client.create_kvs_client(name='new-kvs', configuration=configuration)

        # Inspect the schema through the engine that was handed to the storage client.
        async with preconfigured_engine.begin() as conn:
            table_names = await conn.run_sync(get_tables)
            for expected_table in ('key_value_stores', 'key_value_store_records'):
                assert expected_table in table_names


async def test_tables_and_metadata_record(configuration: Configuration) -> None:
    """Test that SQL key-value store creates proper tables and metadata records.

    After creating a store named 'new-kvs', both key-value store tables must exist and the metadata
    row looked up by name must validate into `KeyValueStoreMetadata` with the same id and name as
    the metadata reported by the client.
    """
    async with SqlStorageClient() as storage_client:
        client = await storage_client.create_kvs_client(
            name='new-kvs',
            configuration=configuration,
        )

        client_metadata = await client.get_metadata()

        async with storage_client.engine.begin() as conn:
            tables = await conn.run_sync(get_tables)
            assert 'key_value_stores' in tables
            assert 'key_value_store_records' in tables

        async with client.get_session() as session:
            stmt = select(KeyValueStoreMetadataDb).where(KeyValueStoreMetadataDb.name == 'new-kvs')
            result = await session.execute(stmt)
            orm_metadata = result.scalar_one_or_none()
            # Fail with a clear assertion if the row is missing, before attempting model validation.
            assert orm_metadata is not None
            metadata = KeyValueStoreMetadata.model_validate(orm_metadata)
            assert metadata.id == client_metadata.id
            assert metadata.name == 'new-kvs'

        await client.drop()


async def test_value_record_creation(kvs_client: SqlKeyValueStoreClient) -> None:
    """Check that setting a text value creates a record row with matching key, type, size and bytes."""
    key = 'test-key'
    text = 'Hello, world!'
    await kvs_client.set_value(key=key, value=text)

    async with kvs_client.get_session() as session:
        query = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == key)
        row = (await session.execute(query)).scalar_one_or_none()
        assert row is not None
        assert row.key == key
        assert row.content_type == 'text/plain; charset=utf-8'
        # Size and stored value are both expressed in terms of the UTF-8 encoding of the text.
        assert row.size == len(text.encode('utf-8'))
        assert row.value == text.encode('utf-8')


async def test_binary_data_persistence(kvs_client: SqlKeyValueStoreClient) -> None:
    """Check that raw bytes round-trip through the SQL key-value store unchanged."""
    key = 'test-binary'
    payload = b'\x00\x01\x02\x03\x04'
    await kvs_client.set_value(key=key, value=payload)

    # The stored row carries the exact bytes, their length and the binary content type.
    async with kvs_client.get_session() as session:
        query = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == key)
        row = (await session.execute(query)).scalar_one_or_none()
        assert row is not None
        assert row.key == key
        assert row.content_type == 'application/octet-stream'
        assert row.size == len(payload)
        assert row.value == payload

    # Reading back through the client yields the same bytes and content type.
    fetched = await kvs_client.get_value(key=key)
    assert fetched is not None
    assert fetched.value == payload
    assert fetched.content_type == 'application/octet-stream'


async def test_json_serialization_to_record(kvs_client: SqlKeyValueStoreClient) -> None:
    """Check that a dict value is stored as UTF-8 JSON that decodes back to the same dict."""
    key = 'test-json'
    payload = {'name': 'John', 'age': 30, 'items': [1, 2, 3]}
    await kvs_client.set_value(key=key, value=payload)

    async with kvs_client.get_session() as session:
        query = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == key)
        row = (await session.execute(query)).scalar_one_or_none()
        assert row is not None
        assert row.key == key
        decoded = json.loads(row.value.decode('utf-8'))
        assert decoded == payload


async def test_record_deletion_on_value_delete(kvs_client: SqlKeyValueStoreClient) -> None:
    """Check that `delete_value` removes the corresponding record row."""
    key = 'test-delete'
    text = 'Delete me'

    await kvs_client.set_value(key=key, value=text)

    # The row exists after the write and holds the UTF-8 bytes of the text.
    async with kvs_client.get_session() as session:
        query = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == key)
        row = (await session.execute(query)).scalar_one_or_none()
        assert row is not None
        assert row.key == key
        assert row.value == text.encode('utf-8')

    await kvs_client.delete_value(key=key)

    # The same lookup finds nothing once the value has been deleted.
    async with kvs_client.get_session() as session:
        query = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == key)
        row = (await session.execute(query)).scalar_one_or_none()
        assert row is None


async def test_drop_removes_records(kvs_client: SqlKeyValueStoreClient) -> None:
    """Check that `drop` deletes the store's record rows and its metadata row."""
    await kvs_client.set_value(key='test', value='test-value')
    store_metadata = await kvs_client.get_metadata()

    # The record row exists before the drop.
    async with kvs_client.get_session() as session:
        query = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == 'test')
        row = (await session.execute(query)).scalar_one_or_none()
        assert row is not None

    await kvs_client.drop()

    # After the drop neither the record row nor the metadata row remains.
    async with kvs_client.get_session() as session:
        query = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == 'test')
        row = (await session.execute(query)).scalar_one_or_none()
        assert row is None
        metadata_row = await session.get(KeyValueStoreMetadataDb, store_metadata.id)
        assert metadata_row is None


async def test_metadata_record_updates(kvs_client: SqlKeyValueStoreClient) -> None:
    """Verify that reads bump `accessed_at`, writes bump `modified_at`, and the DB row matches."""
    # Capture the baseline timestamps as plain values right away.
    baseline = await kvs_client.get_metadata()
    created_0, accessed_0, modified_0 = baseline.created_at, baseline.accessed_at, baseline.modified_at

    # Short pause so that a later timestamp can be strictly greater.
    await asyncio.sleep(0.01)

    # A read (even of a missing key) should only touch the accessed timestamp.
    await kvs_client.get_value(key='nonexistent')

    after_read = await kvs_client.get_metadata()
    assert after_read.created_at == created_0
    assert after_read.accessed_at > accessed_0
    assert after_read.modified_at == modified_0

    accessed_1 = after_read.accessed_at

    # Short pause so that a later timestamp can be strictly greater.
    await asyncio.sleep(0.01)

    # A write should move both the modified and the accessed timestamps forward.
    await kvs_client.set_value(key='test', value='test-value')

    after_write = await kvs_client.get_metadata()
    assert after_write.created_at == created_0
    assert after_write.modified_at > modified_0
    assert after_write.accessed_at > accessed_1

    # The persisted metadata row must agree with what the client reports.
    async with kvs_client.get_session() as session:
        metadata_row = await session.get(KeyValueStoreMetadataDb, after_write.id)
        assert metadata_row is not None
        assert metadata_row.created_at == after_write.created_at
        assert metadata_row.accessed_at == after_write.accessed_at
        assert metadata_row.modified_at == after_write.modified_at


async def test_data_persistence_across_reopens(configuration: Configuration) -> None:
    """Verify that a value written by one client is readable after reopening the store by ID."""
    async with SqlStorageClient() as sql_storage:
        first_client = await sql_storage.create_kvs_client(
            configuration=configuration,
            name='persistence-test',
        )

        key = 'persistent-key'
        value = 'persistent-value'
        await first_client.set_value(key=key, value=value)

        first_metadata = await first_client.get_metadata()
        store_id = first_metadata.id

        # A second client addressed by ID must see what the first one wrote.
        second_client = await sql_storage.create_kvs_client(
            configuration=configuration,
            id=store_id,
        )

        fetched = await second_client.get_value(key=key)
        assert fetched is not None
        assert fetched.value == value

        await second_client.drop()


================================================
FILE: tests/unit/storage_clients/_sql/test_sql_rq_client.py
================================================
from __future__ import annotations

import asyncio
import json
from typing import TYPE_CHECKING

import pytest
from sqlalchemy import inspect, select
from sqlalchemy.ext.asyncio import create_async_engine

from crawlee import Request
from crawlee.configuration import Configuration
from crawlee.storage_clients import SqlStorageClient
from crawlee.storage_clients._sql._db_models import RequestDb, RequestQueueMetadataDb
from crawlee.storage_clients.models import RequestQueueMetadata

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from pathlib import Path

    from sqlalchemy import Connection

    from crawlee.storage_clients._sql import SqlRequestQueueClient


@pytest.fixture
def configuration(tmp_path: Path) -> Configuration:
    """Provide a configuration whose storage directory is the per-test temporary path."""
    storage_dir = str(tmp_path)
    return Configuration(storage_dir=storage_dir)


@pytest.fixture
async def rq_client(
    configuration: Configuration,
) -> AsyncGenerator[SqlRequestQueueClient, None]:
    """Yield a SQL request queue client named 'test-request-queue' and drop it on teardown."""
    async with SqlStorageClient() as sql_storage:
        queue_client = await sql_storage.create_rq_client(
            configuration=configuration,
            name='test-request-queue',
        )
        yield queue_client
        # Teardown: remove the queue created for this test.
        await queue_client.drop()


# Synchronous helper intended to be passed to `run_sync`, so that `inspect` can be used with an async engine.
def get_tables(sync_conn: Connection) -> list[str]:
    """Return the table names reported by the inspector created for `sync_conn`."""
    return inspect(sync_conn).get_table_names()


async def test_create_tables_with_connection_string(configuration: Configuration, tmp_path: Path) -> None:
    """Verify that the request queue tables exist when the storage client is built from a connection string."""
    db_file = tmp_path / 'test_table.db'
    connection_string = f'sqlite+aiosqlite:///{db_file}'

    async with SqlStorageClient(connection_string=connection_string) as sql_storage:
        await sql_storage.create_rq_client(
            configuration=configuration,
            name='test-request-queue',
        )

        # Table inspection is synchronous, so it is run through `run_sync`.
        async with sql_storage.engine.begin() as conn:
            existing_tables = await conn.run_sync(get_tables)
            for expected_table in ('request_queues', 'request_queue_records', 'request_queue_state'):
                assert expected_table in existing_tables


async def test_create_tables_with_engine(configuration: Configuration, tmp_path: Path) -> None:
    """Verify that the request queue tables exist when the storage client receives a ready-made engine."""
    db_file = tmp_path / 'test_table.db'

    prepared_engine = create_async_engine(f'sqlite+aiosqlite:///{db_file}', future=True, echo=False)

    async with SqlStorageClient(engine=prepared_engine) as sql_storage:
        await sql_storage.create_rq_client(
            configuration=configuration,
            name='test-request-queue',
        )

        # Inspect through the engine that was handed to the storage client.
        async with prepared_engine.begin() as conn:
            existing_tables = await conn.run_sync(get_tables)
            for expected_table in ('request_queues', 'request_queue_records', 'request_queue_state'):
                assert expected_table in existing_tables


async def test_tables_and_metadata_record(configuration: Configuration) -> None:
    """Test that SQL request queue creates proper tables and metadata records.

    Opens a named request queue, checks that the three request queue tables exist, and
    verifies that the metadata row found by name matches the metadata reported by the client.
    """
    async with SqlStorageClient() as storage_client:
        client = await storage_client.create_rq_client(
            name='test-request-queue',
            configuration=configuration,
        )

        client_metadata = await client.get_metadata()

        async with storage_client.engine.begin() as conn:
            tables = await conn.run_sync(get_tables)
            assert 'request_queues' in tables
            assert 'request_queue_records' in tables
            assert 'request_queue_state' in tables

        async with client.get_session() as session:
            stmt = select(RequestQueueMetadataDb).where(RequestQueueMetadataDb.name == 'test-request-queue')
            result = await session.execute(stmt)
            orm_metadata = result.scalar_one_or_none()
            # `scalar_one_or_none` may return None; fail with a clear assertion instead of
            # an opaque validation error when the metadata row is missing.
            assert orm_metadata is not None
            metadata = RequestQueueMetadata.model_validate(orm_metadata)
            assert metadata.id == client_metadata.id
            assert metadata.name == 'test-request-queue'

        await client.drop()


async def test_request_records_persistence(rq_client: SqlRequestQueueClient) -> None:
    """Test that all added requests are persisted and can be retrieved from the database.

    Adds three requests, then reads the request rows belonging to this queue and checks
    that exactly the three expected URLs were stored, each one once.
    """
    expected_urls = [
        'https://example.com/1',
        'https://example.com/2',
        'https://example.com/3',
    ]
    requests = [Request.from_url(url) for url in expected_urls]

    await rq_client.add_batch_of_requests(requests)

    metadata_client = await rq_client.get_metadata()

    async with rq_client.get_session() as session:
        stmt = select(RequestDb).where(RequestDb.request_queue_id == metadata_client.id)
        result = await session.execute(stmt)
        db_requests = result.scalars().all()
        assert len(db_requests) == 3

        # Read the ORM attributes while the session is still open, so the check does not
        # depend on detached instances keeping their loaded state.
        stored_urls = [json.loads(db_request.data)['url'] for db_request in db_requests]

    # Comparing the sorted lists also catches a URL that is stored more than once.
    assert sorted(stored_urls) == sorted(expected_urls)


async def test_drop_removes_records(rq_client: SqlRequestQueueClient) -> None:
    """Verify that `drop` deletes the queue's request rows and its metadata row."""
    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])
    queue_metadata = await rq_client.get_metadata()

    # Sanity check: exactly one request row exists before the drop.
    async with rq_client.get_session() as session:
        lookup = select(RequestDb).where(RequestDb.request_queue_id == queue_metadata.id)
        rows_before = (await session.execute(lookup)).scalars().all()
        assert len(rows_before) == 1

    await rq_client.drop()

    # After the drop neither request rows nor the metadata row may remain.
    async with rq_client.get_session() as session:
        lookup = select(RequestDb).where(RequestDb.request_queue_id == queue_metadata.id)
        rows_after = (await session.execute(lookup)).scalars().all()
        assert len(rows_after) == 0
        metadata_row = await session.get(RequestQueueMetadataDb, queue_metadata.id)
        assert metadata_row is None


async def test_metadata_record_updates(rq_client: SqlRequestQueueClient) -> None:
    """Verify that reads bump `accessed_at`, writes bump `modified_at`, and the DB row matches."""
    # Capture the baseline timestamps as plain values right away.
    baseline = await rq_client.get_metadata()
    created_0, accessed_0, modified_0 = baseline.created_at, baseline.accessed_at, baseline.modified_at

    # Short pause so that a later timestamp can be strictly greater.
    await asyncio.sleep(0.01)

    # `is_empty` is used as the read operation; it should only touch the accessed timestamp.
    await rq_client.is_empty()

    after_read = await rq_client.get_metadata()
    assert after_read.created_at == created_0
    assert after_read.accessed_at > accessed_0
    assert after_read.modified_at == modified_0

    accessed_1 = after_read.accessed_at

    # Short pause so that a later timestamp can be strictly greater.
    await asyncio.sleep(0.01)

    # Adding a request is the write operation; both timestamps should move forward.
    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])

    after_write = await rq_client.get_metadata()
    assert after_write.created_at == created_0
    assert after_write.modified_at > modified_0
    assert after_write.accessed_at > accessed_1

    # The persisted metadata row must agree with what the client reports.
    async with rq_client.get_session() as session:
        metadata_row = await session.get(RequestQueueMetadataDb, after_write.id)
        assert metadata_row is not None
        assert metadata_row.created_at == after_write.created_at
        assert metadata_row.accessed_at == after_write.accessed_at
        assert metadata_row.modified_at == after_write.modified_at


async def test_data_persistence_across_reopens(configuration: Configuration) -> None:
    """Verify that requests added by one client are available after reopening the queue by ID."""
    async with SqlStorageClient() as sql_storage:
        first_client = await sql_storage.create_rq_client(
            configuration=configuration,
            name='persistence-test',
        )

        await first_client.add_batch_of_requests(
            [
                Request.from_url('https://example.com/1'),
                Request.from_url('https://example.com/2'),
            ]
        )

        first_metadata = await first_client.get_metadata()
        queue_id = first_metadata.id

        # A second client addressed by ID must see what the first one added.
        second_client = await sql_storage.create_rq_client(
            configuration=configuration,
            id=queue_id,
        )

        reopened_metadata = await second_client.get_metadata()
        assert reopened_metadata.total_request_count == 2

        # Both requests must still be fetchable; the order is not asserted.
        fetched_first = await second_client.fetch_next_request()
        fetched_second = await second_client.fetch_next_request()

        assert fetched_first is not None
        assert fetched_second is not None
        assert {fetched_first.url, fetched_second.url} == {'https://example.com/1', 'https://example.com/2'}

        await second_client.drop()


================================================
FILE: tests/unit/storages/conftest.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from crawlee import service_locator
from crawlee.storage_clients import (
    FileSystemStorageClient,
    MemoryStorageClient,
    RedisStorageClient,
    SqlStorageClient,
    StorageClient,
)

if TYPE_CHECKING:
    from fakeredis import FakeAsyncRedis


@pytest.fixture(params=['memory', 'file_system', 'sql', 'redis'])
def storage_client(
    request: pytest.FixtureRequest,
    redis_client: FakeAsyncRedis,
) -> StorageClient:
    """Build the storage client selected by the fixture parameter and register it in the service locator."""
    backend = request.param

    client: StorageClient
    if backend == 'memory':
        client = MemoryStorageClient()
    elif backend == 'sql':
        client = SqlStorageClient()
    elif backend == 'redis':
        # The Redis client is backed by the `redis_client` fixture.
        client = RedisStorageClient(redis=redis_client)
    else:
        # Remaining parameter value ('file_system') falls through to the file system client.
        client = FileSystemStorageClient()

    # Make the chosen client the globally registered one for the duration of the test.
    service_locator.set_storage_client(client)
    return client


================================================
FILE: tests/unit/storages/test_dataset.py
================================================
from __future__ import annotations

import json
from typing import TYPE_CHECKING

import pytest

from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient
from crawlee.storages import Dataset, KeyValueStore
from crawlee.storages._storage_instance_manager import StorageInstanceManager

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from pathlib import Path
    from typing import Any

    from crawlee.storage_clients import StorageClient


@pytest.fixture
async def dataset(
    storage_client: StorageClient,
) -> AsyncGenerator[Dataset, None]:
    """Yield a dataset opened with default arguments on the given storage client; drop it on teardown."""
    opened = await Dataset.open(storage_client=storage_client)

    yield opened

    # Teardown: remove the dataset created for this test.
    await opened.drop()


async def test_open_creates_new_dataset(
    storage_client: StorageClient,
) -> None:
    """Verify that opening a dataset by a new name yields an empty dataset with an ID and that name."""
    created = await Dataset.open(storage_client=storage_client, name='new-dataset')

    # Identity checks.
    assert created.id is not None
    assert created.name == 'new-dataset'

    # A freshly created dataset holds no items.
    created_metadata = await created.get_metadata()
    assert created_metadata.item_count == 0

    await created.drop()


async def test_reopen_default(
    storage_client: StorageClient,
) -> None:
    """Verify that opening the default dataset twice returns the same cached instance with shared data."""
    # First open with default arguments.
    first = await Dataset.open(storage_client=storage_client)

    assert first.id is not None
    empty_metadata = await first.get_metadata()
    assert empty_metadata.item_count == 0

    # Push one item and confirm the count follows.
    await first.push_data({'key': 'value'})
    populated_metadata = await first.get_metadata()
    assert populated_metadata.item_count == 1

    # Second open with the same arguments.
    second = await Dataset.open(storage_client=storage_client)

    # Both handles must describe the same dataset.
    assert second.id == first.id
    assert second.name == first.name
    first_metadata = await first.get_metadata()
    second_metadata = await second.get_metadata()
    assert second_metadata.item_count == first_metadata.item_count == 1

    # They are expected to be the very same object (served from the cache).
    assert first is second

    await first.drop()


async def test_open_by_id(
    storage_client: StorageClient,
) -> None:
    """Verify that a dataset created by name can be opened again through its ID."""
    # Create the dataset by name and push a recognisable item.
    by_name = await Dataset.open(storage_client=storage_client, name='dataset-by-id-test')
    await by_name.push_data({'test': 'opening_by_id', 'timestamp': 12345})

    # Open it a second time, this time by ID.
    by_id = await Dataset.open(storage_client=storage_client, id=by_name.id)

    # Same dataset, same name.
    assert by_id.id == by_name.id
    assert by_id.name == 'dataset-by-id-test'

    # The pushed item is visible through the second handle.
    page = await by_id.get_data()
    assert page.count == 1
    only_item = page.items[0]
    assert only_item['test'] == 'opening_by_id'
    assert only_item['timestamp'] == 12345

    await by_id.drop()


async def test_open_existing_dataset(
    dataset: Dataset,
) -> None:
    """Verify that opening with the fixture dataset's name returns the same cached dataset."""
    # No storage client is passed here; the open call uses its default.
    reopened = await Dataset.open(name=dataset.name)

    # Identity and item count must match.
    assert dataset.id == reopened.id
    assert dataset.name == reopened.name
    original_metadata = await dataset.get_metadata()
    reopened_metadata = await reopened.get_metadata()
    assert original_metadata.item_count == reopened_metadata.item_count

    # They are expected to be the very same object (served from the cache).
    assert dataset is reopened


async def test_open_with_id_and_name(
    storage_client: StorageClient,
) -> None:
    """Verify that passing both `id` and `name` to `open` raises a `ValueError`."""
    expected_message = (
        r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "id", "name".'
    )

    with pytest.raises(ValueError, match=expected_message):
        await Dataset.open(id='some-id', name='some-name', storage_client=storage_client)


async def test_push_data_single_item(dataset: Dataset) -> None:
    """Verify that a single pushed dict comes back unchanged as the only item."""
    pushed = {'key': 'value', 'number': 42}
    await dataset.push_data(pushed)

    page = await dataset.get_data()
    assert page.count == 1
    assert page.items[0] == pushed


async def test_push_data_multiple_items(dataset: Dataset) -> None:
    """Verify that a pushed list of dicts comes back complete and in the same order."""
    pushed = [{'id': number, 'name': f'Item {number}'} for number in (1, 2, 3)]
    await dataset.push_data(pushed)

    page = await dataset.get_data()
    assert page.count == 3
    assert page.items == pushed


async def test_get_data_empty_dataset(dataset: Dataset) -> None:
    """Verify that `get_data` on a dataset without items reports zero counts and an empty list."""
    page = await dataset.get_data()

    assert page.count == 0
    assert page.total == 0
    assert page.items == []


async def test_get_data_with_pagination(dataset: Dataset) -> None:
    """Verify `offset` and `limit`, separately and combined, against ten items with ids 1..10."""
    await dataset.push_data([{'id': number} for number in range(1, 11)])

    # Offset only: the first three items are skipped.
    offset_page = await dataset.get_data(offset=3)
    assert offset_page.count == 7
    assert offset_page.offset == 3
    assert offset_page.items[0]['id'] == 4

    # Limit only: the first five items are returned.
    limit_page = await dataset.get_data(limit=5)
    assert limit_page.count == 5
    assert limit_page.limit == 5
    assert limit_page.items[-1]['id'] == 5

    # Offset and limit together: items 3..5.
    window_page = await dataset.get_data(offset=2, limit=3)
    assert window_page.count == 3
    assert window_page.offset == 2
    assert window_page.limit == 3
    assert window_page.items[0]['id'] == 3
    assert window_page.items[-1]['id'] == 5


async def test_get_data_descending_order(dataset: Dataset) -> None:
    """Verify that `desc=True` returns the items last-pushed first."""
    await dataset.push_data([{'id': number} for number in range(1, 6)])

    reversed_page = await dataset.get_data(desc=True)

    assert reversed_page.desc is True
    first_item, last_item = reversed_page.items[0], reversed_page.items[-1]
    assert first_item['id'] == 5
    assert last_item['id'] == 1


async def test_get_data_skip_empty(dataset: Dataset) -> None:
    """Verify that `skip_empty=True` leaves out items that are empty dicts."""
    # Two regular items with an empty dict between them.
    await dataset.push_data(
        [
            {'id': 1, 'name': 'Item 1'},
            {},
            {'id': 3, 'name': 'Item 3'},
        ]
    )

    # Without the option all three items are returned.
    everything = await dataset.get_data()
    assert everything.count == 3

    # With the option only the two non-empty items remain.
    non_empty = await dataset.get_data(skip_empty=True)
    assert non_empty.count == 2
    assert not any(item == {} for item in non_empty.items)


async def test_iterate_items(dataset: Dataset) -> None:
    """Verify that `iterate_items` yields every item, in push order."""
    await dataset.push_data([{'id': number} for number in range(1, 6)])

    # Drain the async iterator into a list.
    seen = []
    async for item in dataset.iterate_items():
        seen.append(item)

    assert len(seen) == 5
    assert seen[0]['id'] == 1
    assert seen[-1]['id'] == 5


async def test_iterate_items_with_options(dataset: Dataset) -> None:
    """Verify `iterate_items` with `offset`/`limit` and with `desc`/`limit` against ids 1..10."""
    await dataset.push_data([{'id': number} for number in range(1, 11)])

    # Offset 3, limit 3: items with ids 4..6.
    window = []
    async for item in dataset.iterate_items(offset=3, limit=3):
        window.append(item)

    assert len(window) == 3
    assert window[0]['id'] == 4
    assert window[-1]['id'] == 6

    # Descending with limit 3: items with ids 10..8.
    newest_first = [item async for item in dataset.iterate_items(desc=True, limit=3)]

    assert len(newest_first) == 3
    assert newest_first[0]['id'] == 10
    assert newest_first[-1]['id'] == 8


async def test_list_items(dataset: Dataset) -> None:
    """Verify that `list_items` returns every item, in push order."""
    await dataset.push_data([{'id': number} for number in range(1, 6)])

    listed = await dataset.list_items()

    assert len(listed) == 5
    first_item, last_item = listed[0], listed[-1]
    assert first_item['id'] == 1
    assert last_item['id'] == 5


async def test_list_items_with_options(dataset: Dataset) -> None:
    """Verify `list_items` with `offset`/`limit`, `desc` and `skip_empty` on a mixed set of items."""
    # Five items: three complete, one without 'name', one empty dict.
    source_items: list[dict[str, Any]] = [
        {'id': 1, 'name': 'Item 1'},
        {'id': 2, 'name': 'Item 2'},
        {'id': 3},
        {},
        {'id': 5, 'name': 'Item 5'},
    ]
    await dataset.push_data(source_items)

    # Offset 1, limit 2: the second and third items.
    window = await dataset.list_items(offset=1, limit=2)
    assert len(window) == 2
    assert window[0]['id'] == 2
    assert window[1]['id'] == 3

    # Descending order; empty items are skipped so that looking up 'id' cannot raise KeyError.
    descending = await dataset.list_items(desc=True, skip_empty=True)

    # Collect the ids of all returned items that carry one.
    id_values = [item['id'] for item in descending if 'id' in item]

    # The ids must already be sorted from highest to lowest.
    assert sorted(id_values, reverse=True) == id_values, f'IDs should be in descending order. Got {id_values}'

    # When both ids are present, 5 must precede 3.
    if 5 in id_values and 3 in id_values:
        assert id_values.index(5) < id_values.index(3), 'ID 5 should come before ID 3 in descending order'

    # skip_empty alone: the empty dict is left out.
    non_empty = await dataset.list_items(skip_empty=True)
    assert len(non_empty) == 4
    assert not any(item == {} for item in non_empty)

    # Field selection is emulated locally, on top of an unfiltered listing.
    everything = await dataset.list_items()
    assert len(everything) == 5

    # Keep only the 'id' key of each item (items without it become empty dicts).
    id_only = [{'id': item['id']} if 'id' in item else {} for item in everything]

    assert not any('name' in item for item in id_only)

    # Cleaning is emulated locally as well: list everything, then discard empty dicts.
    everything_again = await dataset.list_items()
    cleaned = [item for item in everything_again if item != {}]

    assert len(cleaned) == 4
    assert not any(item == {} for item in cleaned)


async def test_drop(
    storage_client: StorageClient,
) -> None:
    """Verify that a dataset reopened under the same name after `drop` starts out empty."""
    doomed = await Dataset.open(storage_client=storage_client, name='drop-test')

    # Put something in, then drop the whole dataset.
    await doomed.push_data({'test': 'data'})
    await doomed.drop()

    # Opening the same name again must not bring the old item back.
    recreated = await Dataset.open(storage_client=storage_client, name='drop-test')

    page = await recreated.get_data()
    assert page.count == 0
    await recreated.drop()


async def test_export_to_json(
    dataset: Dataset,
    storage_client: StorageClient,
) -> None:
    """Test exporting dataset to JSON format.

    Pushes three items, exports them to the 'export-kvs' key-value store under
    'dataset_export.json' and checks that every item id appears in the exported record.
    """
    # Create a key-value store for export. The storage client is passed explicitly so the
    # store read here is opened the same way as the export target below.
    kvs = await KeyValueStore.open(
        name='export-kvs',
        storage_client=storage_client,
    )

    # Add some items to the dataset
    items = [
        {'id': 1, 'name': 'Item 1'},
        {'id': 2, 'name': 'Item 2'},
        {'id': 3, 'name': 'Item 3'},
    ]
    await dataset.push_data(items)

    # Export to JSON
    await dataset.export_to(
        key='dataset_export.json',
        content_type='json',
        to_kvs_name='export-kvs',
        to_kvs_storage_client=storage_client,
    )

    # Retrieve the exported file
    record = await kvs.get_value(key='dataset_export.json')
    assert record is not None

    # Verify content has all the items
    assert '"id": 1' in record
    assert '"id": 2' in record
    assert '"id": 3' in record

    await kvs.drop()


async def test_export_to_csv(
    dataset: Dataset,
    storage_client: StorageClient,
) -> None:
    """Verify that exporting as CSV writes a header row and one row per item into the target store."""
    # The store that will receive the export.
    export_store = await KeyValueStore.open(storage_client=storage_client, name='export-kvs')

    await dataset.push_data([{'id': number, 'name': f'Item {number}'} for number in (1, 2, 3)])

    await dataset.export_to(
        key='dataset_export.csv',
        content_type='csv',
        to_kvs_name='export-kvs',
        to_kvs_storage_client=storage_client,
    )

    exported = await export_store.get_value(key='dataset_export.csv')
    assert exported is not None

    # Header first, then each of the three data rows.
    for expected_fragment in ('id,name', '1,Item 1', '2,Item 2', '3,Item 3'):
        assert expected_fragment in exported

    await export_store.drop()


async def test_export_to_invalid_content_type(dataset: Dataset) -> None:
    """Verify that `export_to` rejects an unknown content type with a `ValueError`."""
    with pytest.raises(
        ValueError,
        match=r'Unsupported content type',
    ):
        await dataset.export_to(key='invalid_export', content_type='invalid')  # ty: ignore[no-matching-overload]


async def test_export_with_multiple_kwargs(dataset: Dataset, tmp_path: Path) -> None:
    """Verify an export that sets the target store name, storage client and configuration at once."""
    kvs_name = 'some-kvs'
    fs_client = FileSystemStorageClient()
    export_key = 'exported_dataset'
    item = {'some key': 'some data'}

    # Dedicated storage directory for the export target.
    export_root = tmp_path / 'some_dir'
    export_root.mkdir()
    export_configuration = Configuration(storage_dir=str(export_root))

    # What the export is expected to produce and where.
    expected_payload = json.dumps([{'some key': 'some data'}])
    expected_kvs_dir = export_root / 'key_value_stores' / kvs_name

    await dataset.push_data(item)
    await dataset.export_to(
        key=export_key,
        content_type='json',
        to_kvs_name=kvs_name,
        to_kvs_storage_client=fs_client,
        to_kvs_configuration=export_configuration,
    )

    # The store directory must exist under the custom storage directory.
    assert expected_kvs_dir.is_dir()

    # Reading the key back through the same client and configuration must give the exported JSON.
    target_store = await KeyValueStore.open(
        name=kvs_name,
        storage_client=fs_client,
        configuration=export_configuration,
    )

    assert await target_store.get_value(key=export_key) == expected_payload


async def test_large_dataset(dataset: Dataset) -> None:
    """Verify full retrieval and a mid-range page on a dataset of one hundred items."""
    await dataset.push_data([{'id': index, 'value': f'value-{index}'} for index in range(100)])

    # No limit: every item comes back.
    full_page = await dataset.get_data(limit=None)
    assert full_page.count == 100
    assert full_page.total == 100

    # Offset 50, limit 25: items with ids 50..74.
    middle_page = await dataset.get_data(offset=50, limit=25)
    assert middle_page.count == 25
    assert middle_page.offset == 50
    assert middle_page.items[0]['id'] == 50
    assert middle_page.items[-1]['id'] == 74


async def test_purge(
    storage_client: StorageClient,
) -> None:
    """Verify that `purge` empties a dataset while keeping its ID and name, and that it stays usable."""
    target = await Dataset.open(storage_client=storage_client, name='purge-test-dataset')

    # Fill the dataset with three items.
    await target.push_data([{'id': number, 'name': f'Item {number}'} for number in (1, 2, 3)])

    # All three are visible through both the data page and the metadata.
    before_page = await target.get_data()
    assert before_page.count == 3
    assert before_page.total == 3
    before_metadata = await target.get_metadata()
    assert before_metadata.item_count == 3

    # Remember the ID so it can be compared after the purge.
    id_before_purge = target.id

    await target.purge()

    # Identity is unchanged.
    assert target.id == id_before_purge
    assert target.name == 'purge-test-dataset'

    # Content is gone.
    after_page = await target.get_data()
    assert after_page.count == 0
    assert after_page.total == 0
    after_metadata = await target.get_metadata()
    assert after_metadata.item_count == 0

    # The purged dataset still accepts new items.
    await target.push_data({'id': 4, 'name': 'New Item After Purge'})

    refilled_page = await target.get_data()
    assert refilled_page.count == 1
    assert refilled_page.items[0]['name'] == 'New Item After Purge'

    await target.drop()


async def test_open_with_alias(
    storage_client: StorageClient,
) -> None:
    """Verify that two different aliases give two separate, unnamed datasets with isolated data."""
    first = await Dataset.open(storage_client=storage_client, alias='test_alias_1')
    second = await Dataset.open(storage_client=storage_client, alias='test_alias_2')

    # Distinct datasets, neither of which has a name.
    assert first.id != second.id
    assert first.name is None
    assert second.name is None

    # Push a different item into each.
    await first.push_data({'source': 'alias_1', 'value': 1})
    await second.push_data({'source': 'alias_2', 'value': 2})

    # Each dataset must contain only its own item.
    first_page = await first.get_data()
    second_page = await second.get_data()

    assert first_page.count == 1
    assert second_page.count == 1
    assert first_page.items[0]['source'] == 'alias_1'
    assert second_page.items[0]['source'] == 'alias_2'

    await first.drop()
    await second.drop()


async def test_alias_caching(
    storage_client: StorageClient,
) -> None:
    """Check that opening the same alias twice hands back the very same Dataset object."""
    original = await Dataset.open(alias='cache_test', storage_client=storage_client)
    reopened = await Dataset.open(alias='cache_test', storage_client=storage_client)

    # Identity, not mere equality: the second open is expected to return the cached instance.
    assert original is reopened
    assert original.id == reopened.id

    await original.drop()


async def test_alias_with_id_error(
    storage_client: StorageClient,
) -> None:
    """Check that Dataset.open() raises ValueError when both `id` and `alias` are passed."""
    # Regex handed to pytest.raises; it is searched for in the error message.
    expected_message = (
        r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "id", "alias".'
    )

    with pytest.raises(ValueError, match=expected_message):
        await Dataset.open(id='some-id', alias='some-alias', storage_client=storage_client)


async def test_alias_with_name_error(
    storage_client: StorageClient,
) -> None:
    """Check that Dataset.open() raises ValueError when both `name` and `alias` are passed."""
    # Regex handed to pytest.raises; it is searched for in the error message.
    expected_message = (
        r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "name", "alias".'
    )

    with pytest.raises(ValueError, match=expected_message):
        await Dataset.open(name='some-name', alias='some-alias', storage_client=storage_client)


async def test_alias_with_all_parameters_error(
    storage_client: StorageClient,
) -> None:
    """Check that Dataset.open() raises ValueError when `id`, `name`, and `alias` are all passed."""
    # Regex handed to pytest.raises; it is searched for in the error message.
    expected_message = (
        r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "id", "name", "alias".'
    )

    with pytest.raises(ValueError, match=expected_message):
        await Dataset.open(
            id='some-id',
            name='some-name',
            alias='some-alias',
            storage_client=storage_client,
        )


async def test_alias_with_special_characters(
    storage_client: StorageClient,
) -> None:
    """Check that aliases containing dashes, underscores, dots, digits, or mixed case all work."""
    aliases = (
        'alias-with-dashes',
        'alias_with_underscores',
        'alias.with.dots',
        'alias123with456numbers',
        'CamelCaseAlias',
    )

    # Remember each alias together with the dataset it opened.
    opened = []
    for alias in aliases:
        dataset = await Dataset.open(alias=alias, storage_client=storage_client)
        opened.append((alias, dataset))

        # Tag the record with the alias so it can be traced back to its dataset.
        await dataset.push_data({'alias_used': alias, 'test': 'special_chars'})

    # Every dataset must hold exactly the one record written through its own alias.
    for alias, dataset in opened:
        page = await dataset.get_data()
        assert page.count == 1
        assert page.items[0]['alias_used'] == alias

    # Remove all datasets again.
    for _, dataset in opened:
        await dataset.drop()


async def test_named_vs_alias_conflict_detection(
    storage_client: StorageClient,
) -> None:
    """Check that a name and an alias sharing one identifier are rejected, whichever is opened first."""
    # Direction 1: the named storage exists, the alias is requested afterwards.
    named = await Dataset.open(name='conflict-test', storage_client=storage_client)
    assert named.name == 'conflict-test'

    alias_conflict = r'Cannot create alias storage "conflict-test".*already exists'
    with pytest.raises(ValueError, match=alias_conflict):
        await Dataset.open(alias='conflict-test', storage_client=storage_client)

    await named.drop()

    # Direction 2: the alias storage exists, the named storage is requested afterwards.
    aliased = await Dataset.open(alias='conflict-test2', storage_client=storage_client)
    # An alias storage reports no name.
    assert aliased.name is None

    name_conflict = r'Cannot create named storage "conflict-test2".*already exists'
    with pytest.raises(ValueError, match=name_conflict):
        await Dataset.open(name='conflict-test2', storage_client=storage_client)

    await aliased.drop()


async def test_alias_parameter(
    storage_client: StorageClient,
) -> None:
    """Check that a dataset opened via `alias` has an ID, has no name, and supports push/get."""
    dataset = await Dataset.open(alias='test_alias', storage_client=storage_client)

    # An alias storage gets an ID but stays unnamed.
    assert dataset.id is not None
    assert dataset.name is None

    # Round-trip a single record.
    await dataset.push_data({'type': 'alias', 'value': 1})
    page = await dataset.get_data()
    assert page.count == 1
    assert page.items[0]['type'] == 'alias'

    await dataset.drop()


async def test_alias_vs_named_isolation(
    storage_client: StorageClient,
) -> None:
    """Test that an alias dataset can reuse the identifier of a named dataset that has been dropped.

    A named dataset is created, written to, and dropped. An alias dataset is then opened with the
    very same identifier; it must come back unnamed and contain only the record pushed via the alias.
    """
    # One identifier shared by both storages, so the test really exercises the "same identifier"
    # scenario. It uses dashes (not underscores) so that it is acceptable as a storage name too.
    identifier = 'test-identifier'

    # Create named dataset
    named_dataset = await Dataset.open(
        name=identifier,
        storage_client=storage_client,
    )

    # Verify named dataset
    assert named_dataset.name == identifier
    await named_dataset.push_data({'type': 'named'})

    # Drop the named dataset first; while it is still open, an alias with the same identifier
    # would be a naming conflict.
    await named_dataset.drop()

    # Now create alias dataset with the same identifier (should work after cleanup)
    alias_dataset = await Dataset.open(
        alias=identifier,
        storage_client=storage_client,
    )

    # The alias storage is a different, unnamed storage
    assert alias_dataset.name is None
    await alias_dataset.push_data({'type': 'alias'})

    # Only the alias record may be present - nothing from the dropped named dataset
    alias_data = await alias_dataset.get_data()
    assert alias_data.count == 1
    assert alias_data.items[0]['type'] == 'alias'

    await alias_dataset.drop()


async def test_default_vs_alias_default_equivalence(
    storage_client: StorageClient,
) -> None:
    """Check that opening with no identifier and opening with the default alias reach the same dataset."""
    implicit_default = await Dataset.open(storage_client=storage_client)
    explicit_default = await Dataset.open(
        alias=StorageInstanceManager._DEFAULT_STORAGE_ALIAS,
        storage_client=storage_client,
    )

    # Both handles must point at one unnamed storage.
    assert implicit_default.id == explicit_default.id
    assert implicit_default.name is None
    assert explicit_default.name is None

    # A record pushed through one handle must be readable through the other.
    await implicit_default.push_data({'source': 'default'})
    page = await explicit_default.get_data()
    assert page.items[0]['source'] == 'default'

    await implicit_default.drop()


async def test_multiple_alias_isolation(
    storage_client: StorageClient,
) -> None:
    """Check that three different aliases produce three datasets with distinct IDs and separate data."""
    alias_names = [f'alias_{index}' for index in range(3)]

    datasets = []
    for index, alias in enumerate(alias_names):
        dataset = await Dataset.open(alias=alias, storage_client=storage_client)
        await dataset.push_data({'alias': alias, 'index': index})
        datasets.append(dataset)

    # Compare every unordered pair: no two datasets may share an ID.
    for position, left in enumerate(datasets):
        for right in datasets[position + 1 :]:
            assert left.id != right.id

    # Each dataset's first record must be the one written through its own alias;
    # the dataset is dropped right after it has been checked.
    for index, dataset in enumerate(datasets):
        page = await dataset.get_data()
        assert page.items[0]['alias'] == alias_names[index]
        await dataset.drop()


async def test_purge_on_start_enabled(storage_client: StorageClient) -> None:
    """With purge_on_start=True, reopening must empty unnamed (default/alias) datasets but keep named ones."""

    # The scenario needs data to outlive the storage cache, which memory storage cannot provide.
    if isinstance(storage_client, MemoryStorageClient):
        pytest.skip('Memory storage does not persist data between client instances.')

    config = Configuration(purge_on_start=True)

    # Phase 1: open one dataset of each kind and put a record into it.
    default_ds = await Dataset.open(storage_client=storage_client, configuration=config)
    alias_ds = await Dataset.open(alias='purge-test-alias', storage_client=storage_client, configuration=config)
    named_ds = await Dataset.open(name='purge-test-named', storage_client=storage_client, configuration=config)

    await default_ds.push_data({'type': 'default', 'data': 'should_be_purged'})
    await alias_ds.push_data({'type': 'alias', 'data': 'should_be_purged'})
    await named_ds.push_data({'type': 'named', 'data': 'should_persist'})

    # Every dataset holds its one record before the reopen.
    default_before = await default_ds.get_data()
    alias_before = await alias_ds.get_data()
    named_before = await named_ds.get_data()

    assert len(default_before.items) == 1
    assert len(alias_before.items) == 1
    assert len(named_before.items) == 1

    # Only the dataset opened by name reports a name in its metadata.
    default_meta = await default_ds.get_metadata()
    alias_meta = await alias_ds.get_metadata()
    named_meta = await named_ds.get_metadata()

    assert default_meta.name is None
    assert alias_meta.name is None
    assert named_meta.name == 'purge-test-named'

    # Phase 2: forget the cached instances so the next open() calls behave like a fresh start.
    service_locator.storage_instance_manager.clear_cache()

    default_reopened = await Dataset.open(storage_client=storage_client, configuration=config)
    alias_reopened = await Dataset.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=config,
    )
    named_reopened = await Dataset.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=config,
    )

    default_after = await default_reopened.get_data()
    alias_after = await alias_reopened.get_data()
    named_after = await named_reopened.get_data()

    # The unnamed datasets (default and alias) come back empty.
    assert len(default_after.items) == 0
    assert len(alias_after.items) == 0

    # The named dataset still has its record.
    assert len(named_after.items) == 1

    # Remove everything again.
    await named_reopened.drop()
    await alias_reopened.drop()
    await default_reopened.drop()


async def test_purge_on_start_disabled(storage_client: StorageClient) -> None:
    """With purge_on_start=False, reopening must keep the data of default, alias, and named datasets alike."""

    # The scenario needs data to outlive the storage cache, which memory storage cannot provide.
    if isinstance(storage_client, MemoryStorageClient):
        pytest.skip('Memory storage does not persist data between client instances.')

    config = Configuration(purge_on_start=False)

    # Phase 1: open one dataset of each kind and put a record into it.
    default_ds = await Dataset.open(storage_client=storage_client, configuration=config)
    alias_ds = await Dataset.open(alias='purge-test-alias', storage_client=storage_client, configuration=config)
    named_ds = await Dataset.open(name='purge-test-named', storage_client=storage_client, configuration=config)

    await default_ds.push_data({'type': 'default', 'data': 'should_persist'})
    await alias_ds.push_data({'type': 'alias', 'data': 'should_persist'})
    await named_ds.push_data({'type': 'named', 'data': 'should_persist'})

    # Every dataset holds its one record before the reopen.
    default_before = await default_ds.get_data()
    alias_before = await alias_ds.get_data()
    named_before = await named_ds.get_data()

    assert len(default_before.items) == 1
    assert len(alias_before.items) == 1
    assert len(named_before.items) == 1

    # Only the dataset opened by name reports a name in its metadata.
    default_meta = await default_ds.get_metadata()
    alias_meta = await alias_ds.get_metadata()
    named_meta = await named_ds.get_metadata()

    assert default_meta.name is None
    assert alias_meta.name is None
    assert named_meta.name == 'purge-test-named'

    # Phase 2: forget the cached instances so the next open() calls behave like a fresh start.
    service_locator.storage_instance_manager.clear_cache()

    default_reopened = await Dataset.open(storage_client=storage_client, configuration=config)
    alias_reopened = await Dataset.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=config,
    )
    named_reopened = await Dataset.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=config,
    )

    # Read everything back after the reopen.
    default_after = await default_reopened.get_data()
    alias_after = await alias_reopened.get_data()
    named_after = await named_reopened.get_data()

    # Nothing was purged: each dataset still has exactly its one record ...
    assert len(default_after.items) == 1
    assert len(alias_after.items) == 1
    assert len(named_after.items) == 1

    # ... and that record is the one written in phase 1.
    assert default_after.items[0]['data'] == 'should_persist'
    assert alias_after.items[0]['data'] == 'should_persist'
    assert named_after.items[0]['data'] == 'should_persist'

    # Remove everything again.
    await default_reopened.drop()
    await alias_reopened.drop()
    await named_reopened.drop()


async def test_name_default_not_allowed(storage_client: StorageClient) -> None:
    """Check that the default storage alias is refused as a storage name, so the two cannot collide."""
    reserved = StorageInstanceManager._DEFAULT_STORAGE_ALIAS
    expected_message = f'Storage name cannot be "{reserved}" as it is reserved for default alias.'

    with pytest.raises(ValueError, match=expected_message):
        await Dataset.open(name=reserved, storage_client=storage_client)


@pytest.mark.parametrize(
    ('name', 'is_valid'),
    [
        pytest.param('F', True, id='single-char'),
        pytest.param('7', True, id='single-digit'),
        pytest.param('FtghdfseySds', True, id='mixed-case'),
        pytest.param('125673450', True, id='all-digits'),
        pytest.param('Ft2134Sfe0O1hf', True, id='mixed-alphanumeric'),
        pytest.param('name-with-dashes', True, id='dashes'),
        pytest.param('1-value', True, id='number start'),
        pytest.param('value-1', True, id='number end'),
        pytest.param('test-1-value', True, id='number middle'),
        pytest.param('test-------value', True, id='multiple-dashes'),
        pytest.param('test-VALUES-test', True, id='multiple-cases'),
        pytest.param('name_with_underscores', False, id='underscores'),
        pytest.param('name with spaces', False, id='spaces'),
        pytest.param('-test', False, id='dashes start'),
        pytest.param('test-', False, id='dashes end'),
    ],
)
async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None:
    """Check that Dataset.open() accepts the valid names and raises ValueError for the invalid ones."""
    if not is_valid:
        # Rejected names must surface as a ValueError that quotes the offending name.
        with pytest.raises(ValueError, match=rf'Invalid storage name "{name}".*'):
            await Dataset.open(name=name, storage_client=storage_client)
        return

    # Accepted names open normally and are reported back unchanged.
    dataset = await Dataset.open(name=name, storage_client=storage_client)
    assert dataset.name == name
    await dataset.drop()


async def test_record_with_noascii_chars(dataset: Dataset) -> None:
    """Check that a record whose values contain non-ASCII text is read back unchanged."""
    record = {
        'record_1': 'Supermaxi El Jardín',
        'record_2': 'záznam dva',
        'record_3': '記録三',
    }

    await dataset.push_data(record)

    # The first stored item must equal the record exactly, character for character.
    page = await dataset.get_data()
    assert page is not None
    assert page.items[0] == record


================================================
FILE: tests/unit/storages/test_key_value_store.py
================================================
from __future__ import annotations

import json
from typing import TYPE_CHECKING

import pytest

from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, SqlStorageClient, StorageClient
from crawlee.storages import KeyValueStore
from crawlee.storages._storage_instance_manager import StorageInstanceManager

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from pathlib import Path


@pytest.fixture
async def kvs(
    storage_client: StorageClient,
) -> AsyncGenerator[KeyValueStore, None]:
    """Yield a key-value store opened without id, name, or alias, and drop it once the test is done."""
    store = await KeyValueStore.open(storage_client=storage_client)

    yield store

    # Teardown: runs after the test that requested the fixture has finished.
    await store.drop()


async def test_open_creates_new_kvs(
    storage_client: StorageClient,
) -> None:
    """Check that open() with a name returns a store that has an ID and reports that name."""
    store = await KeyValueStore.open(name='new-kvs', storage_client=storage_client)

    assert store.id is not None
    assert store.name == 'new-kvs'

    await store.drop()


async def test_open_existing_kvs(
    kvs: KeyValueStore,
    storage_client: StorageClient,
) -> None:
    """Check that opening again with the fixture store's name returns that same store object."""
    reopened = await KeyValueStore.open(name=kvs.name, storage_client=storage_client)

    # Same storage ...
    assert kvs.id == reopened.id
    assert kvs.name == reopened.name

    # ... and the very same Python object, i.e. the instance came from the cache.
    assert kvs is reopened


async def test_open_with_id_and_name(
    storage_client: StorageClient,
) -> None:
    """Check that KeyValueStore.open() raises ValueError when both `id` and `name` are passed."""
    # Regex handed to pytest.raises; it is searched for in the error message.
    expected_message = (
        r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "id", "name".'
    )

    with pytest.raises(ValueError, match=expected_message):
        await KeyValueStore.open(id='some-id', name='some-name', storage_client=storage_client)


async def test_open_by_id(
    storage_client: StorageClient,
) -> None:
    """Check that a store created by name can afterwards be opened through its ID, data included."""
    by_name = await KeyValueStore.open(name='kvs-by-id-test', storage_client=storage_client)

    # Leave a marker record so the store can be recognised when it is opened again.
    await by_name.set_value('test_key', {'test': 'opening_by_id', 'timestamp': 12345})

    by_id = await KeyValueStore.open(id=by_name.id, storage_client=storage_client)

    # Opening by ID must resolve to the store created above.
    assert by_id.id == by_name.id
    assert by_id.name == 'kvs-by-id-test'

    # The marker record is readable through the second handle.
    marker = await by_id.get_value('test_key')
    assert marker is not None
    assert marker['test'] == 'opening_by_id'
    assert marker['timestamp'] == 12345

    await by_id.drop()


async def test_set_get_value(kvs: KeyValueStore) -> None:
    """Check that a value written with set_value() is returned unchanged by get_value()."""
    payload = {'data': 'value', 'number': 42}

    await kvs.set_value('test-key', payload)
    fetched = await kvs.get_value('test-key')

    assert fetched == payload


async def test_set_get_none(kvs: KeyValueStore) -> None:
    """Check that storing None under a key makes get_value() return None for that key."""
    await kvs.set_value('none-key', None)

    fetched = await kvs.get_value('none-key')
    assert fetched is None


async def test_get_value_nonexistent(kvs: KeyValueStore) -> None:
    """Check that get_value() returns None for a key that was never written."""
    missing = await kvs.get_value('nonexistent-key')
    assert missing is None


async def test_get_value_with_default(kvs: KeyValueStore) -> None:
    """Check that get_value() falls back to `default_value` for a key that was never written."""
    fallback = {'default': True}

    fetched = await kvs.get_value('nonexistent-key', default_value=fallback)

    assert fetched == fallback


async def test_set_value_with_content_type(kvs: KeyValueStore) -> None:
    """Check that a value stored with an explicit JSON content type is read back unchanged."""
    payload = {'data': 'value', 'items': [1, 2, 3]}

    await kvs.set_value('test-json', payload, content_type='application/json')

    # The explicit content type must not alter what get_value() returns.
    fetched = await kvs.get_value('test-json')
    assert fetched == payload


async def test_delete_value(kvs: KeyValueStore) -> None:
    """Check that delete_value() makes a previously stored value unavailable."""
    key = 'delete-me'
    payload = 'value to delete'

    # Precondition: the value is stored and readable.
    await kvs.set_value(key, payload)
    before_delete = await kvs.get_value(key)
    assert before_delete == payload

    await kvs.delete_value(key)

    # After deletion the key reads as None.
    after_delete = await kvs.get_value(key)
    assert after_delete is None


async def test_list_keys_empty_kvs(kvs: KeyValueStore) -> None:
    """Check that list_keys() on a store with no records returns nothing."""
    listed = await kvs.list_keys()
    assert len(listed) == 0


async def test_list_keys(kvs: KeyValueStore) -> None:
    """Check that list_keys() reports every key that was stored."""
    stored = (('key1', 'value1'), ('key2', 'value2'), ('key3', 'value3'))
    for key, value in stored:
        await kvs.set_value(key, value)

    listed = await kvs.list_keys()

    # Exactly three entries, one per stored key.
    assert len(listed) == 3
    listed_names = [entry.key for entry in listed]
    for key, _ in stored:
        assert key in listed_names


async def test_list_keys_with_limit(kvs: KeyValueStore) -> None:
    """Check that list_keys(limit=5) returns five entries when ten keys are stored."""
    for index in range(10):
        await kvs.set_value(f'key{index}', f'value{index}')

    limited = await kvs.list_keys(limit=5)

    assert len(limited) == 5


async def test_list_keys_with_exclusive_start_key(kvs: KeyValueStore) -> None:
    """Test that list_keys(exclusive_start_key=...) returns only the keys listed after the start key.

    Five keys are stored and listed once to learn the order in which the store reports them. The
    second key of that listing is then used as the exclusive start key, and the result is compared
    against the full listing.
    """
    # Add some items in a known order
    await kvs.set_value('key1', 'value1')
    await kvs.set_value('key2', 'value2')
    await kvs.set_value('key3', 'value3')
    await kvs.set_value('key4', 'value4')
    await kvs.set_value('key5', 'value5')

    # Get all keys first to determine their order
    all_keys = await kvs.list_keys()
    all_key_names = [k.key for k in all_keys]

    # Assert rather than branch on the key count: a conditional here would let the test pass
    # without checking anything if the listing came back short.
    assert len(all_key_names) == 5

    # Start from the second key
    start_key = all_key_names[1]
    keys = await kvs.list_keys(exclusive_start_key=start_key)
    returned_key_names = [k.key for k in keys]

    # We should get all keys after the start key
    expected_count = len(all_key_names) - all_key_names.index(start_key) - 1
    assert len(keys) == expected_count

    # The start key itself is excluded from the result
    assert start_key not in returned_key_names

    # First key should be the one after start_key
    first_returned_key = returned_key_names[0]
    assert first_returned_key != start_key
    assert all_key_names.index(first_returned_key) > all_key_names.index(start_key)


async def test_iterate_keys(kvs: KeyValueStore) -> None:
    """Check that iterate_keys() yields every key that was stored."""
    stored = (('key1', 'value1'), ('key2', 'value2'), ('key3', 'value3'))
    for key, value in stored:
        await kvs.set_value(key, value)

    # Drain the async iterator into a list.
    yielded = [entry async for entry in kvs.iterate_keys()]

    # Exactly three entries, one per stored key.
    assert len(yielded) == 3
    yielded_names = [entry.key for entry in yielded]
    for key, _ in stored:
        assert key in yielded_names


async def test_iterate_keys_with_limit(kvs: KeyValueStore) -> None:
    """Check that iterate_keys(limit=5) yields five entries when ten keys are stored."""
    for index in range(10):
        await kvs.set_value(f'key{index}', f'value{index}')

    # Drain the async iterator into a list.
    yielded = [entry async for entry in kvs.iterate_keys(limit=5)]

    assert len(yielded) == 5


async def test_drop(
    storage_client: StorageClient,
) -> None:
    """Check that a dropped store's data is gone when a store with the same name is opened again."""
    doomed = await KeyValueStore.open(name='drop-test', storage_client=storage_client)
    await doomed.set_value('test', 'data')

    await doomed.drop()

    # Opening the same name again must give a store that no longer has the old record.
    fresh = await KeyValueStore.open(name='drop-test', storage_client=storage_client)
    leftover = await fresh.get_value('test')
    assert leftover is None

    await fresh.drop()


async def test_reopen_default(
    storage_client: StorageClient,
) -> None:
    """Check that opening the default store twice gives the same object, with shared data."""
    first = await KeyValueStore.open(storage_client=storage_client)
    await first.set_value('test_key', 'test_value')

    second = await KeyValueStore.open(storage_client=storage_client)

    # Both handles describe the same storage.
    assert first.id == second.id
    assert first.name == second.name

    # The value written through the first handle is visible through both.
    seen_by_first = await first.get_value('test_key')
    seen_by_second = await second.get_value('test_key')
    assert seen_by_first == seen_by_second
    assert seen_by_second == 'test_value'

    # And they are literally one object.
    assert first is second


async def test_complex_data_types(kvs: KeyValueStore) -> None:
    """Check that nested dictionaries and mixed-type lists are read back equal to what was stored."""
    # Case 1: a dictionary nested three levels deep, with lists inside.
    deep_mapping = {
        'level1': {
            'level2': {
                'level3': 'deep value',
                'numbers': [1, 2, 3],
            },
        },
        'array': [{'a': 1}, {'b': 2}],
    }
    await kvs.set_value('nested', deep_mapping)
    fetched_mapping = await kvs.get_value('nested')
    assert fetched_mapping == deep_mapping

    # Case 2: a list mixing int, str, bool, None, and dict elements.
    mixed_list = [1, 'string', True, None, {'key': 'value'}]
    await kvs.set_value('list', mixed_list)
    fetched_list = await kvs.get_value('list')
    assert fetched_list == mixed_list


async def test_string_data(kvs: KeyValueStore) -> None:
    """Check that plain strings and strings holding JSON text are both read back as the same string."""
    # Case 1: an ordinary string.
    await kvs.set_value('string', 'simple string')
    fetched_plain = await kvs.get_value('string')
    assert fetched_plain == 'simple string'

    # Case 2: a string that happens to contain serialized JSON; it must come back as that string.
    serialized = json.dumps({'key': 'value'})
    await kvs.set_value('json_string', serialized)
    fetched_serialized = await kvs.get_value('json_string')
    assert fetched_serialized == serialized


async def test_key_with_special_characters(kvs: KeyValueStore) -> None:
    """Check set, get, list, and delete for a key containing spaces, slashes, and punctuation."""
    awkward_key = 'key with spaces/and/slashes!@#$%^&*()'
    payload = 'Special key value'

    await kvs.set_value(key=awkward_key, value=payload)

    # The value is readable under the exact same key.
    fetched = await kvs.get_value(key=awkward_key)
    assert fetched is not None
    assert fetched == payload

    # The key shows up verbatim in the key listing.
    listed = await kvs.list_keys()
    listed_names = [entry.key for entry in listed]
    assert awkward_key in listed_names

    # Deleting through the same key removes the value.
    await kvs.delete_value(key=awkward_key)
    after_delete = await kvs.get_value(key=awkward_key)
    assert after_delete is None


async def test_data_persistence_on_reopen() -> None:
    """Check that two handles from KeyValueStore.open() with no arguments see each other's writes."""
    first = await KeyValueStore.open()
    await first.set_value('key_123', 'value_123')

    written = await first.get_value('key_123')
    assert written == 'value_123'

    second = await KeyValueStore.open()

    # A write made through the first handle is visible through the second ...
    seen_by_second = await second.get_value('key_123')
    assert seen_by_second == 'value_123'
    assert await first.list_keys() == await second.list_keys()

    # ... and a write made through the second handle is visible through the first.
    await second.set_value('key_456', 'value_456')

    seen_by_first = await first.get_value('key_456')
    assert seen_by_first == 'value_456'


async def test_purge(
    storage_client: StorageClient,
) -> None:
    """Check that purge() removes every value while the store keeps its ID and name and stays usable."""
    store = await KeyValueStore.open(name='purge-test-kvs', storage_client=storage_client)

    # Fill the store with three records.
    await store.set_value('key1', 'value1')
    await store.set_value('key2', 'value2')
    await store.set_value('key3', {'complex': 'value', 'number': 42})

    listed = await store.list_keys()
    assert len(listed) == 3

    # Remember the ID so it can be compared after the purge.
    id_before_purge = store.id

    await store.purge()

    # The store's identity is untouched by the purge.
    assert store.id == id_before_purge
    assert store.name == 'purge-test-kvs'

    # No keys are listed any more ...
    listed = await store.list_keys()
    assert len(listed) == 0

    # ... and each former key now reads as None.
    assert await store.get_value('key1') is None
    assert await store.get_value('key2') is None
    assert await store.get_value('key3') is None

    # The purged store still accepts new writes.
    await store.set_value('new_key', 'new value after purge')

    fetched = await store.get_value('new_key')
    assert fetched == 'new value after purge'

    await store.drop()


async def test_record_exists_nonexistent(kvs: KeyValueStore) -> None:
    """Check that record_exists() is False for a key that was never written."""
    exists = await kvs.record_exists('nonexistent-key')
    assert exists is False


async def test_record_exists_after_set(kvs: KeyValueStore) -> None:
    """Check that record_exists() flips from False to True once a value is stored under the key."""
    key = 'exists-key'
    payload = {'data': 'test'}

    # Before the write the key is unknown.
    exists_before = await kvs.record_exists(key)
    assert exists_before is False

    await kvs.set_value(key, payload)

    # After the write the key is reported as present.
    exists_after = await kvs.record_exists(key)
    assert exists_after is True


async def test_record_exists_after_delete(kvs: KeyValueStore) -> None:
    """Check that record_exists() goes back to False once the stored value is deleted."""
    key = 'exists-then-delete-key'
    payload = 'will be deleted'

    # Precondition: the record is present.
    await kvs.set_value(key, payload)
    exists_before = await kvs.record_exists(key)
    assert exists_before is True

    await kvs.delete_value(key)

    # The deleted key is no longer reported as present.
    exists_after = await kvs.record_exists(key)
    assert exists_after is False


async def test_record_exists_with_none_value(kvs: KeyValueStore) -> None:
    """Check that a key whose stored value is None still counts as existing."""
    key = 'none-value-key'

    await kvs.set_value(key, None)

    # The record is present even though its value is None.
    exists_after_set = await kvs.record_exists(key)
    assert exists_after_set is True

    # get_value() alone cannot tell "stored None" from "missing"; record_exists() can.
    stored = await kvs.get_value(key)
    assert stored is None
    exists_again = await kvs.record_exists(key)
    assert exists_again is True
    missing_exists = await kvs.record_exists('truly-nonexistent')
    assert missing_exists is False


async def test_record_exists_different_content_types(kvs: KeyValueStore) -> None:
    """Check that record_exists() is True for records stored as JSON, plain text, and binary."""
    # (key, value, content type) triples, one per content type under test.
    samples = (
        ('json-key', {'data': 'json'}, 'application/json'),
        ('text-key', 'plain text', 'text/plain'),
        ('binary-key', b'binary data', 'application/octet-stream'),
    )

    for key, value, content_type in samples:
        await kvs.set_value(key, value, content_type=content_type)

        # The content type used for storing must not affect the existence check.
        exists = await kvs.record_exists(key)
        assert exists is True


async def test_record_exists_multiple_keys(kvs: KeyValueStore) -> None:
    """Test record_exists with multiple keys and batch operations."""
    keys_and_values = [
        ('key1', 'value1'),
        ('key2', {'nested': 'object'}),
        ('key3', [1, 2, 3]),
        ('key4', None),
    ]

    # Initially, none should exist
    for key, _ in keys_and_values:
        assert await kvs.record_exists(key) is False

    # Set all values
    for key, value in keys_and_values:
        await kvs.set_value(key, value)

    # All should exist now
    for key, _ in keys_and_values:
        assert await kvs.record_exists(key) is True

    # Test some non-existent keys
    assert await kvs.record_exists('nonexistent1') is False
    assert await kvs.record_exists('nonexistent2') is False


async def test_record_exists_after_purge(kvs: KeyValueStore) -> None:
    """Test that record_exists returns False after purging the store."""
    # Set some values
    await kvs.set_value('key1', 'value1')
    await kvs.set_value('key2', 'value2')

    # Verify they exist
    assert await kvs.record_exists('key1') is True
    assert await kvs.record_exists('key2') is True

    # Purge the store
    await kvs.purge()

    # Should no longer exist
    assert await kvs.record_exists('key1') is False
    assert await kvs.record_exists('key2') is False


async def test_open_with_alias(
    storage_client: StorageClient,
) -> None:
    """Verify that two key-value stores opened under different aliases are separate, unnamed stores."""
    # Open one store per alias.
    first = await KeyValueStore.open(alias='test_alias_1', storage_client=storage_client)
    second = await KeyValueStore.open(alias='test_alias_2', storage_client=storage_client)

    # Distinct IDs, and neither store carries a name.
    assert first.id != second.id
    assert first.name is None
    assert second.name is None

    # Write a different value under the same key in each store.
    await first.set_value('source', 'alias_1')
    await second.set_value('source', 'alias_2')

    # Each store must return only its own value.
    from_first = await first.get_value('source')
    from_second = await second.get_value('source')
    assert from_first == 'alias_1'
    assert from_second == 'alias_2'

    # Remove both stores.
    await first.drop()
    await second.drop()


async def test_alias_caching(
    storage_client: StorageClient,
) -> None:
    """Verify that opening the same alias twice yields the very same store object."""
    # First open under the alias.
    original = await KeyValueStore.open(alias='cache_test', storage_client=storage_client)

    # Second open with identical arguments.
    repeated = await KeyValueStore.open(alias='cache_test', storage_client=storage_client)

    # Both handles must be one object with one ID.
    assert original is repeated
    assert original.id == repeated.id

    # Remove the store.
    await original.drop()


async def test_alias_with_id_error(
    storage_client: StorageClient,
) -> None:
    """Verify that opening a store with both `id` and `alias` raises a ValueError."""
    # Pattern the raised error message is searched for.
    expected_message = (
        r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "id", "alias".'
    )

    with pytest.raises(ValueError, match=expected_message):
        await KeyValueStore.open(id='some-id', alias='some-alias', storage_client=storage_client)


async def test_alias_with_name_error(
    storage_client: StorageClient,
) -> None:
    """Verify that opening a store with both `name` and `alias` raises a ValueError."""
    # Pattern the raised error message is searched for.
    expected_message = (
        r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "name", "alias".'
    )

    with pytest.raises(ValueError, match=expected_message):
        await KeyValueStore.open(name='some-name', alias='some-alias', storage_client=storage_client)


async def test_alias_with_special_characters(
    storage_client: StorageClient,
) -> None:
    """Verify that aliases containing dashes, underscores, dots, digits and mixed case all work."""
    aliases = [
        'alias-with-dashes',
        'alias_with_underscores',
        'alias.with.dots',
        'alias123with456numbers',
        'CamelCaseAlias',
    ]

    opened = []
    for alias in aliases:
        store = await KeyValueStore.open(alias=alias, storage_client=storage_client)
        opened.append(store)

        # Tag the store with its own alias plus a shared marker value.
        await store.set_value('alias_used', alias)
        await store.set_value('test', 'special_chars')

    # Every store must hand back the alias it was tagged with.
    for alias, store in zip(aliases, opened):
        assert await store.get_value('alias_used') == alias
        assert await store.get_value('test') == 'special_chars'

    # Remove all stores.
    for store in opened:
        await store.drop()


async def test_alias_key_operations(
    storage_client: StorageClient,
) -> None:
    """Verify listing, existence checks, iteration and deletion of keys on an alias store."""
    store = await KeyValueStore.open(alias='key_ops_test', storage_client=storage_client)

    # Four records of different value shapes.
    records = {
        'key1': {'data': 'value1', 'number': 1},
        'key2': 'simple string value',
        'key3': [1, 2, 3, 4, 5],
        'key4': None,
    }
    for name, payload in records.items():
        await store.set_value(name, payload)

    # Listing must return exactly the written keys.
    listed = await store.list_keys()
    listed_names = [entry.key for entry in listed]
    assert len(listed) == 4
    for name in records:
        assert name in listed_names

    # Existence checks: all written keys are present, an unknown one is not.
    for name in records:
        assert await store.record_exists(name) is True
    assert await store.record_exists('nonexistent') is False

    # Async iteration must visit the same number of keys.
    iterated = []
    async for entry in store.iterate_keys():
        iterated.append(entry)
    assert len(iterated) == 4

    # Deleting one key removes both the record and its value.
    await store.delete_value('key2')
    assert await store.record_exists('key2') is False
    assert await store.get_value('key2') is None

    # The other three keys are untouched.
    after_delete = await store.list_keys()
    assert len(after_delete) == 3

    # Remove the store.
    await store.drop()


async def test_named_vs_alias_conflict_detection(
    storage_client: StorageClient,
) -> None:
    """Verify that reusing one string as both a storage name and an alias raises a ValueError."""
    # Scenario 1: the named store exists first, the alias open must be rejected.
    by_name = await KeyValueStore.open(name='conflict-test', storage_client=storage_client)
    assert by_name.name == 'conflict-test'

    alias_conflict = r'Cannot create alias storage "conflict-test".*already exists'
    with pytest.raises(ValueError, match=alias_conflict):
        await KeyValueStore.open(alias='conflict-test', storage_client=storage_client)

    await by_name.drop()

    # Scenario 2: the alias store exists first, the named open must be rejected.
    by_alias = await KeyValueStore.open(alias='conflict-test2', storage_client=storage_client)
    assert by_alias.name is None  # Stores opened by alias carry no name

    name_conflict = r'Cannot create named storage "conflict-test2".*already exists'
    with pytest.raises(ValueError, match=name_conflict):
        await KeyValueStore.open(name='conflict-test2', storage_client=storage_client)

    await by_alias.drop()

    # Scenario 3: unrelated name and alias coexist without error.
    free_named = await KeyValueStore.open(name='different-name', storage_client=storage_client)
    free_alias = await KeyValueStore.open(alias='different-alias', storage_client=storage_client)

    assert free_named.name == 'different-name'
    assert free_alias.name is None

    await free_named.drop()
    await free_alias.drop()


async def test_alias_parameter(
    storage_client: StorageClient,
) -> None:
    """Verify that a store opened via `alias` has an ID, no name, and supports set/get."""
    store = await KeyValueStore.open(alias='test_alias', storage_client=storage_client)

    # An alias store is identified by ID only.
    assert store.id is not None
    assert store.name is None

    # Round-trip a small mapping through the store.
    await store.set_value('test_key', {'type': 'alias', 'value': 1})
    fetched = await store.get_value('test_key')
    assert fetched['type'] == 'alias'

    await store.drop()


async def test_alias_vs_named_isolation(
    storage_client: StorageClient,
) -> None:
    """Verify that an alias store can be opened and used after a similarly identified named store is dropped.

    The named store uses 'test-identifier' while the alias uses 'test_identifier' (dash vs. underscore).
    """
    # Open the named store, check its name and write to it.
    by_name = await KeyValueStore.open(name='test-identifier', storage_client=storage_client)
    assert by_name.name == 'test-identifier'
    await by_name.set_value('type', 'named')

    # Drop the named store before the alias store is created.
    await by_name.drop()

    # Open the alias store; it is expected to open cleanly once the named store is gone.
    by_alias = await KeyValueStore.open(alias='test_identifier', storage_client=storage_client)

    # The alias store is unnamed and accepts its own data.
    assert by_alias.name is None
    await by_alias.set_value('type', 'alias')

    # The value read back is the one written through the alias store.
    stored_type = await by_alias.get_value('type')
    assert stored_type == 'alias'

    await by_alias.drop()


async def test_default_vs_alias_default_equivalence(
    storage_client: StorageClient,
) -> None:
    """Verify that the default store and the store opened with the default alias are one and the same."""
    # Open without any identifier.
    implicit = await KeyValueStore.open(storage_client=storage_client)

    # Open explicitly through the manager's default alias constant.
    explicit = await KeyValueStore.open(
        alias=StorageInstanceManager._DEFAULT_STORAGE_ALIAS,
        storage_client=storage_client,
    )

    # Same ID, and both unnamed.
    assert implicit.id == explicit.id
    assert implicit.name is None
    assert explicit.name is None

    # A value written through one handle is readable through the other.
    await implicit.set_value('source', 'default')
    seen_via_alias = await explicit.get_value('source')
    assert seen_via_alias == 'default'

    await implicit.drop()


async def test_multiple_alias_isolation(
    storage_client: StorageClient,
) -> None:
    """Verify that three distinct aliases produce three stores with distinct IDs and separate data."""
    opened = []

    for position in range(3):
        label = f'alias_{position}'
        store = await KeyValueStore.open(alias=label, storage_client=storage_client)
        await store.set_value('alias', label)
        await store.set_value('index', position)
        opened.append(store)

    # Compare every pair of stores once; no two may share an ID.
    for offset, left in enumerate(opened):
        for right in opened[offset + 1 :]:
            assert left.id != right.id

    # Each store must return the values written to it, then it is dropped.
    for position, store in enumerate(opened):
        stored_alias = await store.get_value('alias')
        stored_index = await store.get_value('index')
        assert stored_alias == f'alias_{position}'
        # Memory storage keeps the int as-is, filesystem storage hands it back as a string
        assert stored_index in (position, str(position))
        await store.drop()


async def test_purge_on_start_enabled(storage_client: StorageClient) -> None:
    """With purge_on_start=True, reopened unnamed stores are empty while the named store keeps its data."""

    # Not applicable to memory storage (see the skip message below).
    if isinstance(storage_client, MemoryStorageClient):
        pytest.skip('Memory storage does not persist data between client instances.')

    config = Configuration(purge_on_start=True)

    # Open one store of each kind - default, alias and named - under the purging configuration.
    default_store = await KeyValueStore.open(storage_client=storage_client, configuration=config)
    alias_store = await KeyValueStore.open(
        alias='purge-test-alias', storage_client=storage_client, configuration=config
    )
    named_store = await KeyValueStore.open(
        name='purge-test-named', storage_client=storage_client, configuration=config
    )

    # Write a marker into each store.
    await default_store.set_value(key='data', value='should_be_purged')
    await alias_store.set_value(key='data', value='should_be_purged')
    await named_store.set_value(key='data', value='should_persist')

    # Read the markers back to confirm the writes landed.
    default_before = await default_store.get_value(key='data')
    alias_before = await alias_store.get_value(key='data')
    named_before = await named_store.get_value(key='data')

    assert default_before == 'should_be_purged'
    assert alias_before == 'should_be_purged'
    assert named_before == 'should_persist'

    # Only the named store may report a name in its metadata.
    default_meta = await default_store.get_metadata()
    alias_meta = await alias_store.get_metadata()
    named_meta = await named_store.get_metadata()

    assert default_meta.name is None
    assert alias_meta.name is None
    assert named_meta.name == 'purge-test-named'

    # Drop the cached instances so the next open calls behave like a fresh start.
    service_locator.storage_instance_manager.clear_cache()

    # Open the same three stores again.
    default_reopened = await KeyValueStore.open(storage_client=storage_client, configuration=config)
    alias_reopened = await KeyValueStore.open(
        alias='purge-test-alias', storage_client=storage_client, configuration=config
    )
    named_reopened = await KeyValueStore.open(
        name='purge-test-named', storage_client=storage_client, configuration=config
    )

    # Read the markers from the reopened stores.
    default_after = await default_reopened.get_value(key='data')
    alias_after = await alias_reopened.get_value(key='data')
    named_after = await named_reopened.get_value(key='data')

    # The unnamed stores (default and alias) have lost their marker.
    assert default_after is None
    assert alias_after is None

    # The named store still holds its marker.
    assert named_after == 'should_persist'

    # Remove all three stores.
    await named_reopened.drop()
    await alias_reopened.drop()
    await default_reopened.drop()


async def test_purge_on_start_disabled(storage_client: StorageClient) -> None:
    """With purge_on_start=False, default, alias and named stores all keep their data when reopened."""

    # Not applicable to memory storage (see the skip message below).
    if isinstance(storage_client, MemoryStorageClient):
        pytest.skip('Memory storage does not persist data between client instances.')

    config = Configuration(purge_on_start=False)

    # Open one store of each kind - default, alias and named - with purging turned off.
    default_store = await KeyValueStore.open(storage_client=storage_client, configuration=config)
    alias_store = await KeyValueStore.open(
        alias='purge-test-alias', storage_client=storage_client, configuration=config
    )
    named_store = await KeyValueStore.open(
        name='purge-test-named', storage_client=storage_client, configuration=config
    )

    # Write the same marker into each store.
    await default_store.set_value('data', 'should_persist')
    await alias_store.set_value('data', 'should_persist')
    await named_store.set_value('data', 'should_persist')

    # Read the markers back to confirm the writes landed.
    default_before = await default_store.get_value('data')
    alias_before = await alias_store.get_value('data')
    named_before = await named_store.get_value('data')

    assert default_before == 'should_persist'
    assert alias_before == 'should_persist'
    assert named_before == 'should_persist'

    # Drop the cached instances so the next open calls behave like a fresh start.
    service_locator.storage_instance_manager.clear_cache()

    # Open the same three stores again.
    default_reopened = await KeyValueStore.open(storage_client=storage_client, configuration=config)
    alias_reopened = await KeyValueStore.open(
        alias='purge-test-alias', storage_client=storage_client, configuration=config
    )
    named_reopened = await KeyValueStore.open(
        name='purge-test-named', storage_client=storage_client, configuration=config
    )

    # Read the markers from the reopened stores.
    default_after = await default_reopened.get_value('data')
    alias_after = await alias_reopened.get_value('data')
    named_after = await named_reopened.get_value('data')

    # Every store, named or not, still holds its marker.
    assert default_after == 'should_persist'
    assert alias_after == 'should_persist'
    assert named_after == 'should_persist'

    # Remove all three stores.
    await named_reopened.drop()
    await alias_reopened.drop()
    await default_reopened.drop()


async def test_name_default_not_allowed(storage_client: StorageClient) -> None:
    """Verify that the reserved default alias is rejected when passed as a storage name."""
    reserved = StorageInstanceManager._DEFAULT_STORAGE_ALIAS
    expected_message = f'Storage name cannot be "{reserved}" as it is reserved for default alias.'

    with pytest.raises(ValueError, match=expected_message):
        await KeyValueStore.open(name=reserved, storage_client=storage_client)


@pytest.mark.parametrize(
    ('name', 'is_valid'),
    [
        pytest.param('F', True, id='single-char'),
        pytest.param('7', True, id='single-digit'),
        pytest.param('FtghdfseySds', True, id='mixed-case'),
        pytest.param('125673450', True, id='all-digits'),
        pytest.param('Ft2134Sfe0O1hf', True, id='mixed-alphanumeric'),
        pytest.param('name-with-dashes', True, id='dashes'),
        pytest.param('1-value', True, id='number start'),
        pytest.param('value-1', True, id='number end'),
        pytest.param('test-1-value', True, id='number middle'),
        pytest.param('test-------value', True, id='multiple-dashes'),
        pytest.param('test-VALUES-test', True, id='multiple-cases'),
        pytest.param('name_with_underscores', False, id='underscores'),
        pytest.param('name with spaces', False, id='spaces'),
        pytest.param('-test', False, id='dashes start'),
        pytest.param('test-', False, id='dashes end'),
    ],
)
async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None:
    """Verify that valid storage names open a store and invalid ones raise a ValueError."""
    if not is_valid:
        # Rejected names must produce an error message that quotes the offending name.
        with pytest.raises(ValueError, match=rf'Invalid storage name "{name}".*'):
            await KeyValueStore.open(name=name, storage_client=storage_client)
        return

    # Accepted names open without error and are reported back unchanged.
    store = await KeyValueStore.open(name=name, storage_client=storage_client)
    assert store.name == name
    await store.drop()


@pytest.mark.parametrize(
    'tested_storage_client_class',
    [
        pytest.param(MemoryStorageClient, id='tested=MemoryStorageClient'),
        pytest.param(FileSystemStorageClient, id='tested=FileSystemStorageClient'),
        pytest.param(SqlStorageClient, id='tested=SqlStorageClient'),
    ],
)
@pytest.mark.parametrize(
    'global_storage_client_class',
    [
        pytest.param(MemoryStorageClient, id='global=MemoryStorageClient'),
        pytest.param(FileSystemStorageClient, id='global=FileSystemStorageClient'),
        pytest.param(SqlStorageClient, id='global=SqlStorageClient'),
    ],
)
async def test_get_auto_saved_value_various_global_clients(
    tmp_path: Path, tested_storage_client_class: type[StorageClient], global_storage_client_class: type[StorageClient]
) -> None:
    """Verify auto-saved values persist for every client, whichever client the service locator holds."""
    # Instantiate the client under test first, then the one registered globally.
    client_under_test = tested_storage_client_class()
    registered_client = global_storage_client_class()

    # Point the global configuration at the temporary directory and register the global client.
    global_config = Configuration(
        storage_dir=str(tmp_path),
        purge_on_start=True,
    )
    service_locator.set_configuration(global_config)
    service_locator.set_storage_client(registered_client)

    # The store itself is opened with the explicitly supplied client.
    store = await KeyValueStore.open(storage_client=client_under_test)
    record_key = 'test_key'
    additions = {'key': 'some_value'}

    # A fresh auto-saved value starts out as an empty mapping.
    auto_saved = await store.get_auto_saved_value(record_key)
    assert auto_saved == {}

    # Mutate it in place and persist the auto-saved values.
    auto_saved.update(additions)
    await store.persist_autosaved_values()

    # The stored record must now equal the mutated mapping.
    persisted = await store.get_value(record_key)
    assert persisted == auto_saved


async def test_record_with_noascii_chars(kvs: KeyValueStore) -> None:
    """Test storing and retrieving a record with non-ASCII characters."""
    init_value = {
        'record_1': 'Supermaxi El Jardín',
        'record_2': 'záznam dva',
        'record_3': '記録三',
    }
    key = 'non_ascii_key'

    # Save the record in the key-value store
    await kvs.set_value(key, init_value)

    # Get the record and verify
    value = await kvs.get_value(key)
    assert value is not None
    assert value == init_value


================================================
FILE: tests/unit/storages/test_request_manager_tandem.py
================================================
from __future__ import annotations

from dataclasses import dataclass
from unittest.mock import create_autospec

import pytest

from crawlee import Request
from crawlee.request_loaders import RequestLoader, RequestManagerTandem
from crawlee.storages import RequestQueue


@dataclass
class TestInput:
    """Parameter bundle describing one scenario for `test_basic_functionality`."""

    # The class name starts with "Test"; this flag tells pytest not to collect it as a test class.
    __test__ = False

    # Values the mocked request loader hands out one by one; the parametrized cases include `None` entries.
    request_loader_items: list[str | Request | None]
    # Requests added to the request queue before the tandem is exercised.
    request_manager_items: list[str | Request]
    # Requests added through the tandem after every fetched request.
    discovered_items: list[Request]
    # URLs expected to have been processed once the tandem reports it is finished.
    expected_result: set[str]


@pytest.mark.parametrize(
    argnames='test_input',
    argvalues=[
        pytest.param(
            TestInput(
                request_loader_items=['https://a.placeholder.com', 'https://b.placeholder.com'],
                request_manager_items=[],
                discovered_items=[Request.from_url('https://c.placeholder.com')],
                expected_result={
                    'https://a.placeholder.com',
                    'https://b.placeholder.com',
                    'https://c.placeholder.com',
                },
            ),
            id='basic_usage',
        ),
        pytest.param(
            TestInput(
                request_loader_items=[
                    Request.from_url('https://a.placeholder.com'),
                    None,
                    Request.from_url('https://c.placeholder.com'),
                ],
                request_manager_items=['https://b.placeholder.com', 'http://d.com'],
                discovered_items=[],
                expected_result={
                    'https://a.placeholder.com',
                    'https://b.placeholder.com',
                    'https://c.placeholder.com',
                    'http://d.com',
                },
            ),
            id='wait_for_read_only_source',
        ),
    ],
)
async def test_basic_functionality(test_input: TestInput) -> None:
    """Drain a loader/queue tandem and check that exactly the expected URLs were processed."""
    queue = await RequestQueue.open()

    # Seed the queue only when the scenario provides manager items.
    if test_input.request_manager_items:
        await queue.add_requests(test_input.request_manager_items)

    # The mocked loader pops scenario items front-to-back and is finished once none remain.
    loader = create_autospec(RequestLoader, instance=True, spec_set=True)
    loader.fetch_next_request.side_effect = lambda: test_input.request_loader_items.pop(0)
    loader.is_finished.side_effect = lambda: not test_input.request_loader_items

    tandem = RequestManagerTandem(loader, queue)
    seen_urls: set[str] = set()

    while True:
        if await tandem.is_finished():
            break

        current = await tandem.fetch_next_request()
        assert current is not None
        seen_urls.add(current.url)

        # Feed the scenario's discovered requests back in after every fetch.
        for discovered in test_input.discovered_items:
            await tandem.add_request(discovered)

        await tandem.mark_request_as_handled(current)

    assert seen_urls == test_input.expected_result


================================================
FILE: tests/unit/storages/test_request_queue.py
================================================
from __future__ import annotations

import asyncio
from datetime import timedelta
from typing import TYPE_CHECKING

import pytest

from crawlee import Request, service_locator
from crawlee.configuration import Configuration
from crawlee.storage_clients import MemoryStorageClient, StorageClient
from crawlee.storages import RequestQueue
from crawlee.storages._storage_instance_manager import StorageInstanceManager

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from crawlee.storage_clients import StorageClient


@pytest.fixture
async def rq(
    storage_client: StorageClient,
) -> AsyncGenerator[RequestQueue, None]:
    """Yield a request queue opened with the given storage client and drop it after the test."""
    queue = await RequestQueue.open(storage_client=storage_client)

    yield queue

    # Teardown: remove the queue once the test body has finished.
    await queue.drop()


async def test_open_creates_new_rq(
    storage_client: StorageClient,
) -> None:
    """Verify that opening a new named request queue yields an ID, the name, and zeroed counters."""
    queue = await RequestQueue.open(name='new-request-queue', storage_client=storage_client)

    # Identity of the freshly opened queue.
    assert queue.id is not None
    assert queue.name == 'new-request-queue'

    # All request counters start at zero.
    stats = await queue.get_metadata()
    assert stats.pending_request_count == 0
    assert stats.handled_request_count == 0
    assert stats.total_request_count == 0

    await queue.drop()


async def test_open_existing_rq(
    rq: RequestQueue,
    storage_client: StorageClient,
) -> None:
    """Verify that opening a queue again with the fixture queue's name returns the same object."""
    # Second open, using whatever name the fixture queue reports.
    again = await RequestQueue.open(name=rq.name, storage_client=storage_client)

    # ID and name agree between the two handles.
    assert rq.id == again.id
    assert rq.name == again.name

    # Both handles are one object.
    assert again is rq


async def test_open_with_id_and_name(
    storage_client: StorageClient,
) -> None:
    """Verify that opening a request queue with both `id` and `name` raises a ValueError."""
    # Pattern the raised error message is searched for.
    expected_message = (
        r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "id", "name".'
    )

    with pytest.raises(ValueError, match=expected_message):
        await RequestQueue.open(id='some-id', name='some-name', storage_client=storage_client)


async def test_open_by_id(
    storage_client: StorageClient,
) -> None:
    """Verify that a request queue created by name can be reopened through its ID."""
    marker_url = 'https://example.com/open-by-id-test'

    # Create the queue by name and leave one request in it as a marker.
    created = await RequestQueue.open(name='rq-by-id-test', storage_client=storage_client)
    await created.add_request(marker_url)

    # Reopen using only the ID.
    reopened = await RequestQueue.open(id=created.id, storage_client=storage_client)

    # Same queue: matching ID and the original name.
    assert reopened.id == created.id
    assert reopened.name == 'rq-by-id-test'

    # The marker request is fetched from the reopened handle.
    fetched = await reopened.fetch_next_request()
    assert fetched is not None
    assert fetched.url == marker_url

    # Remove the queue.
    await reopened.drop()


async def test_add_request_string_url(rq: RequestQueue) -> None:
    """Test adding a request with a string URL."""
    # Add a request with a string URL
    url = 'https://example.com'
    result = await rq.add_request(url)

    # Verify request was added
    assert result is not None
    assert result.unique_key is not None
    assert result.was_already_present is False
    assert result.was_already_handled is False

    # Verify the queue stats were updated
    metadata = await rq.get_metadata()
    assert metadata.total_request_count == 1
    assert metadata.pending_request_count == 1


async def test_add_request_object(rq: RequestQueue) -> None:
    """Verify that a request given as a `Request` object is added and counted."""
    # Build the request, including some user data, and enqueue it.
    prepared = Request.from_url(url='https://example.com', user_data={'key': 'value'})
    outcome = await rq.add_request(prepared)

    # The result describes a brand-new, unhandled request.
    assert outcome is not None
    assert outcome.unique_key is not None
    assert outcome.was_already_present is False
    assert outcome.was_already_handled is False

    # One request in total, and it is still pending.
    stats = await rq.get_metadata()
    assert stats.total_request_count == 1
    assert stats.pending_request_count == 1


async def test_add_duplicate_request(rq: RequestQueue) -> None:
    """Test adding a duplicate request to the queue."""
    # Add a request
    url = 'https://example.com'
    first_result = await rq.add_request(url)

    assert first_result is not None

    # Add the same request again
    second_result = await rq.add_request(url)

    # Verify the second request was detected as duplicate
    assert second_result is not None
    assert second_result.was_already_present is True
    assert second_result.unique_key == first_result.unique_key

    # Verify the queue stats weren't incremented twice
    metadata = await rq.get_metadata()
    assert metadata.total_request_count == 1
    assert metadata.pending_request_count == 1


async def test_add_requests_batch(rq: RequestQueue) -> None:
    """Test adding multiple requests in a batch."""
    # Create a batch of requests
    urls = [
        'https://example.com/page1',
        'https://example.com/page2',
        'https://example.com/page3',
    ]

    # Add the requests
    await rq.add_requests(urls)

    # Wait for all background tasks to complete
    await asyncio.sleep(0.1)

    # Verify the queue stats
    metadata = await rq.get_metadata()
    assert metadata.total_request_count == 3
    assert metadata.pending_request_count == 3


async def test_add_requests_batch_with_forefront(rq: RequestQueue) -> None:
    """Test adding multiple requests in a batch with forefront option."""
    # Add some initial requests
    await rq.add_request('https://example.com/page1')
    await rq.add_request('https://example.com/page2')

    # Add a batch of priority requests at the forefront

    await rq.add_requests(
        [
            'https://example.com/priority1',
            'https://example.com/priority2',
            'https://example.com/priority3',
        ],
        forefront=True,
    )

    # Wait for all background tasks to complete
    await asyncio.sleep(0.1)

    # Fetch requests - they should come out in priority order first
    next_request1 = await rq.fetch_next_request()
    assert next_request1 is not None
    assert next_request1.url.startswith('https://example.com/priority')

    next_request2 = await rq.fetch_next_request()
    assert next_request2 is not None
    assert next_request2.url.startswith('https://example.com/priority')

    next_request3 = await rq.fetch_next_request()
    assert next_request3 is not None
    assert next_request3.url.startswith('https://example.com/priority')

    # Now we should get the original requests
    next_request4 = await rq.fetch_next_request()
    assert next_request4 is not None
    assert next_request4.url == 'https://example.com/page1'

    next_request5 = await rq.fetch_next_request()
    assert next_request5 is not None
    assert next_request5.url == 'https://example.com/page2'

    # Queue should be empty now
    next_request6 = await rq.fetch_next_request()
    assert next_request6 is None


async def test_add_requests_with_forefront(rq: RequestQueue) -> None:
    """Test adding requests to the front of the queue."""
    # Add some initial requests
    await rq.add_request('https://example.com/page1')
    await rq.add_request('https://example.com/page2')

    # Add a priority request at the forefront
    await rq.add_request('https://example.com/priority', forefront=True)

    # Fetch the next request - should be the priority one
    next_request = await rq.fetch_next_request()
    assert next_request is not None
    assert next_request.url == 'https://example.com/priority'


async def test_add_requests_mixed_forefront(rq: RequestQueue) -> None:
    """Check the fetch order when regular and forefront additions are interleaved."""
    # Two regular requests go in first.
    for normal_url in ('https://example.com/normal1', 'https://example.com/normal2'):
        await rq.add_request(normal_url)

    # Then a batch that is pushed to the forefront.
    await rq.add_requests(
        ['https://example.com/priority1', 'https://example.com/priority2'],
        forefront=True,
    )

    # One more regular request, followed by a single forefront request.
    await rq.add_request('https://example.com/normal3')
    await rq.add_request('https://example.com/priority3', forefront=True)

    # Give background tasks a moment to finish.
    await asyncio.sleep(0.1)

    # Drain the queue, marking every fetched request as handled.
    drained = []
    while (current := await rq.fetch_next_request()) is not None:
        drained.append(current)
        await rq.mark_request_as_handled(current)

    assert len(drained) == 6
    drained_urls = [item.url for item in drained]

    # Expected order: priority3 (latest forefront), then the forefront batch, then normal1..normal3.
    assert drained_urls[0] == 'https://example.com/priority3'

    # Positions 1 and 2 hold the forefront batch; the order inside the batch is not asserted.
    assert set(drained_urls[1:3]) == {
        'https://example.com/priority1',
        'https://example.com/priority2',
    }

    # Regular requests come last, in insertion order.
    assert drained_urls[3:] == [
        'https://example.com/normal1',
        'https://example.com/normal2',
        'https://example.com/normal3',
    ]


async def test_fetch_next_request_and_mark_handled(rq: RequestQueue) -> None:
    """Fetch two requests in insertion order, mark both as handled and verify the queue counters."""
    await rq.add_request('https://example.com/page1')
    await rq.add_request('https://example.com/page2')

    # The first fetch returns the first added request.
    first = await rq.fetch_next_request()
    assert first is not None
    assert first.url == 'https://example.com/page1'

    # Marking it as handled yields a result flagged as already handled.
    handled_result = await rq.mark_request_as_handled(first)
    assert handled_result is not None
    assert handled_result.was_already_handled is True

    # The second fetch returns the remaining request.
    second = await rq.fetch_next_request()
    assert second is not None
    assert second.url == 'https://example.com/page2'

    await rq.mark_request_as_handled(second)

    # Counters: two requests in total, both handled, none pending.
    queue_metadata = await rq.get_metadata()
    assert queue_metadata.total_request_count == 2
    assert queue_metadata.handled_request_count == 2
    assert queue_metadata.pending_request_count == 0

    # Nothing is left to fetch.
    leftover: Request | None = await rq.fetch_next_request()
    assert leftover is None


async def test_get_request_by_id(rq: RequestQueue) -> None:
    """Look a request up through the unique key reported by `add_request`."""
    target_url = 'https://example.com'

    add_result = await rq.add_request(target_url)
    assert add_result is not None

    # The unique key from the add result is the lookup handle.
    lookup_key = add_result.unique_key
    found = await rq.get_request(lookup_key)

    assert found is not None
    assert found.unique_key == lookup_key
    assert found.url == target_url


async def test_handled_request_records_persistence(rq: RequestQueue) -> None:
    """A request stays retrievable through `get_request` after it has been marked as handled."""
    original = Request.from_url('https://example.com/1')
    await rq.add_request(original)

    # Fetch and handle the request.
    in_progress = await rq.fetch_next_request()
    assert isinstance(in_progress, Request)
    await rq.mark_request_as_handled(in_progress)

    # The handled request can still be looked up by its unique key.
    stored = await rq.get_request(original.unique_key)
    assert isinstance(stored, Request)
    assert stored.unique_key == original.unique_key


async def test_get_non_existent_request(rq: RequestQueue) -> None:
    """`get_request` returns `None` for a key that was never added to the queue."""
    lookup_result = await rq.get_request('non-existent-id')
    assert lookup_result is None


async def test_reclaim_request(rq: RequestQueue) -> None:
    """A fetched request that is reclaimed can be fetched once more."""
    target_url = 'https://example.com'
    await rq.add_request(target_url)

    in_progress = await rq.fetch_next_request()
    assert in_progress is not None

    # Hand the request back to the queue.
    reclaim_result = await rq.reclaim_request(in_progress)
    assert reclaim_result is not None
    assert reclaim_result.was_already_handled is False

    # The very same request is served again.
    refetched = await rq.fetch_next_request()
    assert refetched is not None
    assert refetched.unique_key == in_progress.unique_key
    assert refetched.url == target_url


async def test_reclaim_request_with_forefront(rq: RequestQueue) -> None:
    """A request reclaimed with `forefront=True` is served before the other pending request."""
    for url in ('https://example.com/first', 'https://example.com/second'):
        await rq.add_request(url)

    # Take the first request out of the queue.
    taken = await rq.fetch_next_request()
    assert taken is not None
    assert taken.url == 'https://example.com/first'

    # Return it to the head of the queue.
    await rq.reclaim_request(taken, forefront=True)

    # It must come back ahead of the still-pending second request.
    served_next = await rq.fetch_next_request()
    assert served_next is not None
    assert served_next.url == 'https://example.com/first'


async def test_is_empty(rq: RequestQueue) -> None:
    """`is_empty` is true for a fresh queue, false with a pending request and true again once it is handled."""
    # A fresh queue reports empty.
    empty_at_start = await rq.is_empty()
    assert empty_at_start is True

    # One pending request makes it non-empty.
    await rq.add_request('https://example.com')
    empty_with_pending = await rq.is_empty()
    assert empty_with_pending is False

    # Fetch and handle the only request.
    only_request = await rq.fetch_next_request()
    assert only_request is not None
    await rq.mark_request_as_handled(only_request)

    # With nothing pending the queue is empty again.
    empty_at_end = await rq.is_empty()
    assert empty_at_end is True


@pytest.mark.parametrize(
    'wait_for_all',
    [
        pytest.param(True, id='wait for all'),
        pytest.param(False, id='do not wait for all'),
    ],
)
async def test_add_requests_wait_for_all(
    rq: RequestQueue,
    *,
    wait_for_all: bool,
) -> None:
    """Add 15 requests in batches of 5, with and without `wait_for_all_requests_to_be_added`.

    Args:
        rq: Request queue under test.
        wait_for_all: Value forwarded as `wait_for_all_requests_to_be_added`.
    """
    expected_total = 15
    urls = [f'https://example.com/{index}' for index in range(expected_total)]

    # Add the requests in batches; whether the call waits for all of them is parametrized.
    await rq.add_requests(
        urls,
        batch_size=5,
        wait_for_all_requests_to_be_added=wait_for_all,
        wait_time_between_batches=timedelta(milliseconds=50),
    )

    if not wait_for_all:
        # Right after the call the total may still be below the target due to background processing.
        assert await rq.get_total_count() <= expected_total

        # Poll until the background tasks have added everything.
        while await rq.get_total_count() < expected_total:  # noqa: ASYNC110
            await asyncio.sleep(0.1)

    # In both modes every request must have been added by now.
    assert await rq.get_total_count() == expected_total


async def test_is_finished(rq: RequestQueue) -> None:
    """`is_finished` is true only when nothing is pending and every request has been handled."""
    # A fresh queue is finished.
    finished_at_start = await rq.is_finished()
    assert finished_at_start is True

    # A pending request makes it unfinished.
    await rq.add_request('https://example.com')
    finished_with_pending = await rq.is_finished()
    assert finished_with_pending is False

    # Schedule two more requests without waiting for them to be added.
    background_urls = ['https://example.com/1', 'https://example.com/2']
    await rq.add_requests(background_urls, wait_for_all_requests_to_be_added=False)

    # Still unfinished while the background addition is in flight.
    finished_during_background = await rq.is_finished()
    assert finished_during_background is False

    # Let the background tasks complete.
    await asyncio.sleep(0.2)

    # Drain the queue, handling every request.
    while (pending := await rq.fetch_next_request()) is not None:
        await rq.mark_request_as_handled(pending)

    # Everything is handled, so the queue is finished again.
    finished_at_end = await rq.is_finished()
    assert finished_at_end is True


async def test_mark_non_existent_request_as_handled(rq: RequestQueue) -> None:
    """Marking a request that was never added to the queue as handled returns `None`."""
    # Build a request locally without adding it to the queue.
    unknown_request = Request.from_url(url='https://example.com', id='non-existent-id')

    outcome = await rq.mark_request_as_handled(unknown_request)
    assert outcome is None


async def test_reclaim_non_existent_request(rq: RequestQueue) -> None:
    """Reclaiming a request that was never added to the queue returns `None`."""
    # Build a request locally without adding it to the queue.
    unknown_request = Request.from_url(url='https://example.com', id='non-existent-id')

    outcome = await rq.reclaim_request(unknown_request)
    assert outcome is None


async def test_drop(
    storage_client: StorageClient,
) -> None:
    """After `drop`, reopening a queue under the same name yields an empty queue."""
    queue_name = 'drop-test'

    # Create the queue, put one request in and drop it.
    original_queue = await RequestQueue.open(name=queue_name, storage_client=storage_client)
    await original_queue.add_request('https://example.com')
    await original_queue.drop()

    # Reopen under the same name; nothing from the dropped queue may remain.
    reopened_queue = await RequestQueue.open(name=queue_name, storage_client=storage_client)

    assert await reopened_queue.is_empty() is True
    reopened_metadata = await reopened_queue.get_metadata()
    assert reopened_metadata.total_request_count == 0
    assert reopened_metadata.pending_request_count == 0

    await reopened_queue.drop()


async def test_reopen_default(
    storage_client: StorageClient,
) -> None:
    """Opening the default queue twice yields the same queue with shared contents."""
    # Start from a clean storage instance cache.
    service_locator.storage_instance_manager.clear_cache()

    first_handle = await RequestQueue.open(storage_client=storage_client)

    # The queue may hold leftovers from an earlier run: purge it, or fall back to drop-and-reopen.
    try:
        await first_handle.purge()
    except Exception:
        await first_handle.drop()
        first_handle = await RequestQueue.open(storage_client=storage_client)

    # Nothing is pending at the start.
    initial_metadata = await first_handle.get_metadata()
    assert initial_metadata.pending_request_count == 0

    # Add one request and confirm it is counted as pending.
    await first_handle.add_request('https://example.com/')
    metadata_after_add = await first_handle.get_metadata()
    assert metadata_after_add.pending_request_count == 1

    # Open the default queue a second time.
    second_handle = await RequestQueue.open(storage_client=storage_client)

    # Both handles refer to the same queue.
    assert first_handle.id == second_handle.id
    assert first_handle.name == second_handle.name

    # And both report identical counters.
    first_metadata = await first_handle.get_metadata()
    second_metadata = await second_handle.get_metadata()
    for counter in ('total_request_count', 'pending_request_count', 'handled_request_count'):
        assert getattr(first_metadata, counter) == getattr(second_metadata, counter)

    # The request added through the first handle is reachable through the second one.
    shared_request = await second_handle.fetch_next_request()
    assert shared_request is not None
    assert shared_request.url == 'https://example.com/'

    # Clean up after the test.
    await first_handle.drop()


async def test_purge(
    storage_client: StorageClient,
) -> None:
    """Purging empties a named queue while its ID and name stay the same."""
    queue_name = 'purge-test-queue'
    seed_urls = [
        'https://example.com/page1',
        'https://example.com/page2',
        'https://example.com/page3',
    ]

    queue = await RequestQueue.open(name=queue_name, storage_client=storage_client)
    await queue.add_requests(seed_urls)

    # All three requests are registered and pending.
    metadata_before = await queue.get_metadata()
    assert metadata_before.total_request_count == 3
    assert metadata_before.pending_request_count == 3
    assert metadata_before.handled_request_count == 0

    # Remember the ID so it can be compared after the purge.
    id_before_purge = queue.id

    await queue.purge()

    # Identity is preserved.
    assert queue.id == id_before_purge
    assert queue.name == queue_name

    # All counters are reset and the queue reports empty.
    metadata_after = await queue.get_metadata()
    assert metadata_after.total_request_count == 0
    assert metadata_after.pending_request_count == 0
    assert metadata_after.handled_request_count == 0
    assert await queue.is_empty() is True

    # The purged queue still accepts and serves new requests.
    fresh_url = 'https://example.com/new-after-purge'
    await queue.add_request(fresh_url)

    served = await queue.fetch_next_request()
    assert served is not None
    assert served.url == fresh_url

    # Clean up.
    await queue.drop()


async def test_open_with_alias(
    storage_client: StorageClient,
) -> None:
    """Queues opened under different aliases are distinct, unnamed and hold separate data."""
    first_queue = await RequestQueue.open(alias='test_alias_1', storage_client=storage_client)
    second_queue = await RequestQueue.open(alias='test_alias_2', storage_client=storage_client)

    # Distinct IDs, and neither queue carries a name.
    assert first_queue.id != second_queue.id
    assert first_queue.name is None
    assert second_queue.name is None

    # Fill the queues with different requests.
    for url in ('https://example.com/1', 'https://example.com/2'):
        await first_queue.add_request(url)
    await second_queue.add_request('https://example.com/3')

    # Each queue serves only its own data.
    from_first = await first_queue.fetch_next_request()
    from_second = await second_queue.fetch_next_request()

    assert from_first is not None
    assert from_second is not None
    assert from_first.url == 'https://example.com/1'
    assert from_second.url == 'https://example.com/3'

    # Clean up.
    await first_queue.drop()
    await second_queue.drop()


async def test_alias_caching(
    storage_client: StorageClient,
) -> None:
    """Opening a queue twice with the same alias returns the identical (cached) instance."""
    shared_alias = 'cache_test'

    first_open = await RequestQueue.open(alias=shared_alias, storage_client=storage_client)
    second_open = await RequestQueue.open(alias=shared_alias, storage_client=storage_client)

    # Both calls hand out the same object, hence the same ID.
    assert first_open is second_open
    assert first_open.id == second_open.id

    # Clean up.
    await first_open.drop()


async def test_alias_with_id_error(
    storage_client: StorageClient,
) -> None:
    """Passing `id` together with `alias` to `RequestQueue.open` raises `ValueError`."""
    expected_message = (
        r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "id", "alias".'
    )

    with pytest.raises(ValueError, match=expected_message):
        await RequestQueue.open(
            id='some-id',
            alias='some-alias',
            storage_client=storage_client,
        )


async def test_alias_with_name_error(
    storage_client: StorageClient,
) -> None:
    """Passing `name` together with `alias` to `RequestQueue.open` raises `ValueError`."""
    expected_message = (
        r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "name", "alias".'
    )

    with pytest.raises(ValueError, match=expected_message):
        await RequestQueue.open(
            name='some-name',
            alias='some-alias',
            storage_client=storage_client,
        )


async def test_alias_with_special_characters(
    storage_client: StorageClient,
) -> None:
    """Aliases containing dashes, underscores, dots, digits or mixed case each get a working queue."""
    special_aliases = [
        'alias-with-dashes',
        'alias_with_underscores',
        'alias.with.dots',
        'alias123with456numbers',
        'CamelCaseAlias',
    ]

    # Open one queue per alias and add a request whose URL embeds the alias.
    opened_queues = []
    for current_alias in special_aliases:
        queue = await RequestQueue.open(alias=current_alias, storage_client=storage_client)
        opened_queues.append(queue)
        await queue.add_request(f'https://example.com/{current_alias}')

    # Every queue serves the request that carries its own alias.
    for current_alias, queue in zip(special_aliases, opened_queues):
        served = await queue.fetch_next_request()
        assert served is not None
        assert f'/{current_alias}' in served.url

    # Clean up.
    for queue in opened_queues:
        await queue.drop()


async def test_alias_request_operations(
    storage_client: StorageClient,
) -> None:
    """Adding, fetching and handling requests works on a queue opened by alias."""
    queue = await RequestQueue.open(alias='request_ops_test', storage_client=storage_client)

    urls = [
        'https://example.com/page1',
        'https://example.com/page2',
        'https://example.com/page3',
    ]

    # Every URL is new to the queue.
    for url in urls:
        add_result = await queue.add_request(url)
        assert add_result is not None
        assert add_result.was_already_present is False

    # Counters after adding: three in total, all pending.
    metadata_after_add = await queue.get_metadata()
    assert metadata_after_add.total_request_count == 3
    assert metadata_after_add.pending_request_count == 3
    assert metadata_after_add.handled_request_count == 0

    # Process requests until the queue reports empty.
    handled_urls = []
    while not await queue.is_empty():
        candidate = await queue.fetch_next_request()
        if not candidate:
            continue
        handled_urls.append(candidate.url)
        await queue.mark_request_as_handled(candidate)

    # Each URL was processed exactly once.
    assert len(handled_urls) == 3
    assert set(handled_urls) == set(urls)

    # Final state: nothing pending, three handled, queue empty.
    final_metadata = await queue.get_metadata()
    assert final_metadata.pending_request_count == 0
    assert final_metadata.handled_request_count == 3
    assert await queue.is_empty() is True

    # Clean up.
    await queue.drop()


async def test_alias_forefront_operations(
    storage_client: StorageClient,
) -> None:
    """Forefront insertion is honoured by a queue opened by alias."""
    queue = await RequestQueue.open(alias='forefront_test', storage_client=storage_client)

    # Two regular requests, then one pushed to the head.
    for normal_url in ('https://example.com/normal1', 'https://example.com/normal2'):
        await queue.add_request(normal_url)
    await queue.add_request('https://example.com/priority', forefront=True)

    # The forefront request is served first.
    first_served = await queue.fetch_next_request()
    assert first_served is not None
    assert first_served.url == 'https://example.com/priority'

    # The oldest regular request follows.
    second_served = await queue.fetch_next_request()
    assert second_served is not None
    assert second_served.url == 'https://example.com/normal1'

    # Clean up.
    await queue.drop()


async def test_alias_batch_operations(
    storage_client: StorageClient,
) -> None:
    """Batch addition via `add_requests` works on a queue opened by alias."""
    queue = await RequestQueue.open(alias='batch_test', storage_client=storage_client)

    await queue.add_requests(
        [
            'https://example.com/batch1',
            'https://example.com/batch2',
            'https://example.com/batch3',
        ]
    )

    # Allow background processing to finish.
    await asyncio.sleep(0.1)

    # All three requests are registered.
    queue_metadata = await queue.get_metadata()
    assert queue_metadata.total_request_count == 3

    # Clean up.
    await queue.drop()


async def test_named_vs_alias_conflict_detection(
    storage_client: StorageClient,
) -> None:
    """Test that conflicts between named and alias storages are detected.

    Three scenarios are covered, all against the parametrized `storage_client`:
    1. A named queue exists and an alias with the same identifier is requested -> `ValueError`.
    2. An alias queue exists and a named queue with the same identifier is requested -> `ValueError`.
    3. A named queue and an alias queue with different identifiers coexist without error.
    """
    # Test 1: Create named storage first, then try alias with same name
    named_rq = await RequestQueue.open(
        name='conflict-test',
        storage_client=storage_client,
    )
    assert named_rq.name == 'conflict-test'

    # Try to create alias with same name - should raise error
    with pytest.raises(ValueError, match=r'Cannot create alias storage "conflict-test".*already exists'):
        await RequestQueue.open(alias='conflict-test', storage_client=storage_client)

    # Clean up
    await named_rq.drop()

    # Test 2: Create alias first, then try named with same name
    alias_rq = await RequestQueue.open(alias='conflict-test2', storage_client=storage_client)
    assert alias_rq.name is None  # Alias storages have no name

    # Try to create named with same name - should raise error
    with pytest.raises(ValueError, match=r'Cannot create named storage "conflict-test2".*already exists'):
        await RequestQueue.open(name='conflict-test2', storage_client=storage_client)

    # Clean up
    await alias_rq.drop()

    # Test 3: Different names should work fine
    # The fixture's storage client is passed explicitly so this scenario exercises the same backend as
    # the two scenarios above rather than relying on `RequestQueue.open`'s default client.
    named_rq_ok = await RequestQueue.open(name='different-name', storage_client=storage_client)
    alias_rq_ok = await RequestQueue.open(alias='different-alias', storage_client=storage_client)

    assert named_rq_ok.name == 'different-name'
    assert alias_rq_ok.name is None

    # Clean up
    await named_rq_ok.drop()
    await alias_rq_ok.drop()


async def test_alias_parameter(
    storage_client: StorageClient,
) -> None:
    """A queue opened by alias has an ID, has no name and accepts requests."""
    queue = await RequestQueue.open(alias='test_alias', storage_client=storage_client)

    # Alias queues carry an ID but stay unnamed.
    assert queue.id is not None
    assert queue.name is None

    # A basic data operation is reflected in the metadata.
    await queue.add_request('https://example.com/alias')
    queue_metadata = await queue.get_metadata()
    assert queue_metadata.pending_request_count == 1

    await queue.drop()


async def test_alias_vs_named_isolation(
    storage_client: StorageClient,
) -> None:
    """An alias queue may reuse the identifier of a named queue once the named queue has been dropped."""
    shared_identifier = 'test-identifier'

    # Named queue first: verify the name and add a request.
    named_queue = await RequestQueue.open(name=shared_identifier, storage_client=storage_client)
    assert named_queue.name == shared_identifier
    await named_queue.add_request('https://named.example.com')

    # Drop the named queue before the identifier is reused as an alias.
    await named_queue.drop()

    # Opening an alias queue under the same identifier now succeeds.
    alias_queue = await RequestQueue.open(alias=shared_identifier, storage_client=storage_client)

    # The alias queue is unnamed and serves only the request added to it.
    assert alias_queue.name is None
    await alias_queue.add_request('https://alias.example.com')

    served = await alias_queue.fetch_next_request()
    assert served is not None
    assert served.url == 'https://alias.example.com'

    await alias_queue.drop()


async def test_default_vs_alias_default_equivalence(
    storage_client: StorageClient,
) -> None:
    """The default queue and the queue opened with the default storage alias are the same storage."""
    # Open the queue once without any identifier and once via the default alias.
    implicit_default = await RequestQueue.open(storage_client=storage_client)
    explicit_default = await RequestQueue.open(
        alias=StorageInstanceManager._DEFAULT_STORAGE_ALIAS,
        storage_client=storage_client,
    )

    # Same ID, and neither handle carries a name.
    assert implicit_default.id == explicit_default.id
    assert implicit_default.name is None
    assert explicit_default.name is None

    # A request added through one handle is visible in the other handle's metadata.
    await implicit_default.add_request('https://default.example.com')
    explicit_metadata = await explicit_default.get_metadata()
    assert explicit_metadata.pending_request_count == 1

    await implicit_default.drop()


async def test_multiple_alias_isolation(
    storage_client: StorageClient,
) -> None:
    """Three different aliases produce three separate queues, each holding only its own request."""
    alias_queues = []
    for index in range(3):
        queue = await RequestQueue.open(alias=f'alias_{index}', storage_client=storage_client)
        await queue.add_request(f'https://example.com/alias_{index}')
        alias_queues.append(queue)

    # Every queue ID is unique.
    distinct_ids = {queue.id for queue in alias_queues}
    assert len(distinct_ids) == len(alias_queues)

    # Each queue serves the request added to it; the queue is dropped right after the check.
    for index, queue in enumerate(alias_queues):
        served = await queue.fetch_next_request()
        assert served is not None
        assert served.url == f'https://example.com/alias_{index}'
        await queue.drop()


async def test_purge_on_start_enabled(storage_client: StorageClient) -> None:
    """Test purge behavior when purge_on_start=True: named storages retain data, unnamed storages are purged.

    The test opens a default, an alias and a named queue, adds three requests to each, handles one request
    per queue, clears the storage instance cache and reopens all three queues to compare their metadata.
    """

    # Skip this test for memory storage since it doesn't persist data between client instances.
    if isinstance(storage_client, MemoryStorageClient):
        pytest.skip('Memory storage does not persist data between client instances.')

    configuration = Configuration(purge_on_start=True)

    # First, create all storage types with purge enabled and add data.
    default_rq = await RequestQueue.open(
        storage_client=storage_client,
        configuration=configuration,
    )

    alias_rq = await RequestQueue.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=configuration,
    )

    named_rq = await RequestQueue.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=configuration,
    )

    await default_rq.add_requests(
        [
            'https://default.example.com/1',
            'https://default.example.com/2',
            'https://default.example.com/3',
        ]
    )
    await alias_rq.add_requests(
        [
            'https://alias.example.com/1',
            'https://alias.example.com/2',
            'https://alias.example.com/3',
        ]
    )
    await named_rq.add_requests(
        [
            'https://named.example.com/1',
            'https://named.example.com/2',
            'https://named.example.com/3',
        ]
    )

    # Fetch one request from every queue...
    default_request = await default_rq.fetch_next_request()
    alias_request = await alias_rq.fetch_next_request()
    named_request = await named_rq.fetch_next_request()

    assert default_request is not None
    assert alias_request is not None
    assert named_request is not None

    # ...and mark it as handled, so each queue ends up with 2 pending and 1 handled request.
    await default_rq.mark_request_as_handled(default_request)
    await alias_rq.mark_request_as_handled(alias_request)
    await named_rq.mark_request_as_handled(named_request)

    # Verify data was added
    default_metadata = await default_rq.get_metadata()
    alias_metadata = await alias_rq.get_metadata()
    named_metadata = await named_rq.get_metadata()

    assert default_metadata.pending_request_count == 2
    assert alias_metadata.pending_request_count == 2
    assert named_metadata.pending_request_count == 2

    assert default_metadata.handled_request_count == 1
    assert alias_metadata.handled_request_count == 1
    assert named_metadata.handled_request_count == 1

    assert default_metadata.total_request_count == 3
    assert alias_metadata.total_request_count == 3
    assert named_metadata.total_request_count == 3

    # Verify that default and alias storages are unnamed, while the named one carries its name
    assert default_metadata.name is None
    assert alias_metadata.name is None
    assert named_metadata.name == 'purge-test-named'

    # Clear storage cache to simulate "reopening" storages
    service_locator.storage_instance_manager.clear_cache()

    # Now "reopen" all storages
    default_rq_2 = await RequestQueue.open(
        storage_client=storage_client,
        configuration=configuration,
    )
    alias_rq_2 = await RequestQueue.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=configuration,
    )
    named_rq_2 = await RequestQueue.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=configuration,
    )

    # Check the data after purge
    default_metadata_after = await default_rq_2.get_metadata()
    alias_metadata_after = await alias_rq_2.get_metadata()
    named_metadata_after = await named_rq_2.get_metadata()

    # Unnamed storages (alias and default) should be purged (data removed),
    # while the named storage keeps the counts recorded before reopening.
    assert default_metadata_after.pending_request_count == 0
    assert alias_metadata_after.pending_request_count == 0
    assert named_metadata_after.pending_request_count == 2

    assert default_metadata_after.handled_request_count == 0
    assert alias_metadata_after.handled_request_count == 0
    assert named_metadata_after.handled_request_count == 1

    assert default_metadata_after.total_request_count == 0
    assert alias_metadata_after.total_request_count == 0
    assert named_metadata_after.total_request_count == 3

    # Clean up
    await named_rq_2.drop()
    await alias_rq_2.drop()
    await default_rq_2.drop()


async def test_purge_on_start_disabled(storage_client: StorageClient) -> None:
    """Test purge behavior when purge_on_start=False: all storages retain data regardless of type.

    The test opens a default, an alias and a named queue, adds three requests to each, handles one request
    per queue, clears the storage instance cache and reopens all three queues to compare their metadata.
    """

    # Skip this test for memory storage since it doesn't persist data between client instances.
    if isinstance(storage_client, MemoryStorageClient):
        pytest.skip('Memory storage does not persist data between client instances.')

    configuration = Configuration(purge_on_start=False)

    # First, create all storage types with purge disabled and add data.
    default_rq = await RequestQueue.open(
        storage_client=storage_client,
        configuration=configuration,
    )

    alias_rq = await RequestQueue.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=configuration,
    )

    named_rq = await RequestQueue.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=configuration,
    )

    await default_rq.add_requests(
        [
            'https://default.example.com/1',
            'https://default.example.com/2',
            'https://default.example.com/3',
        ]
    )
    await alias_rq.add_requests(
        [
            'https://alias.example.com/1',
            'https://alias.example.com/2',
            'https://alias.example.com/3',
        ]
    )
    await named_rq.add_requests(
        [
            'https://named.example.com/1',
            'https://named.example.com/2',
            'https://named.example.com/3',
        ]
    )

    # Fetch one request from every queue...
    default_request = await default_rq.fetch_next_request()
    alias_request = await alias_rq.fetch_next_request()
    named_request = await named_rq.fetch_next_request()

    assert default_request is not None
    assert alias_request is not None
    assert named_request is not None

    # ...and mark it as handled, so each queue ends up with 2 pending and 1 handled request.
    await default_rq.mark_request_as_handled(default_request)
    await alias_rq.mark_request_as_handled(alias_request)
    await named_rq.mark_request_as_handled(named_request)

    # Verify data was added
    default_metadata = await default_rq.get_metadata()
    alias_metadata = await alias_rq.get_metadata()
    named_metadata = await named_rq.get_metadata()

    assert default_metadata.pending_request_count == 2
    assert alias_metadata.pending_request_count == 2
    assert named_metadata.pending_request_count == 2

    assert default_metadata.handled_request_count == 1
    assert alias_metadata.handled_request_count == 1
    assert named_metadata.handled_request_count == 1

    assert default_metadata.total_request_count == 3
    assert alias_metadata.total_request_count == 3
    assert named_metadata.total_request_count == 3

    # Verify that default and alias storages are unnamed, while the named one carries its name
    assert default_metadata.name is None
    assert alias_metadata.name is None
    assert named_metadata.name == 'purge-test-named'

    # Clear storage cache to simulate "reopening" storages
    service_locator.storage_instance_manager.clear_cache()

    # Now "reopen" all storages
    default_rq_2 = await RequestQueue.open(
        storage_client=storage_client,
        configuration=configuration,
    )
    alias_rq_2 = await RequestQueue.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=configuration,
    )
    named_rq_2 = await RequestQueue.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=configuration,
    )

    # Check the data after reopening
    default_metadata_after = await default_rq_2.get_metadata()
    alias_metadata_after = await alias_rq_2.get_metadata()
    named_metadata_after = await named_rq_2.get_metadata()

    # With purge disabled nothing should be purged: every storage (default, alias and named)
    # must report the same counts as before reopening.
    assert default_metadata_after.pending_request_count == 2
    assert alias_metadata_after.pending_request_count == 2
    assert named_metadata_after.pending_request_count == 2

    assert default_metadata_after.handled_request_count == 1
    assert alias_metadata_after.handled_request_count == 1
    assert named_metadata_after.handled_request_count == 1

    assert default_metadata_after.total_request_count == 3
    assert alias_metadata_after.total_request_count == 3
    assert named_metadata_after.total_request_count == 3

    # Clean up
    await named_rq_2.drop()
    await alias_rq_2.drop()
    await default_rq_2.drop()


async def test_name_default_not_allowed(storage_client: StorageClient) -> None:
    """Opening a queue whose name equals the reserved default alias must raise `ValueError`.

    The reserved value is the alias of the unnamed default storage, so allowing it as a name would collide.
    """
    reserved_alias = StorageInstanceManager._DEFAULT_STORAGE_ALIAS
    expected_message = f'Storage name cannot be "{reserved_alias}" as it is reserved for default alias.'

    with pytest.raises(ValueError, match=expected_message):
        await RequestQueue.open(name=reserved_alias, storage_client=storage_client)


@pytest.mark.parametrize(
    ('name', 'is_valid'),
    [
        pytest.param('F', True, id='single-char'),
        pytest.param('7', True, id='single-digit'),
        pytest.param('FtghdfseySds', True, id='mixed-case'),
        pytest.param('125673450', True, id='all-digits'),
        pytest.param('Ft2134Sfe0O1hf', True, id='mixed-alphanumeric'),
        pytest.param('name-with-dashes', True, id='dashes'),
        pytest.param('1-value', True, id='number start'),
        pytest.param('value-1', True, id='number end'),
        pytest.param('test-1-value', True, id='number middle'),
        pytest.param('test-------value', True, id='multiple-dashes'),
        pytest.param('test-VALUES-test', True, id='multiple-cases'),
        pytest.param('name_with_underscores', False, id='underscores'),
        pytest.param('name with spaces', False, id='spaces'),
        pytest.param('-test', False, id='dashes start'),
        pytest.param('test-', False, id='dashes end'),
    ],
)
async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None:
    """Check that `RequestQueue.open` accepts valid storage names and rejects invalid ones with `ValueError`."""
    if not is_valid:
        with pytest.raises(ValueError, match=rf'Invalid storage name "{name}".*'):
            await RequestQueue.open(name=name, storage_client=storage_client)
        return

    # A valid name must open without raising and be reported back unchanged.
    queue = await RequestQueue.open(name=name, storage_client=storage_client)
    assert queue.name == name
    await queue.drop()


async def test_reclaim_request_with_change_state(rq: RequestQueue) -> None:
    """A request whose user data is modified before reclaiming is returned with that modification on re-fetch."""
    url = 'https://example.com/original'
    await rq.add_request(Request.from_url(url, user_data={'state': 'original'}))

    # The first fetch returns the request exactly as it was enqueued.
    fetched = await rq.fetch_next_request()
    assert fetched is not None
    assert fetched.url == url
    assert fetched.user_data['state'] == 'original'

    # Change the user data and hand the request back to the queue.
    fetched.user_data['state'] = 'modified'
    reclaim_result = await rq.reclaim_request(fetched)
    assert reclaim_result is not None
    assert reclaim_result.was_already_handled is False

    # The second fetch returns the same URL carrying the updated state.
    refetched = await rq.fetch_next_request()
    assert refetched is not None
    assert refetched.url == url
    assert refetched.user_data['state'] == 'modified'


async def test_request_with_noascii_chars(rq: RequestQueue) -> None:
    """User data containing non-ASCII text must come back from the queue unchanged."""
    url = 'https://crawlee.dev'
    non_ascii_user_data = {
        'record_1': 'Supermaxi El Jardín',
        'record_2': 'záznam dva',
        'record_3': '記録三',
    }
    enqueued = Request.from_url(url, user_data=non_ascii_user_data)
    await rq.add_request(enqueued)

    # Round-trip through the queue and compare with what was enqueued.
    fetched = await rq.fetch_next_request()
    assert fetched is not None
    assert fetched.url == url
    assert fetched.user_data == enqueued.user_data


================================================
FILE: tests/unit/storages/test_storage_instance_manager.py
================================================
import asyncio
import sys
from pathlib import Path
from typing import cast
from unittest.mock import AsyncMock

import pytest

from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient
from crawlee.storages import Dataset, KeyValueStore, RequestQueue
from crawlee.storages._base import Storage


@pytest.fixture(autouse=True)
def clean_storage_instance_manager() -> None:
    """Autouse fixture that clears the storage instance manager cache before every test in this module."""
    manager = service_locator.storage_instance_manager
    manager.clear_cache()


@pytest.fixture(params=[KeyValueStore, Dataset, RequestQueue])
def storage_type(request: pytest.FixtureRequest) -> type[Storage]:
    """Parametrized fixture yielding each storage class in turn: `KeyValueStore`, `Dataset`, `RequestQueue`."""
    storage_class = request.param
    return cast('type[Storage]', storage_class)


async def test_unique_storage_by_storage_client(tmp_path: Path, storage_type: type[Storage]) -> None:
    """Opening the default storage through two different client types yields two distinct instances."""
    cfg = Configuration(purge_on_start=True, storage_dir=str(tmp_path))

    memory_storage = await storage_type.open(storage_client=MemoryStorageClient(), configuration=cfg)
    file_system_storage = await storage_type.open(storage_client=FileSystemStorageClient(), configuration=cfg)

    assert memory_storage is not file_system_storage


async def test_same_storage_when_different_client(tmp_path: Path, storage_type: type[Storage]) -> None:
    """Two separate `MemoryStorageClient` instances with the same configuration resolve to one storage instance."""
    cfg = Configuration(purge_on_start=True, storage_dir=str(tmp_path))

    first = await storage_type.open(storage_client=MemoryStorageClient(), configuration=cfg)
    second = await storage_type.open(storage_client=MemoryStorageClient(), configuration=cfg)

    assert first is second


async def test_unique_storage_by_storage_type(tmp_path: Path) -> None:
    """A key-value store and a dataset opened with the same client and configuration are different objects."""
    cfg = Configuration(purge_on_start=True, storage_dir=str(tmp_path))
    client = MemoryStorageClient()

    key_value_store = await KeyValueStore.open(storage_client=client, configuration=cfg)
    data_set = await Dataset.open(storage_client=client, configuration=cfg)

    assert key_value_store is not data_set


async def test_unique_storage_by_name(storage_type: type[Storage]) -> None:
    """Storages opened under different names through the same client are distinct instances."""
    client = MemoryStorageClient()

    first_named = await storage_type.open(storage_client=client, name='kvs1')
    second_named = await storage_type.open(storage_client=client, name='kvs2')

    assert first_named is not second_named


async def test_unique_storage_by_unique_cache_key_different_path(tmp_path: Path, storage_type: type[Storage]) -> None:
    """Configurations pointing at different `storage_dir` values produce different storage instances."""
    first_dir = tmp_path / 'dir1'
    second_dir = tmp_path / 'dir2'
    for directory in (first_dir, second_dir):
        directory.mkdir()

    first_config = Configuration(storage_dir=str(first_dir))
    second_config = Configuration(storage_dir=str(second_dir))

    # One shared client, so only the storage directory differs between the two opens.
    client = FileSystemStorageClient()
    first = await storage_type.open(storage_client=client, configuration=first_config)
    second = await storage_type.open(storage_client=client, configuration=second_config)

    assert first is not second


async def test_unique_storage_by_unique_cache_key_same_path(tmp_path: Path, storage_type: type[Storage]) -> None:
    """Two distinct `Configuration` objects sharing one `storage_dir` resolve to the same storage instance."""
    shared_dir = str(tmp_path)
    first_config = Configuration(storage_dir=shared_dir)
    second_config = Configuration(storage_dir=shared_dir)

    client = FileSystemStorageClient()
    first = await storage_type.open(storage_client=client, configuration=first_config)
    second = await storage_type.open(storage_client=client, configuration=second_config)

    assert first is second


async def test_identical_storage_default_config(storage_type: type[Storage]) -> None:
    """Opening twice with the same explicit client and no configuration returns the same instance."""
    client = MemoryStorageClient()

    first = await storage_type.open(storage_client=client)
    second = await storage_type.open(storage_client=client)

    assert first is second


async def test_identical_storage_default_storage(storage_type: type[Storage]) -> None:
    """Opening twice with no arguments at all returns the same instance."""
    first = await storage_type.open()
    second = await storage_type.open()

    assert first is second


async def test_identical_storage_clear_cache(storage_type: type[Storage]) -> None:
    """After the whole cache is cleared, opening the default storage again creates a new instance."""
    before_clear = await storage_type.open()

    service_locator.storage_instance_manager.clear_cache()

    after_clear = await storage_type.open()
    assert before_clear is not after_clear


async def test_identical_storage_remove_from_cache(storage_type: type[Storage]) -> None:
    """After one storage is removed from the cache, opening it again creates a new instance."""
    before_removal = await storage_type.open()

    service_locator.storage_instance_manager.remove_from_cache(before_removal)

    after_removal = await storage_type.open()
    assert before_removal is not after_removal


async def test_preexisting_unnamed_storage_open_by_id(storage_type: type[Storage]) -> None:
    """A storage created under an alias can be reopened by its ID once the instance cache has been cleared."""
    client = FileSystemStorageClient()
    original = await storage_type.open(alias='custom_name', storage_client=client)

    # Clear the cache so the reopen below cannot simply return the cached instance.
    service_locator.storage_instance_manager.clear_cache()

    reopened = await storage_type.open(id=original.id, storage_client=client)

    assert original.id == reopened.id


@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.')
async def test_concurrent_open_datasets() -> None:
    """Two tasks opening the same named dataset at the same moment must both end up writing into one dataset."""
    from asyncio import Barrier  # type:ignore[attr-defined] # noqa: PLC0415

    storage_name = 'concurrent-storage'
    start_gate = Barrier(2)

    async def push_data(data: dict) -> None:
        # Hold both tasks at the barrier so their `Dataset.open` calls start together.
        await start_gate.wait()
        opened = await Dataset.open(name=storage_name)
        await opened.push_data(data)

    payloads = [{'test_1': '1'}, {'test_2': '2'}]
    await asyncio.gather(*(push_data(payload) for payload in payloads))

    # Both pushed items must be visible through a single dataset of that name.
    shared_dataset = await Dataset.open(name=storage_name)
    stored = await shared_dataset.get_data()
    assert len(stored.items) == 2

    await shared_dataset.drop()


@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.')
async def test_concurrent_open_datasets_with_same_name_and_alias() -> None:
    """Concurrently opening one identifier as a name and as an alias lets exactly one open succeed.

    The other open must fail with `ValueError`; the arguments of the successful open are then reused to reopen
    and drop the dataset.
    """
    from asyncio import Barrier  # type:ignore[attr-defined] # noqa: PLC0415

    successful_kwargs: dict[str, str | None] = {}
    conflict_recorder = AsyncMock()
    start_gate = Barrier(2)

    async def open_dataset(name: str | None, alias: str | None) -> None:
        # Hold both tasks at the barrier so the two opens start together.
        await start_gate.wait()
        try:
            await Dataset.open(name=name, alias=alias)
        except ValueError:
            await conflict_recorder()
        else:
            successful_kwargs.update(name=name, alias=alias)

    await asyncio.gather(
        open_dataset(name=None, alias='concurrent-storage'),
        open_dataset(name='concurrent-storage', alias=None),
    )

    # Exactly one of the two opens must have hit the name/alias conflict.
    conflict_recorder.assert_called_once()

    surviving_dataset = await Dataset.open(name=successful_kwargs.get('name'), alias=successful_kwargs.get('alias'))

    await surviving_dataset.drop()


================================================
FILE: tests/unit/test_cli.py
================================================
from __future__ import annotations

import os
from unittest.mock import ANY, Mock

import pytest
import readchar
from typer.testing import CliRunner

import crawlee._cli

# Shared Typer test runner; the tests below call `runner.invoke(...)` on it to run the crawlee CLI.
runner = CliRunner()


@pytest.fixture
def mock_cookiecutter(monkeypatch: pytest.MonkeyPatch) -> Mock:
    """Replace `crawlee._cli.cookiecutter` with a `Mock` for the duration of a test and return that mock."""
    replacement = Mock()
    monkeypatch.setattr(target=crawlee._cli, name='cookiecutter', value=replacement)
    return replacement


def test_create_interactive(mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch) -> None:
    """Running `create` with no arguments and answering the prompts via simulated key presses scaffolds a project."""
    # Simulated keyboard input: the project name typed character by character, then seven ENTER presses.
    keystrokes = iter([*'my_project', *([readchar.key.ENTER] * 7)])
    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(keystrokes))

    result = runner.invoke(crawlee._cli.cli, ['create'])
    assert 'Your project "my_project" was created.' in result.output

    expected_context = {
        'project_name': 'my_project',
        'package_manager': 'poetry',
        'crawler_type': 'beautifulsoup',
        'http_client': 'impit',
        'enable_apify_integration': False,
        'start_url': 'https://crawlee.dev',
        'install_project': True,
    }
    mock_cookiecutter.assert_called_with(template=ANY, no_input=True, extra_context=expected_context)


def test_create_interactive_non_default_template(mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch) -> None:
    """Pressing DOWN once before confirming the second prompt results in the `parsel` crawler type."""
    # Simulated keyboard input: project name, ENTER, one DOWN, then six ENTER presses.
    keystrokes = iter(
        [
            *'my_project',
            readchar.key.ENTER,
            readchar.key.DOWN,
            *([readchar.key.ENTER] * 6),
        ]
    )
    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(keystrokes))

    result = runner.invoke(crawlee._cli.cli, ['create'])
    assert 'Your project "my_project" was created.' in result.output

    expected_context = {
        'project_name': 'my_project',
        'package_manager': 'poetry',
        'crawler_type': 'parsel',
        'http_client': 'impit',
        'enable_apify_integration': False,
        'start_url': 'https://crawlee.dev',
        'install_project': True,
    }
    mock_cookiecutter.assert_called_with(template=ANY, no_input=True, extra_context=expected_context)


def test_create_non_interactive(mock_cookiecutter: Mock) -> None:
    """Supplying the project name and every option on the command line passes those values to cookiecutter."""
    cli_args = [
        'create',
        'my_project',
        '--crawler-type',
        'playwright',
        '--http-client',
        'httpx',
        '--package-manager',
        'pip',
        '--start-url',
        'https://yr.no',
        '--no-apify',
        '--no-install',
    ]
    runner.invoke(crawlee._cli.cli, cli_args)

    expected_context = {
        'project_name': 'my_project',
        'package_manager': 'pip',
        'crawler_type': 'playwright',
        'http_client': 'httpx',
        'start_url': 'https://yr.no',
        'enable_apify_integration': False,
        'install_project': False,
    }
    mock_cookiecutter.assert_called_with(template=ANY, no_input=True, extra_context=expected_context)


def test_create_existing_folder(
    mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory
) -> None:
    """A project name given on the command line that already exists as a folder triggers a prompt for a new name.

    The CLI is invoked with `existing_project`, which is created beforehand in the working directory. The test
    expects the "already exists" message in the output and the project to be generated under the name typed via
    the simulated keyboard (`my_project`), with all other options taken from the command line.
    """
    mock_input = iter(
        [
            *'my_project',
            readchar.key.ENTER,
        ]
    )
    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))

    tmp = tmp_path_factory.mktemp('workdir')
    # Use monkeypatch so the working directory is restored after the test instead of leaking into later tests.
    monkeypatch.chdir(tmp)
    (tmp / 'existing_project').mkdir()

    result = runner.invoke(
        crawlee._cli.cli,
        [
            'create',
            'existing_project',
            '--crawler-type',
            'playwright',
            '--http-client',
            'httpx',
            '--package-manager',
            'pip',
            '--start-url',
            'https://yr.no',
            '--no-apify',
            '--install',
        ],
    )
    assert 'existing_project already exists' in result.output

    mock_cookiecutter.assert_called_with(
        template=ANY,
        no_input=True,
        extra_context={
            'project_name': 'my_project',
            'package_manager': 'pip',
            'crawler_type': 'playwright',
            'http_client': 'httpx',
            'start_url': 'https://yr.no',
            'enable_apify_integration': False,
            'install_project': True,
        },
    )


def test_create_existing_folder_interactive(
    mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory
) -> None:
    """Typing the name of an existing folder at the prompt is rejected and a second typed name is used instead.

    The simulated keyboard first enters `existing_project` (created beforehand in the working directory), then
    `my_project`, followed by ENTER presses for the remaining prompts. The crawler type is supplied through the
    `--template` option.
    """
    mock_input = iter(
        [
            *'existing_project',
            readchar.key.ENTER,
            *'my_project',
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
        ]
    )
    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))

    tmp = tmp_path_factory.mktemp('workdir')
    # Use monkeypatch so the working directory is restored after the test instead of leaking into later tests.
    monkeypatch.chdir(tmp)
    (tmp / 'existing_project').mkdir()

    result = runner.invoke(crawlee._cli.cli, ['create', '--template', 'playwright'])
    assert 'existing_project already exists' in result.output

    mock_cookiecutter.assert_called_with(
        template=ANY,
        no_input=True,
        extra_context={
            'project_name': 'my_project',
            'package_manager': 'poetry',
            'crawler_type': 'playwright',
            'http_client': 'impit',
            'start_url': 'https://crawlee.dev',
            'enable_apify_integration': False,
            'install_project': True,
        },
    )


def test_create_existing_folder_interactive_multiple_attempts(
    mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory
) -> None:
    """Two consecutive typed names that already exist as folders are rejected before a third name is accepted.

    The simulated keyboard enters `existing_project`, then `existing_project_2` (both created beforehand in the
    working directory), then `my_project`, followed by ENTER presses for the remaining prompts.
    """
    mock_input = iter(
        [
            *'existing_project',
            readchar.key.ENTER,
            *'existing_project_2',
            readchar.key.ENTER,
            *'my_project',
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
        ]
    )
    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))

    tmp = tmp_path_factory.mktemp('workdir')
    # Use monkeypatch so the working directory is restored after the test instead of leaking into later tests.
    monkeypatch.chdir(tmp)
    (tmp / 'existing_project').mkdir()
    (tmp / 'existing_project_2').mkdir()

    result = runner.invoke(crawlee._cli.cli, ['create', '--crawler-type', 'playwright'])
    assert 'existing_project already exists' in result.output

    mock_cookiecutter.assert_called_with(
        template=ANY,
        no_input=True,
        extra_context={
            'project_name': 'my_project',
            'package_manager': 'poetry',
            'crawler_type': 'playwright',
            'http_client': 'impit',
            'start_url': 'https://crawlee.dev',
            'enable_apify_integration': False,
            'install_project': True,
        },
    )


================================================
FILE: tests/unit/test_configuration.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from anyio import Path as AnyioPath

from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storage_clients._file_system._storage_client import FileSystemStorageClient

if TYPE_CHECKING:
    from pathlib import Path

    from yarl import URL


def test_global_configuration_works() -> None:
    """`Configuration.get_global_configuration` and `service_locator.get_configuration` return one shared object.

    The class-level accessor is queried first here.
    """
    from_class_first = Configuration.get_global_configuration()
    from_class_second = Configuration.get_global_configuration()
    from_locator_first = service_locator.get_configuration()
    from_locator_second = service_locator.get_configuration()

    assert from_class_first is from_class_second is from_locator_first is from_locator_second


def test_global_configuration_works_reversed() -> None:
    """`service_locator.get_configuration` and `Configuration.get_global_configuration` return one shared object.

    The service locator accessor is queried first here.
    """
    from_locator_first = service_locator.get_configuration()
    from_locator_second = service_locator.get_configuration()
    from_class_first = Configuration.get_global_configuration()
    from_class_second = Configuration.get_global_configuration()

    assert from_locator_first is from_locator_second is from_class_first is from_class_second


async def test_storage_not_persisted_when_non_persistable_storage_used(tmp_path: Path, server_url: URL) -> None:
    """A crawler running on `MemoryStorageClient` must leave the configured storage directory empty."""
    global_config = Configuration(storage_dir=str(tmp_path))
    service_locator.set_configuration(global_config)
    crawler = HttpCrawler(storage_client=MemoryStorageClient())

    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    start_urls = [str(server_url)]
    await crawler.run(start_urls)

    # Nothing may have been written under the storage directory.
    storage_dir_entries = [entry async for entry in AnyioPath(tmp_path).iterdir()]
    assert storage_dir_entries == [], 'Expected the storage directory to be empty, but it is not.'


async def test_storage_persisted_with_explicit_statistics_with_persistable_storage(
    tmp_path: Path, server_url: URL
) -> None:
    """Explicit persistent statistics write to disk even though the crawler itself uses `MemoryStorageClient`.

    The global storage client is set to `FileSystemStorageClient`, and the crawler receives statistics created
    with `persistence_enabled=True`; the storage directory is expected to be non-empty after the run.
    """
    service_locator.set_configuration(Configuration(storage_dir=str(tmp_path)))
    service_locator.set_storage_client(FileSystemStorageClient())

    persistent_statistics = Statistics.with_default_state(persistence_enabled=True)
    crawler = HttpCrawler(storage_client=MemoryStorageClient(), statistics=persistent_statistics)

    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    start_urls = [str(server_url)]
    await crawler.run(start_urls)

    # Something must have been written under the storage directory.
    storage_dir_entries = [entry async for entry in AnyioPath(tmp_path).iterdir()]
    assert storage_dir_entries != [], 'Expected the storage directory to contain files, but it does not.'


async def test_storage_persisted_when_enabled(tmp_path: Path, server_url: URL) -> None:
    """A crawler given a `FileSystemStorageClient` and a `storage_dir` writes files into that directory."""
    crawler = HttpCrawler(
        configuration=Configuration(storage_dir=str(tmp_path)),
        storage_client=FileSystemStorageClient(),
    )

    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    start_urls = [str(server_url)]
    await crawler.run(start_urls)

    # Something must have been written under the storage directory.
    storage_dir_entries = [entry async for entry in AnyioPath(tmp_path).iterdir()]
    assert storage_dir_entries != [], 'Expected the storage directory to contain files, but it does not.'


================================================
FILE: tests/unit/test_log_config.py
================================================
from __future__ import annotations

import logging
import sys

import pytest

from crawlee._log_config import CrawleeLogFormatter


def get_log_record(level: int, msg: str, exc_info: logging._SysExcInfoType | None = None) -> logging.LogRecord:
    """Build a `logging.LogRecord` for the logger name 'test' with the given level, message and exception info.

    The record points at this file with line number 0 and carries no formatting arguments.
    """
    logger_name = 'test'
    line_number = 0
    no_args: tuple = ()
    return logging.LogRecord(logger_name, level, __file__, line_number, msg, no_args, exc_info)


@pytest.mark.parametrize(
    ('level', 'msg', 'expected'),
    [
        (logging.DEBUG, 'Debug message', '\x1b[90m[test]\x1b[0m \x1b[34mDEBUG\x1b[0m Debug message'),
        (logging.INFO, 'Info message', '\x1b[90m[test]\x1b[0m \x1b[32mINFO \x1b[0m Info message'),
        (logging.WARNING, 'Warning message', '\x1b[90m[test]\x1b[0m \x1b[33mWARN \x1b[0m Warning message'),
        (logging.ERROR, 'Error message', '\x1b[90m[test]\x1b[0m \x1b[31mERROR\x1b[0m Error message'),
    ],
    ids=['debug', 'info', 'warning', 'error'],
)
def test_formatted_message(level: int, msg: str, expected: str) -> None:
    """The default formatter output is the ANSI-coloured logger name, the coloured level label, then the message."""
    rendered = CrawleeLogFormatter().format(get_log_record(level, msg))
    assert rendered == expected


def test_formatting_with_exception() -> None:
    """A record carrying exception info is rendered with the normal prefix plus the exception type and message."""
    log_formatter = CrawleeLogFormatter()
    try:
        raise ValueError('This is a test exception')

    except ValueError:
        # Build and format the record while the exception is still being handled.
        error_record = get_log_record(logging.ERROR, 'Exception occurred', exc_info=sys.exc_info())
        rendered = log_formatter.format(error_record)

    assert '\x1b[90m[test]\x1b[0m \x1b[31mERROR\x1b[0m Exception occurred' in rendered
    assert 'ValueError: This is a test exception' in rendered


def test_formatter_without_name() -> None:
    """With `include_logger_name=False` the output starts directly with the coloured level label."""
    info_record = get_log_record(logging.INFO, 'Info message without name')
    rendered = CrawleeLogFormatter(include_logger_name=False).format(info_record)
    assert rendered == '\x1b[32mINFO \x1b[0m Info message without name'


================================================
FILE: tests/unit/test_router.py
================================================
from __future__ import annotations

import logging
from unittest.mock import AsyncMock, Mock

import pytest

from crawlee import Request
from crawlee._types import BasicCrawlingContext
from crawlee.router import Router
from crawlee.sessions import Session


class MockContext(BasicCrawlingContext):
    """Crawling context for router tests: every collaborator is a mock and only the request label is configurable."""

    def __init__(self, *, label: str | None) -> None:
        # The label is passed through the request's user data.
        labelled_request = Request.from_url(url='https://example.com/', user_data={'label': label})
        super().__init__(
            request=labelled_request,
            session=Session(),
            send_request=AsyncMock(),
            add_requests=AsyncMock(),
            proxy_info=AsyncMock(),
            push_data=AsyncMock(),
            use_state=AsyncMock(),
            get_key_value_store=AsyncMock(),
            log=logging.getLogger(),
        )


async def test_router_no_handlers() -> None:
    """Calling a router with no registered handlers raises `RuntimeError`."""
    empty_router = Router[MockContext]()

    with pytest.raises(RuntimeError):
        await empty_router(MockContext(label=None))


async def test_router_no_default_handler() -> None:
    """A label with no matching handler and no default handler raises `RuntimeError` and calls no handler."""
    handler_a_spy = Mock()
    router = Router[MockContext]()

    @router.handler('A')
    async def handler_a(_context: MockContext) -> None:
        handler_a_spy()

    with pytest.raises(RuntimeError):
        await router(MockContext(label='B'))

    handler_a_spy.assert_not_called()


async def test_router_default_handler_invoked() -> None:
    """A label with no dedicated handler is dispatched to the default handler only."""
    handler_a_spy = Mock()
    default_spy = Mock()
    router = Router[MockContext]()

    @router.handler('A')
    async def handler_a(_context: MockContext) -> None:
        handler_a_spy()

    @router.default_handler
    async def default_handler(_context: MockContext) -> None:
        default_spy()

    await router(MockContext(label='B'))

    default_spy.assert_called()
    handler_a_spy.assert_not_called()


async def test_router_specific_handler_invoked() -> None:
    """A label with a dedicated handler is dispatched to that handler only, not to others or the default."""
    handler_a_spy = Mock()
    handler_b_spy = Mock()
    default_spy = Mock()
    router = Router[MockContext]()

    @router.handler('A')
    async def handler_a(_context: MockContext) -> None:
        handler_a_spy()

    @router.handler('B')
    async def handler_b(_context: MockContext) -> None:
        handler_b_spy()

    @router.default_handler
    async def default_handler(_context: MockContext) -> None:
        default_spy()

    await router(MockContext(label='B'))

    default_spy.assert_not_called()
    handler_a_spy.assert_not_called()
    handler_b_spy.assert_called()


async def test_router_handler_not_nullified() -> None:
    """The `router.handler` decorator must leave the decorated name bound to something other than `None`."""
    router = Router[MockContext]()

    @router.handler('A')
    async def handler_a(_context: MockContext) -> None:
        return None

    assert handler_a is not None


async def test_router_multi_labelled_handler() -> None:
    """One function registered under two labels by stacked decorators is invoked for each of those labels."""
    label_spy = Mock()
    router = Router[MockContext]()

    @router.handler('A')
    @router.handler('B')
    async def handler(_context: MockContext) -> None:
        label_spy(_context.request.label)

    # Dispatch each label in turn; the spy records the label the handler saw.
    await router(MockContext(label='A'))
    label_spy.assert_called_with('A')

    await router(MockContext(label='B'))
    label_spy.assert_called_with('B')

    assert label_spy.call_count == 2


================================================
FILE: tests/unit/test_service_locator.py
================================================
from __future__ import annotations

import pytest

from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.errors import ServiceConflictError
from crawlee.events import LocalEventManager
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient


def test_default_configuration() -> None:
    """Without any explicit setup the service locator returns a configuration equal to a fresh `Configuration()`."""
    located_config = service_locator.get_configuration()
    fresh_default = Configuration()
    # Equality rather than identity: the two objects are separate instances, which is acceptable.
    assert located_config == fresh_default


def test_custom_configuration() -> None:
    """A configuration set on the service locator is returned back as the very same object."""
    supplied = Configuration(default_browser_path='custom_path')
    service_locator.set_configuration(supplied)

    assert service_locator.get_configuration() is supplied


def test_configuration_overwrite_not_possible() -> None:
    """Setting a second, different configuration after one was already set raises `ServiceConflictError`."""
    service_locator.set_configuration(Configuration())

    replacement = Configuration(default_browser_path='custom_path')
    with pytest.raises(ServiceConflictError):
        service_locator.set_configuration(replacement)


def test_configuration_conflict() -> None:
    """Setting a configuration after it was already retrieved raises `ServiceConflictError` with a clear message."""
    # Retrieving the configuration first is what makes the later `set_configuration` call conflict.
    service_locator.get_configuration()
    late_config = Configuration(default_browser_path='custom_path')

    with pytest.raises(ServiceConflictError, match=r'Configuration is already in use.'):
        service_locator.set_configuration(late_config)


def test_default_event_manager() -> None:
    """Without any explicit setup the service locator provides a `LocalEventManager`."""
    located = service_locator.get_event_manager()
    assert isinstance(located, LocalEventManager)


def test_custom_event_manager() -> None:
    """An event manager set on the service locator is returned back as the very same object."""
    supplied = LocalEventManager()
    service_locator.set_event_manager(supplied)

    assert service_locator.get_event_manager() is supplied


def test_event_manager_overwrite_not_possible() -> None:
    """Setting a second event manager after one was already set raises `ServiceConflictError`."""
    service_locator.set_event_manager(LocalEventManager())

    replacement = LocalEventManager()
    with pytest.raises(ServiceConflictError):
        service_locator.set_event_manager(replacement)


def test_event_manager_conflict() -> None:
    """Setting an event manager after one was already retrieved raises `ServiceConflictError` with a clear message."""
    # Retrieving the event manager first is what makes the later `set_event_manager` call conflict.
    service_locator.get_event_manager()
    late_event_manager = LocalEventManager()

    with pytest.raises(ServiceConflictError, match=r'EventManager is already in use.'):
        service_locator.set_event_manager(late_event_manager)


def test_default_storage_client() -> None:
    """Without any explicit setup the service locator provides a `FileSystemStorageClient`."""
    located = service_locator.get_storage_client()
    assert isinstance(located, FileSystemStorageClient)


def test_custom_storage_client() -> None:
    """A storage client set on the service locator is returned back as the very same object."""
    supplied = MemoryStorageClient()
    service_locator.set_storage_client(supplied)

    assert service_locator.get_storage_client() is supplied


def test_storage_client_overwrite_not_possible() -> None:
    """Setting a second storage client after one was already set raises `ServiceConflictError`."""
    service_locator.set_storage_client(MemoryStorageClient())

    replacement = MemoryStorageClient()
    with pytest.raises(ServiceConflictError):
        service_locator.set_storage_client(replacement)


def test_storage_client_conflict() -> None:
    """Setting a storage client after one was already retrieved raises `ServiceConflictError` with a clear message."""
    # Retrieving the storage client first is what makes the later `set_storage_client` call conflict.
    service_locator.get_storage_client()
    late_storage_client = MemoryStorageClient()

    with pytest.raises(ServiceConflictError, match=r'StorageClient is already in use.'):
        service_locator.set_storage_client(late_storage_client)


================================================
FILE: tests/unit/utils.py
================================================
import sys

import pytest

# Test decorator: on macOS (`sys.platform == 'darwin'`) it is the `run_alone` pytest mark,
# on every other platform it is an identity function that returns its argument unchanged.
run_alone_on_mac = pytest.mark.run_alone if sys.platform == 'darwin' else lambda x: x


================================================
FILE: typos.toml
================================================
# Configuration for typos spell checker
# https://github.com/crate-ci/typos

[default]
extend-ignore-re = [
    "https?://[^\\s]+", # Ignore URLs
    "'gASV[^']+",       # Ignore base64-encoded pickle data
]

[files]
# Extend the default exclude list
extend-exclude = [
    "*.lock",
    "*.min.js",
    "*.min.css",
    "CHANGELOG.md",
]

[default.extend-identifiers]
# Add project-specific identifiers that should not be treated as typos
ser_json_inf_nan = "ser_json_inf_nan" # Pydantic config parameter
asend = "asend" # Python async generator method

[default.extend-words]
# Add project-specific words that should not be treated as typos
mke = "mke" # Sennheiser MKE product name
consts = "consts"  # Common abbreviation for "constants"


================================================
FILE: website/.eslintrc.json
================================================
{
    "extends": [
        "@apify/eslint-config-ts",
        "plugin:react/recommended",
        "plugin:react-hooks/recommended"
    ],
    "parserOptions": {
        "project": "./tsconfig.eslint.json",
        "ecmaFeatures": {
            "jsx": true
        },
        "ecmaVersion": 2020
    },
    "env": {
        "browser": true
    },
    "settings": {
        "react": {
            "version": "detect"
        }
    },
    "rules": {
        "quote-props": ["error", "consistent"],
        "no-void": 0
    },
    "root": true
}


================================================
FILE: website/.yarnrc.yml
================================================
nodeLinker: node-modules
enableGlobalCache: true


================================================
FILE: website/babel.config.js
================================================
// Babel configuration: the only preset is the one shipped inside `@docusaurus/core`,
// resolved to an absolute path via `require.resolve`.
module.exports = {
    presets: [require.resolve('@docusaurus/core/lib/babel/preset')],
};


================================================
FILE: website/build_api_reference.sh
================================================
#!/bin/bash

# Generate import shortcuts from the modules.
# The script is referenced by a relative path, so this must be run from the directory
# that contains `generate_module_shortcuts.py`.
python generate_module_shortcuts.py


================================================
FILE: website/docusaurus.config.js
================================================
/* eslint-disable global-require */
const path = require('path');

const { externalLinkProcessor } = require('./tools/utils/externalLink');

// Preferred order of the API reference sidebar groups.
const GROUP_ORDER = [
    'Autoscaling',
    'Browser management',
    'Configuration',
    'Crawlers',
    'Crawling contexts',
    'Errors',
    'Event data',
    'Event managers',
    'Functions',
    'HTTP clients',
    'HTTP parsers',
    'Request loaders',
    'Session management',
    'Statistics',
    'Storage clients',
    'Storage data',
    'Storages',
    'Other',
];

/**
 * Comparator for sidebar group names (used as the `sortSidebar` option below).
 *
 * Groups listed in `GROUP_ORDER` keep that order. Groups that are not listed are placed after
 * all listed ones and sorted alphabetically among themselves. Ranking every name this way
 * keeps the comparator a consistent ordering; comparing a listed name with an unlisted one
 * alphabetically (as was done before) can produce cycles such as
 * 'Storages' < 'Other' < 'Pipelines' < 'Storages', which makes the sort result unpredictable.
 *
 * @param {string} g1 first group name
 * @param {string} g2 second group name
 * @returns {number} negative when `g1` sorts first, positive when `g2` sorts first, 0 when equal
 */
const groupSort = (g1, g2) => {
    const rank = (group) => {
        const index = GROUP_ORDER.indexOf(group);
        // Unlisted groups share one rank placed after every listed group.
        return index === -1 ? GROUP_ORDER.length : index;
    };

    // Equal ranks only happen for identical listed names or for two unlisted names.
    return rank(g1) - rank(g2) || g1.localeCompare(g2);
};

/** @type {Partial<import('@docusaurus/types').DocusaurusConfig>} */
module.exports = {
    title: 'Crawlee for Python · Fast, reliable Python web crawlers.',
    url: 'https://crawlee.dev',
    baseUrl: '/python/',
    trailingSlash: false,
    organizationName: 'apify',
    projectName: 'crawlee-python',
    scripts: [
        '/python/js/custom.js',
        '/crawlee-python/js/custom.js',
    ],
    githubHost: 'github.com',
    future: {
        experimental_faster: true,
        v4: {
            removeLegacyPostBuildHeadAttribute: true,
            useCssCascadeLayers: false, // this breaks styles on homepage and link colors everywhere
        },
    },
    headTags: [
        // Intercom messenger
        {
            tagName: 'script',
            innerHTML: `window.intercomSettings={api_base:"https://api-iam.intercom.io",app_id:"kod1r788"};`,
            attributes: {},
        },
        // Intercom messenger
        {
            tagName: 'script',
            innerHTML: `(function(){var w=window;var ic=w.Intercom;if(typeof ic==="function"){ic('reattach_activator');ic('update',w.intercomSettings);}else{var d=document;var i=function(){i.c(arguments);};i.q=[];i.c=function(args){i.q.push(args);};w.Intercom=i;var l=function(){var s=d.createElement('script');s.type='text/javascript';s.async=true;s.src='https://widget.intercom.io/widget/kod1r788';var x=d.getElementsByTagName('script')[0];x.parentNode.insertBefore(s,x);};if(document.readyState==='complete'){l();}else if(w.attachEvent){w.attachEvent('onload',l);}else{w.addEventListener('load',l,false);}}})()`,
            attributes: {},
        },
    ],
    favicon: 'img/favicon.ico',
    customFields: {
        markdownOptions: {
            html: true,
        },
        gaGtag: true,
        repoUrl: 'https://github.com/apify/crawlee-python',
    },
    onBrokenLinks: 'throw',
    markdown: {
        mermaid: true,
        hooks: {
            onBrokenMarkdownLinks: 'throw',
        },
    },
    themes: [
        '@docusaurus/theme-mermaid',
    ],
    presets: /** @type {import('@docusaurus/types').PresetConfig[]} */ ([
        [
            '@docusaurus/preset-classic',
            /** @type {import('@docusaurus/preset-classic').Options} */
            ({
                docs: {
                    showLastUpdateAuthor: true,
                    showLastUpdateTime: true,
                    path: '../docs',
                    sidebarPath: './sidebars.js',
                    rehypePlugins: [externalLinkProcessor],
                    // disableVersioning: true,
                    editUrl: (doc) => {
                        return `https://github.com/apify/crawlee-python/edit/master/website/${doc.versionDocsDirPath}/${doc.docPath}`;
                    },
                },
                theme: {
                    customCss: '/src/css/custom.css',
                },
            }),
        ],
    ]),
    plugins: [
        [
            '@apify/docusaurus-plugin-typedoc-api',
            {
                projectRoot: '.',
                changelogs: false,
                readmes: false,
                packages: [{ path: '.' }],
                typedocOptions: {
                    excludeExternals: false,
                },
                sortSidebar: groupSort,
                routeBasePath: 'api',
                pythonOptions: {
                    pythonModulePath: path.join(__dirname, '../src/crawlee'),
                    moduleShortcutsPath: path.join(__dirname, 'module_shortcuts.json'),
                },
            },
        ],
        // [
        //     '@docusaurus/plugin-client-redirects',
        //     {
        //         redirects: [
        //             {
        //                 from: '/docs',
        //                 to: '/docs/quick-start',
        //             },
        //             {
        //                 from: '/docs/next',
        //                 to: '/docs/next/quick-start',
        //             },
        //             {
        //                 from: '/docs/guides/environment-variables',
        //                 to: '/docs/guides/configuration',
        //             },
        //             {
        //                 from: '/docs/guides/getting-started',
        //                 to: '/docs/introduction',
        //             },
        //             {
        //                 from: '/docs/guides/apify-platform',
        //                 to: '/docs/deployment/apify-platform',
        //             },
        //         ],
        //         createRedirects(existingPath) {
        //             if (!existingPath.endsWith('/')) {
        //                 return `${existingPath}/`;
        //             }
        //
        //             return undefined; // Return a falsy value: no redirect created
        //         },
        //     },
        // ],
        [
            'docusaurus-gtm-plugin',
            {
                id: 'GTM-5P7MCS7',
            },
        ],
        [
            '@signalwire/docusaurus-plugin-llms-txt',
            {
                enableDescriptions: false,
                content: {
                    includeVersionedDocs: false,
                    enableLlmsFullTxt: true,
                    relativePaths: false,
                },
            },
        ],
        async function runnableCodeBlock() {
            return {
                name: 'runnable-code-block',
                configureWebpack() {
                    return {
                        resolveLoader: {
                            alias: {
                                'roa-loader': require.resolve(`${__dirname}/roa-loader/`),
                            },
                        },
                    };
                },
            };
        },
        // skipping svgo for animated crawlee logo
        async function doNotUseSVGO() {
            return {
                name: 'docusaurus-svgo',
                configureWebpack(config) {
                    // find the svg rule
                    const svgRule = config.module.rules.find((r) => typeof r === 'object' && r.test.toString() === '/\\.svg$/i');

                    // find the svgr loader
                    const svgrLoader = svgRule?.oneOf?.[0];

                    // make copy of svgr loader and disable svgo
                    const svgrLoaderCopy = JSON.parse(JSON.stringify(svgrLoader));

                    // include only animated logo
                    svgrLoaderCopy.include = /animated-crawlee-logo/;

                    // turn off svgo
                    svgrLoaderCopy.use[0].options.svgo = false;

                    // insert the copy after the original svgr loader
                    svgRule.oneOf.splice(1, 0, svgrLoaderCopy);

                    // exclude animated logo from the first svgr loader (with svgo enabled)
                    svgrLoader.exclude = /animated-crawlee-logo/;

                    return {
                        mergeStrategy: {
                            'module.rules': 'replace',
                        },
                        module: {
                            rules: config.module.rules,
                        },
                    };
                },
            };
        },
        [
            path.resolve(__dirname, 'src/plugins/docusaurus-plugin-segment'),
            {
                writeKey: process.env.SEGMENT_TOKEN,
                allowedInDev: false,
            },
        ],
    ],
    themeConfig:
    /** @type {import('@docusaurus/preset-classic').ThemeConfig} */ ({
        docs: {
            versionPersistence: 'localStorage',
            sidebar: {
                hideable: true,
            },
        },
        navbar: {
            hideOnScroll: true,
            logo: {
                src: 'img/crawlee-python-light.svg',
                srcDark: 'img/crawlee-python-dark.svg',
            },
            title: 'Crawlee for Python',
            items: [
                {
                    type: 'doc',
                    docId: 'quick-start/quick-start',
                    label: 'Docs',
                    position: 'left',
                },
                {
                    type: 'doc',
                    docId: '/examples',
                    label: 'Examples',
                    position: 'left',
                },
                {
                    to: '/api',
                    label: 'API',
                    position: 'left',
                    activeBaseRegex: 'api/(?!.*/changelog)',
                },
                {
                    type: 'doc',
                    label: 'Changelog',
                    docId: 'changelog',
                    className: 'changelog',
                },
                {
                    href: 'https://crawlee.dev/blog',
                    target: '_self',
                    rel: 'dofollow',
                    label: 'Blog',
                    position: 'left',
                },
            ],
        },
        colorMode: {
            defaultMode: 'light',
            disableSwitch: false,
            respectPrefersColorScheme: true,
        },
        prism: {
            defaultLanguage: 'typescript',
            theme: require('prism-react-renderer').themes.github,
            darkTheme: require('prism-react-renderer').themes.dracula,
            additionalLanguages: ['docker', 'log', 'bash', 'diff', 'json'],
        },
        metadata: [
            // eslint-disable-next-line max-len
            { name: 'description', content: `Crawlee helps you build and maintain your Python crawlers. It's open source and modern, with type hints for Python to help you catch bugs early.` },
            // eslint-disable-next-line max-len
            { name: 'og:description', content: `Crawlee helps you build and maintain your Python crawlers. It's open source and modern, with type hints for Python to help you catch bugs early.` },
        ],
        image: 'img/crawlee-python-og.png',
        footer: {
            links: [
                {
                    title: 'Docs',
                    items: [
                        {
                            label: 'Guides',
                            to: 'docs/guides',
                        },
                        {
                            label: 'Examples',
                            to: 'docs/examples',
                        },
                        {
                            label: 'API reference',
                            to: 'api',
                        },
                        {
                            label: 'Changelog',
                            to: 'docs/changelog',
                        },
                    ],
                },
                {
                    title: 'Product',
                    items: [
                        {
                            label: 'Discord',
                            href: 'https://discord.com/invite/jyEM2PRvMU',
                        },
                        {
                            label: 'Stack Overflow',
                            href: 'https://stackoverflow.com/questions/tagged/crawlee-python',
                        },
                        {
                            label: 'Twitter',
                            href: 'https://twitter.com/apify',
                        },
                        {
                            label: 'YouTube',
                            href: 'https://www.youtube.com/apify',
                        },
                    ],
                },
                {
                    title: 'More',
                    items: [
                        {
                            label: 'Apify platform',
                            href: 'https://apify.com',
                        },
                        {
                            label: 'Docusaurus',
                            href: 'https://docusaurus.io',
                        },
                        {
                            label: 'GitHub',
                            href: 'https://github.com/apify/crawlee-python',
                        },
                    ],
                },
            ],
        },
        algolia: {
            appId: '5JC94MPMLY',
            apiKey: '878493fcd7001e3c179b6db6796a999b', // search only (public) API key
            indexName: 'crawlee_python',
            placeholder: 'Search documentation',
            algoliaOptions: {
                facetFilters: ['version:VERSION'],
            },
            translations: {
                button: {
                    buttonText: 'Search documentation...',
                },
            },
        },
    }),
};


================================================
FILE: website/generate_module_shortcuts.py
================================================
#!/usr/bin/env python3

from __future__ import annotations

import importlib
import inspect
import json
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from types import ModuleType


def get_module_shortcuts(module: ModuleType, parent_classes: list | None = None) -> dict:
    """Traverse a module and its submodules to identify and register shortcuts for classes."""
    shortcuts = {}

    if parent_classes is None:
        parent_classes = []

    parent_module_name = '.'.join(module.__name__.split('.')[:-1])
    module_classes = []

    for classname, cls in inspect.getmembers(module, inspect.isclass):
        module_classes.append(cls)
        if cls in parent_classes:
            shortcuts[f'{module.__name__}.{classname}'] = f'{parent_module_name}.{classname}'

    for _, submodule in inspect.getmembers(module, inspect.ismodule):
        if submodule.__name__.startswith('apify'):
            shortcuts.update(get_module_shortcuts(submodule, module_classes))

    return shortcuts


def resolve_shortcuts(shortcuts: dict) -> None:
    """Collapse chains of shortcuts in place.

    If `shortcuts` maps A -> B and B -> C, A is rewritten to map directly to C.
    """
    for origin in shortcuts:
        final_target = shortcuts[origin]
        # Follow the chain until reaching a target that is not itself a shortcut.
        while final_target in shortcuts:
            final_target = shortcuts[final_target]
        shortcuts[origin] = final_target


# Collect the shortcuts of every listed top-level package into one dict.
shortcuts = {}
for module_name in ['crawlee']:
    try:
        module = importlib.import_module(module_name)
        module_shortcuts = get_module_shortcuts(module)
        shortcuts.update(module_shortcuts)
    except ModuleNotFoundError:  # noqa: PERF203
        # A package that cannot be imported is skipped silently and contributes no shortcuts.
        pass

# Collapse chained shortcuts (A -> B and B -> C become A -> C).
resolve_shortcuts(shortcuts)

# Write the result to `module_shortcuts.json`, relative to the current working directory.
with Path('module_shortcuts.json').open('w', encoding='utf-8') as shortcuts_file:
    json.dump(shortcuts, shortcuts_file, indent=4, sort_keys=True)


================================================
FILE: website/package.json
================================================
{
    "name": "crawlee",
    "scripts": {
        "examples": "docusaurus-examples",
        "postinstall": "npx patch-package",
        "start": "rimraf .docusaurus && cp ../CHANGELOG.md ../docs/changelog.md && docusaurus start",
        "start:fast": "rimraf .docusaurus && cp ../CHANGELOG.md ../docs/changelog.md && CRAWLEE_DOCS_FAST=1 docusaurus start",
        "build": "rimraf .docusaurus && cp ../CHANGELOG.md ../docs/changelog.md && node --max_old_space_size=16000 node_modules/@docusaurus/core/bin/docusaurus.mjs build",
        "publish-gh-pages": "docusaurus-publish",
        "write-translations": "docusaurus write-translations",
        "version": "docusaurus version",
        "rename-version": "docusaurus rename-version",
        "prettify": "prettier --write --config ./tools/docs-prettier.config.js ../docs/guides/*.md",
        "swizzle": "docusaurus swizzle",
        "deploy": "rimraf .docusaurus && node --max_old_space_size=16000 node_modules/@docusaurus/core/bin/docusaurus.mjs deploy",
        "docusaurus": "docusaurus",
        "clean": "rimraf .docusaurus build",
        "lint": "yarn lint:code",
        "lint:fix": "yarn lint:code:fix",
        "lint:code": "eslint .",
        "lint:code:fix": "eslint . --fix"
    },
    "dependencies": {
        "@apify/docusaurus-plugin-typedoc-api": "^5.1.0",
        "@apify/utilities": "^2.8.0",
        "@docusaurus/core": "^3.9.2",
        "@docusaurus/faster": "^3.9.2",
        "@docusaurus/mdx-loader": "^3.9.2",
        "@docusaurus/plugin-client-redirects": "^3.9.2",
        "@docusaurus/preset-classic": "^3.9.2",
        "@docusaurus/theme-mermaid": "^3.9.2",
        "@giscus/react": "^3.0.0",
        "@mdx-js/react": "^3.0.1",
        "@mermaid-js/layout-elk": "^0.2.0",
        "@signalwire/docusaurus-plugin-llms-txt": "^1.2.1",
        "axios": "^1.5.0",
        "buffer": "^6.0.3",
        "clsx": "^2.0.0",
        "crypto-browserify": "^3.12.0",
        "docusaurus-gtm-plugin": "^0.0.2",
        "prism-react-renderer": "^2.1.0",
        "process": "^0.11.10",
        "prop-types": "^15.8.1",
        "raw-loader": "^4.0.2",
        "react": "^19.0.0",
        "react-dom": "^19.0.0",
        "react-github-btn": "^1.4.0",
        "react-lite-youtube-embed": "^3.0.0",
        "stream-browserify": "^3.0.0",
        "unist-util-visit": "^5.0.0"
    },
    "devDependencies": {
        "@apify/eslint-config-ts": "^0.4.0",
        "@apify/tsconfig": "^0.1.0",
        "@apify/ui-icons": "^1.23.0",
        "@docusaurus/module-type-aliases": "^3.9.2",
        "@docusaurus/types": "^3.9.2",
        "@types/react": "^19.0.0",
        "@typescript-eslint/eslint-plugin": "^8.46.0",
        "@typescript-eslint/parser": "^8.46.0",
        "eslint": "^10.0.0",
        "eslint-plugin-react": "^7.37.5",
        "eslint-plugin-react-hooks": "^7.0.0",
        "fs-extra": "^11.1.0",
        "patch-package": "^8.0.0",
        "path-browserify": "^1.0.1",
        "prettier": "^3.0.0",
        "rimraf": "^6.0.0",
        "typescript": "^5.9.3"
    },
    "packageManager": "yarn@4.13.0"
}


================================================
FILE: website/patches/@docusaurus+core+3.4.0.patch
================================================
diff --git a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js
index 903f8dc..b6b60bf 100644
--- a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js
+++ b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js
@@ -30,9 +30,11 @@ function scrollAfterNavigation({ location, previousLocation, }) {
         window.scrollTo(0, 0);
     }
     else {
-        const id = decodeURIComponent(hash.substring(1));
-        const element = document.getElementById(id);
-        element?.scrollIntoView();
+        setTimeout(() => {
+            const id = decodeURIComponent(hash.substring(1));
+            const element = document.getElementById(id);
+            element?.scrollIntoView();
+        }, 100);
     }
 }
 function ClientLifecyclesDispatcher({ children, location, previousLocation, }) {


================================================
FILE: website/patches/@docusaurus+core+3.5.2.patch
================================================
diff --git a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js
index 903f8dc..b6b60bf 100644
--- a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js
+++ b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js
@@ -30,9 +30,11 @@ function scrollAfterNavigation({ location, previousLocation, }) {
         window.scrollTo(0, 0);
     }
     else {
-        const id = decodeURIComponent(hash.substring(1));
-        const element = document.getElementById(id);
-        element?.scrollIntoView();
+        setTimeout(() => {
+            const id = decodeURIComponent(hash.substring(1));
+            const element = document.getElementById(id);
+            element?.scrollIntoView();
+        }, 100);
     }
 }
 function ClientLifecyclesDispatcher({ children, location, previousLocation, }) {


================================================
FILE: website/roa-loader/index.js
================================================
const { createHash } = require('node:crypto');
const { inspect } = require('node:util');

const { urlToRequest } = require('loader-utils');

// Endpoint that encodes and signs a code snippet; the token is read from the
// APIFY_SIGNING_TOKEN environment variable when this module is loaded.
const signingUrl = new URL('https://api.apify.com/v2/tools/encode-and-sign');
signingUrl.searchParams.set('token', process.env.APIFY_SIGNING_TOKEN);
// Signing jobs waiting for the one currently running to finish (see `encodeAndSign`).
const queue = [];
// Signed results keyed by the SHA-1 hex digest of the source (see `getHash`).
const cache = {};
// True while `encodeAndSign` is processing a job.
let working = false;

/**
 * Returns the hex-encoded SHA-1 digest of `source`.
 */
function hash(source) {
    const sha1 = createHash('sha1');
    sha1.update(source);

    return sha1.digest('hex');
}

/**
 * Returns the encoded and signed form of `source`, requesting it from the signing endpoint
 * unless a result for the same source is already cached.
 *
 * @param {string} source code snippet to sign
 * @returns {Promise<string>} the signed value, or 'invalid-token' when the request fails
 *     or the response carries no `data.encoded`
 */
async function getHash(source) {
    const cacheKey = hash(source);

    if (cache[cacheKey]) {
        return cache[cacheKey];
    }

    // Snippets that mention playwright or puppeteer are signed with a larger memory setting.
    const memory = source.match(/playwright|puppeteer/i) ? 4096 : 1024;
    const res = await fetch(signingUrl, {
        method: 'POST',
        body: JSON.stringify({
            input: JSON.stringify({ code: source }),
            options: {
                build: 'latest',
                contentType: 'application/json; charset=utf-8',
                memory,
                timeout: 180,
            },
        }),
        headers: {
            'Content-Type': 'application/json; charset=utf-8',
        },
    });

    if (!res.ok) {
        console.error(`Signing failed: ${res.status} ${res.statusText}`, await res.text());
        return 'invalid-token';
    }

    const body = await res.json();

    if (!body.data || !body.data.encoded) {
        // `inspect(undefined)` is the truthy string 'undefined', so the fallback has to be
        // chosen before calling `inspect`, not with `||` afterwards.
        const reason = body.error ? inspect(body.error) : 'Unknown error';
        console.error(`Signing failed: ${reason}`, body);
        return 'invalid-token';
    }

    cache[cacheKey] = body.data.encoded;
    // Short pause after each successful request to the endpoint.
    await new Promise((resolve) => setTimeout(resolve, 100));

    return body.data.encoded;
}

/**
 * Signs `source` while making sure signing requests run one at a time.
 *
 * The first caller becomes the worker: it signs its own source and then runs every job
 * queued by callers that arrived in the meantime. Those callers receive a promise that is
 * settled when the worker gets to their job.
 *
 * @param {string} source code snippet to sign
 * @returns {Promise<string>} the signed value, or 'invalid-token' when no signing token is set
 */
async function encodeAndSign(source) {
    if (!process.env.APIFY_SIGNING_TOKEN) {
        return 'invalid-token';
    }

    if (working) {
        return new Promise((resolve, reject) => {
            queue.push(() => getHash(source).then(resolve, reject));
        });
    }

    working = true;

    try {
        try {
            return await getHash(source);
        } finally {
            // Drain the queue even when our own request failed; otherwise the promises
            // handed to queued callers would never settle. Queued jobs route their own
            // failures to their caller, so this loop does not throw.
            while (queue.length) {
                await queue.shift()();
            }
        }
    } finally {
        working = false;
    }
}

module.exports = async function (code) {
    if (process.env.CRAWLEE_DOCS_FAST) {
        return { code, hash: 'fast' };
    }

    console.log(`Signing ${urlToRequest(this.resourcePath)}...`, { working, queue: queue.length });
    const codeHash = await encodeAndSign(code);
    return { code, hash: codeHash };
};


================================================
FILE: website/roa-loader/package.json
================================================
{
  "name": "roa-loader",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "loader-utils": "^3.2.1"
  }
}


================================================
FILE: website/sidebars.js
================================================
module.exports = {
    docs: [
        'quick-start/quick-start',
        {
            type: 'category',
            label: 'Introduction',
            collapsed: false,
            link: {
                type: 'doc',
                id: 'introduction/introduction',
            },
            items: [
                'introduction/setting-up',
                'introduction/first-crawler',
                'introduction/adding-more-urls',
                'introduction/real-world-project',
                'introduction/crawling',
                'introduction/scraping',
                'introduction/saving-data',
                'introduction/refactoring',
                'introduction/deployment',
            ],
        },
        {
            type: 'category',
            label: 'Guides',
            collapsed: true,
            link: {
                type: 'generated-index',
                title: 'Guides',
                slug: '/guides',
                keywords: ['guides'],
            },
            items: [
                {
                    type: 'autogenerated',
                    dirName: 'guides',
                },
            ],
        },
        {
            type: 'category',
            label: 'Deployment',
            collapsed: true,
            link: {
                type: 'generated-index',
                title: 'Deployment guides',
                description: 'Here you can find guides on how to deploy your crawlers to various cloud providers.',
                slug: '/deployment',
            },
            items: [
                {
                    type: 'doc',
                    id: 'deployment/apify-platform',
                    label: 'Deploy on Apify',
                },
                {
                    type: 'doc',
                    id: 'deployment/aws-lambda',
                    label: 'Deploy on AWS Lambda'
                },
                {
                    type: 'category',
                    label: 'Deploy to Google Cloud',
                    items: [
                        'deployment/gcp-cloud-run-functions',
                        'deployment/gcp-cloud-run',
                    ],
                },
            ],
        },
        {
            type: 'category',
            label: 'Examples',
            collapsed: true,
            link: {
                type: 'generated-index',
                title: 'Examples',
                slug: '/examples',
                keywords: ['examples'],
            },
            items: [
                {
                    type: 'autogenerated',
                    dirName: 'examples',
                },
            ],
        },
        // {
        //     type: 'category',
        //     label: 'Experiments',
        //     link: {
        //         type: 'generated-index',
        //         title: 'Experiments',
        //         slug: '/experiments',
        //         keywords: ['experiments', 'experimental-features'],
        //     },
        //     items: [
        //         {
        //             type: 'autogenerated',
        //             dirName: 'experiments',
        //         },
        //     ],
        // },
        {
            type: 'category',
            label: 'Upgrading',
            collapsed: true,
            link: {
                type: 'generated-index',
                title: 'Upgrading',
                slug: '/upgrading',
                keywords: ['upgrading'],
            },
            items: [
                {
                    type: 'autogenerated',
                    dirName: 'upgrading',
                },
            ],
        },
        {
            type: 'doc',
            label: 'Changelog',
            id: 'changelog',
        },
    ],
};


================================================
FILE: website/src/components/ApiLink.jsx
================================================
import React from 'react';
import Link from '@docusaurus/Link';
// eslint-disable-next-line import/no-extraneous-dependencies
import { useDocsVersion } from '@docusaurus/theme-common/internal';
import useDocusaurusContext from '@docusaurus/useDocusaurusContext';

// const pkg = require('../../../packages/crawlee/package.json');
//
// const [v1, v2] = pkg.version.split('.');
// const stable = [v1, v2].join('.');

const ApiLink = ({ to, children }) => {
    return (
        <Link to={`/api/${to}`}>{children}</Link>
    );

    // const version = useDocsVersion();
    // const { siteConfig } = useDocusaurusContext();
    //
    // // if (siteConfig.presets[0][1].docs.disableVersioning || version.version === stable) {
    // if (siteConfig.presets[0][1].docs.disableVersioning) {
    //     return (
    //         <Link to={`/api/${to}`}>{children}</Link>
    //     );
    // }
    //
    // return (
    //     <Link to={`/api/${version.version === 'current' ? 'next' : version.version}/${to}`}>{children}</Link>
    // );
};

export default ApiLink;


================================================
FILE: website/src/components/Button.jsx
================================================
import Link from '@docusaurus/Link';
import clsx from 'clsx';
import React from 'react';

import styles from './Button.module.css';
import CrawleeSvg from '../../static/img/crawlee-logo-monocolor.svg';

export default function Button({ children, to, withIcon, type = 'primary', className, isBig }) {
    return (
        <Link to={to} target="_self" rel="dofollow">
            <span className={clsx(
                className,
                styles.button,
                type === 'primary' && styles.buttonPrimary,
                type === 'secondary' && styles.buttonSecondary,
                isBig && styles.big,
            )}>
                {withIcon && <CrawleeSvg />}
                {children}
            </span>
        </Link>
    );
}


================================================
FILE: website/src/components/Button.module.css
================================================
/* Base look shared by every button variant. */
.button {
    display: inline-flex;
    align-items: center;
    text-align: center;
    padding: 8px 16px;
    border-radius: 8px;
    /* Custom properties must be read through var(); a bare `(--name)` is an invalid value and the declaration is dropped. */
    font-family: var(--ifm-font-family-base);
    font-size: 16px;
    font-style: normal;
    font-weight: 500;
    line-height: 24px;
    cursor: pointer;
    transition: background-color 0.2s;

    /* Space between the optional icon and the label. */
    svg {
        margin-right: 8px;
    }
}

/* Primary variant: filled background without a border; icon paths are recolored to the label color. */
.buttonPrimary {
    background-color: var(--color-black-action);
    color: var(--color-text-on-primary);
    border: none;

    path {
        stroke: var(--color-text-on-primary);
        &:first-child {
            fill: var(--color-text-on-primary);
        }
    }
}

.buttonPrimary:hover {
    background-color: var(--color-primary-action-hover);
}

/* Secondary variant: page-background fill with a 1px border; icon paths use the action color. */
.buttonSecondary {
    background-color: var(--color-background);
    color: var(--color-text);
    border: 1px solid var(--color-border);

    path {
        stroke: var(--color-black-action);
        &:first-child {
            fill: var(--color-black-action);
        }
    }
}

/* On hover only the border color changes (to the text color). */
.buttonSecondary:hover {
    border: 1px solid var(--color-text);
}

/* Larger padding, applied together with .button. */
.big {
    padding: 12px 24px;
}

/* TABLET */
@media (min-width: 768px) {
    .button {
        width: auto;
    }
}


================================================
FILE: website/src/components/CopyButton.jsx
================================================
/* eslint-disable max-len */
import clsx from 'clsx';
import React, { useState } from 'react';

import styles from './CopyButton.module.css';

export default function CopyButton({ copyText, compact = false, className }) {
    const [copied, setCopied] = useState(false);
    const copy = async () => {
        await navigator.clipboard.writeText(copyText);
        setCopied(true);
        setTimeout(() => setCopied(false), 2000);
    };
    return <button
        type="button"
        aria-label="Copy code to clipboard"
        title="Copy"
        onClick={copy}
        className={clsx(className, styles.copyButton, compact ? styles.copyButtonCompact : styles.copyButtonDefault)}
    >
        {copied
            ? <svg width="20" height="20" viewBox="0 0 20 20" xmlns="http://www.w3.org/2000/svg">
                <path fillRule="evenodd" clipRule="evenodd" d="M18.0303 5.09467C18.3232 5.38756 18.3232 5.86244 18.0303 6.15533L8.03033 16.1553C7.73744 16.4482 7.26256 16.4482 6.96967 16.1553L2.59467 11.7803C2.30178 11.4874 2.30178 11.0126 2.59467 10.7197C2.88756 10.4268 3.36244 10.4268 3.65533 10.7197L7.5 14.5643L16.9697 5.09467C17.2626 4.80178 17.7374 4.80178 18.0303 5.09467Z" />
            </svg>

            : <svg width="20" height="20" viewBox="0 0 20 20" xmlns="http://www.w3.org/2000/svg">
                <path
                    fillRule="evenodd"
                    clipRule="evenodd"
                    d="M8.375 2.375C7.13236 2.375 6.125 3.38236 6.125 4.625V6.125H4.625C3.38236 6.125 2.375 7.13236 2.375 8.375V15.375C2.375 16.6176 3.38236 17.625 4.625 17.625H11.625C12.8676 17.625 13.875 16.6176 13.875 15.375V13.875H15.375C16.6176 13.875 17.625 12.8676 17.625 11.625V4.625C17.625 3.38236 16.6176 2.375 15.375 2.375H8.375ZM13.875 12.375H15.375C15.7892 12.375 16.125 12.0392 16.125 11.625V4.625C16.125 4.21079 15.7892 3.875 15.375 3.875H8.375C7.96079 3.875 7.625 4.21079 7.625 4.625V6.125H11.625C12.8676 6.125 13.875 7.13236 13.875 8.375V12.375ZM4.625 7.625C4.21079 7.625 3.875 7.96079 3.875 8.375V15.375C3.875 15.7892 4.21079 16.125 4.625 16.125H11.625C12.0392 16.125 12.375 15.7892 12.375 15.375V8.375C12.375 7.96079 12.0392 7.625 11.625 7.625H4.625Z" />
            </svg>
        }
    </button>;
}


================================================
FILE: website/src/components/CopyButton.module.css
================================================
/* Shared reset and layout for the copy button; `all: unset` clears inherited and default button styling first. */
.copyButton {
  all: unset;
  display: inline-flex;
  align-items: center;
  justify-content: center;
  box-sizing: border-box;
  cursor: pointer;
  fill: var(--color-icon);

  svg {
    flex-shrink: 0;
  }
}

/* Default variant: 28x28 bordered box with a hover background. */
.copyButtonDefault {
  width: 28px;
  height: 28px;
  background-color: var(--color-background-muted);
  border: 1px solid var(--color-border);
  border-radius: 6px;
  transition: background-color 0.12s ease-out;

  &:hover {
      background-color: var(--color-hover);
  }

  svg {
    padding: 1px;
  }
}

/* Compact variant: only shrinks the icon to 16x16; no box styling of its own. */
.copyButtonCompact {
  svg {
    width: 16px;
    height: 16px;
  }
}

================================================
FILE: website/src/components/Gradients.jsx
================================================
import React from 'react';

export default function Gradients() {
    return (
        <svg xmlns="http://www.w3.org/2000/svg" width="0" height="0" viewBox="0 0 0 0" fill="none">
            <defs>
                <linearGradient id="gradient-1" x1="26.6667" y1="12" x2="14.2802" y2="34.5208"
                                gradientUnits="userSpaceOnUse">
                    <stop offset="0%" stop-color="#9dceff"/>
                    <stop offset="70%" stop-color="#4584b6"/>
                    <stop offset="100%" stop-color="#4584b6"/>
                </linearGradient>
                <linearGradient id="gradient-2" x1="29.6667" y1="0" x2="-1.80874" y2="26.2295"
                                gradientUnits="userSpaceOnUse">
                <stop offset="0%" stop-color="#4584b6"/>
                </linearGradient>
            </defs>
        </svg>
    );
}


================================================
FILE: website/src/components/Highlights.jsx
================================================
import React from 'react';
import clsx from 'clsx';
import styles from './Highlights.module.css';
import Gradients from './Gradients';

// Data for the feature cards rendered by `Highlights`. Each entry provides a `title`,
// an `Svg` component (the default export of the required SVG file) and a `description` fragment.
// Commented-out entries are features that are currently not shown.
const FeatureList = [
    {
        title: 'Python with type hints',
        Svg: require('../../static/img/features/runs-on-py.svg').default,
        description: (
            <>
                Crawlee for Python is written in a modern way using type hints, providing code completion in your IDE
                and helping you catch bugs early on build time.
            </>
        ),
    },
    // {
    //     title: 'HTTP scraping',
    //     Svg: require('../../static/img/features/fingerprints.svg').default,
    //     description: (
    //         <>
    //             Crawlee makes HTTP requests that <a href="https://crawlee.dev/docs/guides/avoid-blocking"><b>mimic browser headers and TLS fingerprints</b></a>.
    //             It also rotates them automatically based on data about real-world traffic. Popular HTML
    //             parsers <b><a href="https://crawlee.dev/docs/guides/cheerio-crawler-guide">Cheerio</a>&nbsp;
    //             and <a href="https://crawlee.dev/docs/guides/jsdom-crawler-guide">JSDOM</a></b> are included.
    //         </>
    //     ),
    // },
    {
        title: 'Headless browsers',
        Svg: require('../../static/img/features/works-everywhere.svg').default,
        description: (
            <>
                Switch your crawlers from HTTP to a <a href="https://crawlee.dev/python/api/class/PlaywrightCrawler">headless browser</a> in 3 lines of code.
                Crawlee builds on top of <b>Playwright</b> and adds its own features. Chrome, Firefox and more.
            </>
        ),

        // TODO: this is not true yet
        // Crawlee builds on top of <b>Playwright</b> and adds its own <b>anti-blocking features and human-like fingerprints</b>. Chrome, Firefox and more.
    },
    {
        title: 'Automatic scaling and proxy management',
        Svg: require('../../static/img/features/auto-scaling.svg').default,
        description: (
            <>
                Crawlee automatically manages concurrency based on <a href="https://crawlee.dev/python/api/class/AutoscaledPool">available system resources</a> and&nbsp;
                <a href="https://crawlee.dev/python/api/class/ProxyConfiguration">smartly rotates proxies</a>.
                Proxies that often time-out, return network errors or bad HTTP codes like 401 or 403 are discarded.
            </>
        ),
    },
    // {
    //     title: 'Queue and Storage',
    //     Svg: require('../../static/img/features/storage.svg').default,
    //     description: (
    //         <>
    //             You can <a href="https://crawlee.dev/docs/guides/result-storage">save files, screenshots and JSON results</a> to disk with one line of code
    //             or plug an adapter for your DB. Your URLs are <a href="https://crawlee.dev/docs/guides/request-storage">kept in a queue</a> that ensures their
    //             uniqueness and that you don't lose progress when something fails.
    //         </>
    //     ),
    // },
    // {
    //     title: 'Helpful utils and configurability',
    //     Svg: require('../../static/img/features/node-requests.svg').default,
    //     description: (
    //         <>
    //             Crawlee includes tools for <a href="https://crawlee.dev/api/utils/namespace/social">extracting social handles</a> or phone numbers, infinite scrolling, blocking
    //             unwanted assets <a href="https://crawlee.dev/api/utils">and many more</a>. It works great out of the box, but also provides&nbsp;
    //             <a href="https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions">rich configuration options</a>.
    //         </>
    //     ),
    // },
];

function Feature({ Svg, title, description }) {
    return (
        <div className={clsx('col col--4')}>
            <div className="padding-horiz--md padding-bottom--md">
                <div className={styles.featureIcon}>
                    {Svg ? <Svg alt={title}/> : null}
                </div>
                <h3>{title}</h3>
                <p>{description}</p>
            </div>
        </div>
    );
}

export default function Highlights() {
    return (
        <section className={styles.features}>
            <Gradients />
            <div className="container">
                <div className="row">
                    {FeatureList.map((props, idx) => (
                        <Feature key={idx} {...props} />
                    ))}
                </div>
            </div>
        </section>
    );
}


================================================
FILE: website/src/components/Highlights.module.css
================================================
/* Container of the feature cards: full width, larger body text. */
.features {
    display: flex;
    align-items: center;
    width: 100%;
    font-size: 18px;
    line-height: 32px;
    color: #41465d;
}

/* Lighter text color in dark mode. */
html[data-theme="dark"] .features {
    color: #b3b8d2;
}

/* NOTE(review): Highlights.jsx references only `features` and `featureIcon`; confirm `.feature` is still used somewhere. */
.feature svg {
    height: 60px;
    width: 60px;
}

/* The first child path of a feature icon is filled with the multi-stop gradient. */
.features svg path:nth-child(1) {
    fill: url(#gradient-1) !important;
}

/*
 * Every later path gets the single-stop gradient. `n + 2` starts at the second child;
 * `n + 1` would match every child and, coming later with equal specificity,
 * would override the rule above so that gradient-1 is never applied.
 */
.features svg path:nth-child(n + 2) {
    fill: url(#gradient-2) !important;
}

/* Dark-mode background of the icon box; wins over the base rule below through higher specificity. */
html[data-theme="dark"] .featureIcon {
    background: #272c3d;
}

/* 48x48 rounded box that centers the feature icon. */
.featureIcon {
    display: flex;
    justify-content: center;
    align-items: center;
    margin-bottom: 24px;
    border-radius: 8px;
    background-color: #f2f3fb;
    width: 48px;
    height: 48px;
}

/* Card headings use the same size as the body text, but bold. */
.features h3 {
    font-weight: 700;
    font-size: 18px;
    line-height: 32px;
}


================================================
FILE: website/src/components/Homepage/HomepageCliExample.jsx
================================================
import React from 'react';

import CopyButton from '../CopyButton';
import styles from './HomepageCliExample.module.css';

const cliCommand = `uvx 'crawlee[cli]' create my-crawler`;

export default function CliExample() {
    return (
        <section className={styles.cliExampleSection}>
            <div className={styles.cliExampleTitle}>
                Or start with a template from our CLI
            </div>
            <code className={styles.cliExampleCodeBlock}>
                <pre>
                    <span className={styles.cliCommandPrefix}>$</span>
                    {cliCommand}
                    <CopyButton copyText={cliCommand} />
                </pre>
            </code>
            <div className={styles.cliExampleSubtitle}>
                Built with 🤍 by Apify. Forever free and open-source.
            </div>
        </section>
    );
}


================================================
FILE: website/src/components/Homepage/HomepageCliExample.module.css
================================================
/* Centered single-column section that holds the CLI example. */
.cliExampleSection {
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
    text-align: center;
    padding: 16px;
}

/* Muted lead-in line above the command. */
.cliExampleTitle {
    color: var(--color-text-muted);
    font-size: 18px;
    font-style: normal;
    font-weight: 400;
    line-height: 28px;
    margin-bottom: 16px;
}

/* Wrapper of the command line; the visible box is drawn by the nested <pre>. */
.cliExampleCodeBlock {
    /* The former `width: fit-content` was overridden by this declaration in the same rule, so it was removed. */
    width: 100%;
    height: fit-content;
    padding: 0;
    border: 0;
    margin-bottom: 18px;

    /* Single flex row: prompt, command, copy button. */
    pre {
        margin: 0;
        width: 100%;
        padding: 8px 16px;
        background-color: var(--color-background-muted);
        border: 1px solid var(--color-border);
        display: flex;
        align-items: center;
        gap: 16px;
        font-size: 14px;
        line-height: 20px;

        /* Push the copy button to the right edge. */
        button {
            margin-left: auto;
        }
    }

    /* The `$` prompt is not selectable, so it is left out when the command is selected by hand. */
    .cliCommandPrefix {
        color: var(--color-text-muted);
        user-select: none;
    }

    /* TABLET */
    @media (min-width: 768px) {
        max-width: 526px;
    }
}

/* Subtle footer line below the command. */
.cliExampleSubtitle {
    color: var(--color-text-subtle);
    font-size: 16px;
    font-style: normal;
    font-weight: 400;
    line-height: 24px;
}

/* TABLET */
/* From 768px up the section gets larger vertical padding and no horizontal padding. */
@media (min-width: 768px) {
    .cliExampleSection {
        padding: 64px 0;
    }
}


================================================
FILE: website/src/components/Homepage/HomepageCtaSection.jsx
================================================
import { useColorMode } from '@docusaurus/theme-common';
import React from 'react';

import AnimatedLogoDark from './animated-crawlee-logo-dark.svg';
import AnimatedLogoLight from './animated-crawlee-logo-light.svg';
import styles from './HomepageCtaSection.module.css';
import homepageStyles from '../../pages/index.module.css';
import Button from '../Button';

export default function HomepageCtaSection() {
    const { colorMode } = useColorMode();
    return (
        <section className={styles.ctaSection}>
            <h2 className={styles.ctaTitle}>Get started now!</h2>
            <div className={styles.ctaDescription}>
                Crawlee won’t fix broken selectors for you (yet), but it makes
                building and maintaining reliable crawlers faster and easier—so
                you can focus on what matters most.
            </div>
            <div className={styles.ctaButtonContainer}>
                <Button to="/docs/quick-start" withIcon type="primary" isBig>
                    Get started
                </Button>
            </div>

            <div
                className={homepageStyles.fadedOutSeparator}
                id={styles.ctaFadedOutSeparator}
            />
            <div
                className={homepageStyles.fadedOutSeparatorVertical}
                id={styles.fadedOutSeparatorVerticalLeft}
            />
            <div
                className={homepageStyles.fadedOutSeparatorVertical}
                id={styles.fadedOutSeparatorVerticalRight}
            />
            <div
                className={homepageStyles.dashedDecorativeCircle}
                id={styles.ctaDashedCircleRight}
            />

            {colorMode === 'dark' ? (
                <AnimatedLogoDark className={styles.ctaImage} />
            ) : (
                <AnimatedLogoLight className={styles.ctaImage} />
            )}
        </section>
    );
}


================================================
FILE: website/src/components/Homepage/HomepageCtaSection.module.css
================================================
/* Centered column; `position: relative` anchors the absolutely positioned decorations below. */
.ctaSection {
    position: relative;
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
    text-align: center;
    padding: 16px;
    padding-bottom: 0;
    gap: 24px;
    overflow: clip;
}

.ctaTitle {
    color: var(--color-text);
    font-family: 'Lota Grotesque';
    font-size: 36px;
    font-style: normal;
    font-weight: 400;
    line-height: 46px;
    margin: 0;
}

.ctaDescription {
    color: var(--color-text-muted);
    font-size: 18px;
    font-style: normal;
    font-weight: 400;
    line-height: 28px;
    max-width: 780px;
}

/* Buttons stack vertically here; the tablet media query switches them to a row. */
.ctaButtonContainer {
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
    text-align: center;
    gap: 16px;
    width: 100%;
}

/* Animated logo; the negative margins pull it up toward the button and trim space below it. */
.ctaImage {
    z-index: -1;
    margin-top: -90px;
    margin-bottom: -30px;
    min-height: 400px;
}

/* Decorative elements, absolutely positioned inside .ctaSection with z-index -2 (below .ctaImage at -1). */
#ctaFadedOutSeparator {
    position: absolute;
    top: 370px;
    width: 100%;
    z-index: -2;
}

#fadedOutSeparatorVerticalLeft {
    position: absolute;
    left: 190px;
    bottom: 0;
    height: 100%;
    z-index: -2;
}

#fadedOutSeparatorVerticalRight {
    position: absolute;
    right: 190px;
    bottom: 0;
    height: 100%;
    z-index: -2;
}

#ctaDashedCircleRight {
    position: absolute;
    right: -120px;
    top: 370px;
    z-index: -2;
}

/* TABLET */
/* From 768px up: more top padding, larger title, buttons side by side. */
@media (min-width: 768px) {
    .ctaSection {
        padding-top: 80px;
    }

    .ctaTitle {
        font-size: 48px;
        line-height: 56px;
    }

    .ctaButtonContainer {
        flex-direction: row;
    }
}


================================================
FILE: website/src/components/Homepage/HomepageHeroSection.jsx
================================================
import React from 'react';

import styles from './HomepageHeroSection.module.css';
import homepageStyles from '../../pages/index.module.css';

export default function HomepageHeroSection() {
    return (
        <section className={styles.hero}>
            <h1 className={styles.heroTitle}>
                Build reliable web scrapers. Fast.
            </h1>
            <div
                className={homepageStyles.dashedSeparator}
                id={styles.separatorHeroHeader}
            />
            <p className={styles.heroSubtitle}>
                Crawlee is a web scraping library for JavaScript and Python. It
                handles blocking, crawling, proxies, and browsers for you.
            </p>
            <div
                className={homepageStyles.dashedSeparator}
                id={styles.separatorHeroHeader2}
            >
                <div
                    className={homepageStyles.dashedDecorativeCircle}
                    id={styles.heroDecorativeCircle}
                />
            </div>
        </section>
    );
}


================================================
FILE: website/src/components/Homepage/HomepageHeroSection.module.css
================================================
/* Centered column for the hero headline and subtitle. */
.hero {
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: center;
    padding: 32px 0;
    h1 {
        padding-inline: 12px;
    }
}

.heroTitle {
    color: var(--color-text);
    font-size: 52px;
    line-height: 60px;
    font-weight: 400;
    text-align: center;
    margin: 0 0 16px 0;
}

.heroSubtitle {
    color: var(--color-text-muted);
    font-size: 18px;
    line-height: 28px;
    font-weight: 400;
    text-align: center;
    margin: 0 16px;
    max-width: 792px;
}

/* Both separators are hidden by default; the media queries in this file turn them on. */
#separatorHeroHeader {
    display: none;
}

#separatorHeroHeader2 {
    display: none;
}

/* Size and offsets of the decorative circle; `position` is not set in this rule. */
#heroDecorativeCircle {
    width: 60px;
    height: 60px;
    right: -60px;
    top: 0px;
}

/* TABLET */
/* From 768px up: larger paddings and title; only the second separator becomes visible. */
@media (min-width: 768px) {
    .hero {
        padding: 64px 0 0 0;
        h1 {
            padding-inline: 24px;
        }
    }
    .heroTitle {
        font-size: 54px;
        line-height: 64px;
        margin: 0 16px 24px 16px;
    }
    .heroSubtitle {
        margin: 0 16px 30px 16px;
    }
    #separatorHeroHeader {
        display: none;
    }
    #separatorHeroHeader2 {
        display: block;
    }
}

/* DESKTOP */
/* From 1024px up the first separator is shown as well. */
@media (min-width: 1024px) {
    .hero {
        padding: 120px 0 0 0;
    }
    .heroSubtitle {
        margin: 30px 16px;
    }
    #separatorHeroHeader {
        display: block;
    }
}


================================================
FILE: website/src/components/Homepage/LanguageInfoWidget.jsx
================================================
import { useColorMode } from '@docusaurus/theme-common';
import ThemedImage from '@theme/ThemedImage';
import clsx from 'clsx';
import React from 'react';
import GitHubButton from 'react-github-btn';

import Button from '../Button';
import CopyButton from '../CopyButton';
import styles from './LanguageInfoWidget.module.css';

export default function LanguageInfoWidget({
    language,
    command,
    to,
    githubUrl,
}) {
    const { colorMode } = useColorMode();
    return (
        <div className={styles.languageGetStartedContainer}>
            {language === 'JavaScript' && (
                <ThemedImage
                    sources={{
                        light: 'img/crawlee-javascript-light.svg',
                        dark: 'img/crawlee-javascript-dark.svg',
                    }}
                    alt="Crawlee JavaScript"
                />
            )}
            {language === 'Python' && (
                <ThemedImage
                    sources={{
                        light: 'img/crawlee-python-light.svg',
                        dark: 'img/crawlee-python-dark.svg',
                    }}
                    alt="Crawlee Python"
                />
            )}
            <div className={clsx(styles.buttonContainer)}>
                <Button to={to}>
                    {command ? 'Learn more' : 'Get started'}
                </Button>
                <GitHubButton
                    href={githubUrl}
                    data-color-scheme={colorMode}
                    data-show-count="true"
                    aria-label="Star crawlee on GitHub"
                    data-size="large"
                    style={{ minHeight: '28px' }}
                >
                    Star
                </GitHubButton>
            </div>
            {command && (
                <code className={styles.commandContainer}>
                    {command} <CopyButton copyText={command} compact />
                </code>
            )}
        </div>
    );
}


================================================
FILE: website/src/components/Homepage/LanguageInfoWidget.module.css
================================================
/* Centered column holding logo, buttons and the optional command. */
.languageGetStartedContainer {
    margin: 0;
    display: flex;
    flex-direction: column;
    align-items: center;
    padding-inline: 12px;
}

.languageGetStartedContainer img {
    height: 40px;
    margin-bottom: 16px;
}

/* Buttons are stacked by default; direct-child spans and links get fixed minimum sizes. */
.buttonContainer {
    display: flex;
    flex-direction: column;
    align-items: center;
    gap: 16px;
    & > span {
        line-height: 0;
        min-height: 28px;
    }
    a,
    a span {
        min-width: 190px;
        text-align: center;
        justify-content: center;
    }
}

/* Applies only when a <code> element directly follows the button container. */
.buttonContainer:has(+ code) {
    margin-bottom: 16px;
    gap: 12px;
}

/* Command text: the default code-block chrome (background, border, padding) is removed. */
.commandContainer {
    margin: 0;
    padding: 0;
    color: var(--color-text);
    font-size: 12px;
    font-style: normal;
    font-weight: 400;
    line-height: 16px;
    background-color: transparent;
    border: 0;
    display: flex;
    align-items: center;
}

/* The copy button stays invisible until the command is hovered or the button receives focus. */
.commandContainer button {
    opacity: 0;
    transition: opacity var(--ifm-transition-fast) ease-in;
}

/* `:focus-within` reveals the button for keyboard users, who would otherwise tab onto an invisible control. */
.commandContainer:hover button,
.commandContainer:focus-within button,
.commandContainer button:hover {
    opacity: 1;
}

/* TABLET */
/* From 768px up: vertical margin around the widget; when a <code> follows, buttons sit in a row without min-width. */
@media (min-width: 768px) {
    .languageGetStartedContainer {
        margin: 24px 0 40px 0;
    }
    .buttonContainer:has(+ code) {
        flex-direction: row;
    }
    .buttonContainer:has(+ code) {
        a,
        a span {
            min-width: 0;
        }
    }
}


================================================
FILE: website/src/components/Homepage/LanguageSwitch.jsx
================================================
import React, { useCallback, useEffect, useRef, useState } from 'react';
import styles from './LanguageSwitch.module.css';
import clsx from 'clsx';

export default function LanguageSwitch({
    options = ['JavaScript', 'Python'],
    defaultOption = 'JavaScript',
    onChange,
}) {
    const [activeOption, setActiveOption] = useState(defaultOption)
    const [backgroundStyle, setBackgroundStyle] = useState({})
    const optionRefs = useRef < (HTMLButtonElement | null)[] > ([])

    const updateBackgroundStyle = useCallback(() => {
        const activeIndex = options.indexOf(activeOption)
        const activeElement = optionRefs.current[activeIndex]
        if (activeElement) {
            const { offsetLeft, offsetWidth } = activeElement
            setBackgroundStyle({
                transform: `translateX(${offsetLeft}px)`,
                width: `${offsetWidth}px`,
            })
        }
    }, [activeOption, options])

    useEffect(() => {
        updateBackgroundStyle()
    }, [updateBackgroundStyle])

    const handleOptionClick = (option) => {
        setActiveOption(option)
        onChange?.(option)
    }

    return (
        <div className={styles.languageSwitch}>
            {options.map((option, index) => (
                <button
                    key={option}
                    ref={(el) => (optionRefs.current[index] = el)}
                    className={clsx(styles.switchOption, option === activeOption && styles.active)}
                    onClick={() => handleOptionClick(option)}
                >
                    {option}
                </button>
            ))}
            <div className={styles.switchBackground} style={backgroundStyle} />
        </div>
    )
}


================================================
FILE: website/src/components/Homepage/LanguageSwitch.module.css
================================================
/* Track of the switch; `position: relative` anchors the sliding background. */
.languageSwitch {
    z-index: 1;
    display: inline-flex;
    position: relative;
    background-color: var(--color-background-subtle);
    border-radius: 6px;
    padding: 4px;
}

/* Option button without native chrome; z-index keeps the label above the sliding background. */
.switchOption {
    position: relative;
    z-index: 1;
    padding: 6px 16px;
    font-size: 14px;
    font-weight: 500;
    color: var(--color-text-muted);
    background: none;
    border: none;
    cursor: pointer;
    transition: color 0.3s ease;
}

.switchOption:hover {
    color: var(--color-text);
}

.switchOption.active {
    color: var(--color-text);
}

/* Highlight behind the active option; its transform and width are set inline by the component and animated here. */
.switchBackground {
    position: absolute;
    top: 4px;
    bottom: 4px;
    left: 0;
    border-radius: 6px;
    background-color: var(--color-background);
    transition:
        transform 0.3s ease,
        width 0.3s ease;
}


================================================
FILE: website/src/components/Homepage/RiverSection.jsx
================================================
import Link from '@docusaurus/Link';
import clsx from 'clsx';
import React from 'react';

import styles from './RiverSection.module.css';

export default function RiverSection({ title, description, content, reversed, to }) {
    return (
        <div className={styles.riverWrapper}>
            <div className={clsx(styles.riverContainer, { [styles.riverReversed]: reversed })}>
                <div className={clsx(styles.riverSection, styles.riverText)}>
                    <h3 className={styles.riverTitle}>{title}</h3>
                    <p className={styles.riverDescription}>{description}</p>
                    <Link className={styles.riverButton} to={to}>
                        Learn more
                    </Link>
                </div>
                <div className={clsx(styles.riverSection, styles.riverContent)}>{content}</div>
            </div>
        </div>
    );
}


================================================
FILE: website/src/components/Homepage/RiverSection.module.css
================================================
/* Base styles */
/* Full-width band with separator lines above and below. */
.riverWrapper {
    width: 100%;
    border-top: 1px solid var(--color-separator);
    border-bottom: 1px solid var(--color-separator);
}

/* Stacked on narrow screens; a row from 768px up, mirrored when .riverReversed is present. */
.riverContainer {
    max-width: 1200px;
    margin: 0 auto;
    display: flex;
    flex-direction: column;

    /* Tablet layout */
    @media (min-width: 768px) {
        flex-direction: row;

        &.riverReversed {
            flex-direction: row-reverse;
        }
    }
}

/* Each half: full width when stacked, a fixed 50% basis when in a row. */
.riverSection {
    width: 100%;

    /* Tablet layout */
    @media (min-width: 768px) {
        min-width: 0;
        flex-basis: 50%;
        flex-grow: 0;
    }
}

/* Padding of the text half grows with the viewport width. */
.riverText {
    padding: 24px 16px;

    /* Tablet layout */
    @media (min-width: 768px) {
        padding: 40px 32px;
    }

    /* Desktop layout */
    @media (min-width: 1024px) {
        padding: 48px 80px;
    }
}

/* Text styles */
.riverTitle {
    flex: 1;
    margin-top: 0;
    margin-bottom: 12px;
    font-size: 32px;
    font-weight: 400;
    line-height: 40px;

    /* Desktop layout */
    @media (min-width: 1024px) {
        max-width: 440px;
    }
}

.riverDescription {
    margin-bottom: 24px;
    color: var(--color-text-muted);
    font-size: 16px;
    line-height: 24px;

    /* Desktop layout */
    @media (min-width: 1024px) {
        max-width: 440px;
    }
}

/* Outlined "Learn more" link sized to its content. */
.riverButton {
    cursor: pointer;
    padding: 8px 12px;
    background-color: transparent;
    border: 1px solid var(--color-border);
    border-radius: 12px;
    display: flex;
    align-items: center;
    justify-content: center;
    font-size: 16px;
    line-height: 24px;
    transition: background-color 0.12s ease-out;
    width: fit-content;
    color: var(--color-text);

    &:hover {
        background-color: var(--color-hover);
        color: var(--color-text);
    }

    path {
        stroke: var(--color-icon);
    }
}

/* Arrow appended after the label; its margin is animated on hover (see the rule below). */
.riverButton::after {
    content: '→';
    margin-inline: 4px;
    transition: margin 0.3s ease;
}

/* On hover the arrow's margin changes from 4px on both sides to 8px on the left only. */
.riverButton:hover {
    color: var(--color-text);
    &::after {
        margin: 0 0 0 8px;
    }
}

/* Content half of the row: muted background, overflow hidden. */
.riverContent {
    min-height: 180px;
    background-color: var(--color-background-muted);
    border-top: 1px solid var(--color-separator);
    display: flex;
    flex-direction: column;
    overflow: hidden;

    /* Images cover the available area up to 284px in height and are centered vertically. */
    img {
        max-height: 284px;
        object-fit: cover;
        height: 100%;
        width: 100%;
        margin-block: auto;
    }

    /* Overrides for embedded code blocks; :global() targets class names that are not scoped to this CSS module. */
    :global(.code-block) {
        flex-grow: 1;
        margin-bottom: 0;
        border-radius: 0;
        box-shadow: none;

        :global(div[class*="codeBlockContent"]) {
            height: 100%;

            pre {
                height: 100%;
                display: flex;
                align-items: center;
                background: var(--color-background-muted) !important;
            }
            code {
                height: auto;
                font-size: 14px;
                background: var(--color-background-muted);
                min-width: initial;
                padding: 16px 8px 16px 4px;

                span::before {
                    margin-right: 16px;
                    left: unset !important;
                    color: var(--color-text-subtle);
                    opacity: 1;
                }
            }
        }
    }

    /* Tablet layout */
    /* Side by side: the separator moves from the top edge to the left edge. */
    @media (min-width: 768px) {
        border-top: none;
        border-left: 1px solid var(--color-separator);
    }

    /* In a reversed row the separator goes on the right edge instead. */
    .riverReversed & {
        /* Tablet layout */
        @media (min-width: 768px) {
            border-left: none;
            border-right: 1px solid var(--color-separator);
        }
    }
}


================================================
FILE: website/src/components/Homepage/ThreeCardsWithIcon.jsx
================================================
import Link from '@docusaurus/Link';
import clsx from 'clsx';
import React from 'react';

import styles from './ThreeCardsWithIcon.module.css';

export default function ThreeCardsWithIcon({ cards }) {
    return (
        <div className={styles.cardsWrapper}>
            {cards?.map((card, index) => {
                const content = (
                    <>
                        <div className={styles.cardIcon}>{card.icon}</div>
                        <h3 className={styles.cardTitle}>{card.title}</h3>
                        <p className={styles.cardDescription}>
                            {card.description}
                        </p>
                        {card.actionLink && (
                            <Link
                                to={card.actionLink.href}
                                className={styles.cardAction}
                            >
                                {card.actionLink.text}
                            </Link>
                        )}
                    </>
                );

                if (card.to) {
                    return (
                        <Link
                            className={clsx(
                                styles.cardItem,
                                styles.cardItemLink,
                            )}
                            to={card.to}
                            key={index}
                        >
                            {content}
                        </Link>
                    );
                }

                return (
                    <div className={styles.cardItem} key={index}>
                        {content}
                    </div>
                );
            })}
        </div>
    );
}


================================================
FILE: website/src/components/Homepage/ThreeCardsWithIcon.module.css
================================================
/* Card container: cards stack vertically by default and sit in a row from 768px up. */
.cardsWrapper {
    display: flex;
    flex-direction: column;
    border-block: 1px solid var(--color-separator);

    @media (min-width: 768px) {
        flex-direction: row;
    }
}

/* Card styles */
.cardItem {
    display: flex;
    /* Every card takes an equal share of the wrapper. */
    flex: 1;
    flex-direction: column;
    padding: 40px 24px;
    background: var(--color-card-background);
    transition: background 0.1s ease;

    /* Stacked layout: separator below each card except the last one. */
    border-bottom: 1px solid var(--color-separator);
    &:last-child {
        border-bottom: 0;
    }

    /* Row layout: the separator moves to the right edge, again skipping the last card. */
    @media (min-width: 768px) {
        border-bottom: 0;
        border-right: 1px solid var(--color-separator);
        &:last-child {
            border-right: 0;
        }
    }
}

/* Only cards rendered as anchors get a hover background. */
a.cardItem:hover {
    background: var(--color-card-background-hover);
}

/* Cards that contain a .cardAction element use a uniform 24px padding instead of 40px 24px. */
.cardItem:has(:local(.cardAction)) {
    padding: 24px;
}

/* Fixed 72x72 bordered tile that centers its content. */
.cardIcon {
    margin-bottom: 16px;
    display: flex;
    align-items: center;
    justify-content: center;

    width: 72px;
    height: 72px;

    border-radius: 6px;
    border: 1px solid var(--color-separator);
    background: var(--color-background);
}

/* Images placed in the tile are constrained to 50px wide. */
.cardIcon img {
    width: 50px;
}

.cardTitle {
    margin: 0;
    margin-bottom: 8px;
    color: var(--color-text);
    font-size: 26px;
    font-style: normal;
    font-weight: 400;
    line-height: 34px;
}

.cardDescription {
    color: var(--color-text-muted);
    font-family: var(--ifm-font-family-base);
    font-size: 16px;
    font-style: normal;
    font-weight: 400;
    line-height: 24px;
    margin: 0;
    margin-bottom: 12px;
}

/* Action link of a card; `margin-top: auto` pushes it to the bottom of the
   column-flex card. */
.cardAction {
    color: var(--color-text-muted);
    font-family: var(--ifm-font-family-base);
    font-size: 16px;
    font-style: normal;
    font-weight: 650;
    line-height: 24px;
    width: fit-content;
    margin-top: auto;
}

/* Arrow glyph appended after the action text; its margin is transitioned. */
.cardAction::after {
    content: "→";
    margin-left: 4px;
    transition: margin 0.3s ease;
}

/* Hover: stronger text color and the arrow moves 4px further from the text. */
.cardAction:hover {
    color: var(--color-text);
    &::after {
        margin-left: 8px;
    }
}


================================================
FILE: website/src/components/LLMButtons.jsx
================================================
import {
    AnthropicIcon,
    ChatGptIcon,
    CheckIcon,
    ChevronDownIcon,
    CopyIcon,
    ExternalLinkIcon,
    LoaderIcon,
    MarkdownIcon,
    PerplexityIcon,
} from '@apify/ui-icons';
import { useLocation } from '@docusaurus/router';
import clsx from 'clsx';
import React, {
    useCallback,
    useEffect,
    useMemo,
    useRef,
    useState,
} from 'react';

import styles from './LLMButtons.module.css';

/**
 * Builds one dropdown entry. The analytics payload is derived from the entry
 * itself: the tracked button text is the visible label and the tracked
 * element is `llm-buttons.<value>`.
 */
const createDropdownOption = ({ label, description, icon, value, showExternalIcon }) => ({
    label,
    description,
    showExternalIcon,
    icon,
    value,
    analytics: {
        buttonText: label,
        element: `llm-buttons.${value}`,
    },
});

// Description shared by the entries that open the page in an external LLM.
const ASK_ABOUT_PAGE_DESCRIPTION = 'Ask questions about this page';

/** Entries shown in the LLM dropdown, in display order. */
const DROPDOWN_OPTIONS = [
    createDropdownOption({
        value: 'copyForLLM',
        label: 'Copy for LLM',
        description: 'Copy page as Markdown for LLMs',
        icon: CopyIcon,
        showExternalIcon: false,
    }),
    createDropdownOption({
        value: 'viewAsMarkdown',
        label: 'View as Markdown',
        description: 'View this page as plain text',
        icon: MarkdownIcon,
        showExternalIcon: true,
    }),
    createDropdownOption({
        value: 'openInChatGPT',
        label: 'Open in ChatGPT',
        description: ASK_ABOUT_PAGE_DESCRIPTION,
        icon: ChatGptIcon,
        showExternalIcon: true,
    }),
    createDropdownOption({
        value: 'openInClaude',
        label: 'Open in Claude',
        description: ASK_ABOUT_PAGE_DESCRIPTION,
        icon: AnthropicIcon,
        showExternalIcon: true,
    }),
    createDropdownOption({
        value: 'openInPerplexity',
        label: 'Open in Perplexity',
        description: ASK_ABOUT_PAGE_DESCRIPTION,
        icon: PerplexityIcon,
        showExternalIcon: true,
    }),
];

const CHAT_GPT_BASE = 'https://chatgpt.com/?hints=search&q=';
const CLAUDE_BASE = 'https://claude.ai/new?q=';
const PERPLEXITY_BASE = 'https://www.perplexity.ai/search/new?q=';

/** Builds the prompt that asks an external LLM to read the given page URL. */
const getPrompt = (currentUrl) => 'Read from '.concat(currentUrl, ' so I can ask questions about it.');
/**
 * Returns the URL of the Markdown version of a page: one trailing slash is
 * dropped from the path and `.md` is appended. Query string and hash are kept.
 *
 * @param {string} currentUrl Absolute URL of the page.
 * @returns {string} Absolute URL of the `.md` resource.
 */
const getMarkdownUrl = (currentUrl) => {
    const parsed = new URL(currentUrl);
    const { pathname } = parsed;
    const basePath = pathname.endsWith('/') ? pathname.slice(0, -1) : pathname;
    parsed.pathname = basePath + '.md';
    return parsed.href;
};

/**
 * Reports a "Clicked" event to `window.analytics`, if it exists.
 * Does nothing outside the browser or when no analytics object is present.
 *
 * @param {string} buttonText Text of the clicked control.
 * @param {string} element Identifier of the clicked element.
 */
const trackClick = (buttonText, element) => {
    if (typeof window === 'undefined') {
        return;
    }

    const { analytics } = window;
    if (!analytics) {
        return;
    }

    const payload = {
        app: 'crawlee',
        button_text: buttonText,
        element,
    };
    analytics.track('Clicked', payload);
};

/**
 * Resolves the link target of a dropdown option.
 *
 * @param {string} value Option identifier (the `value` of a dropdown entry).
 * @param {string} currentUrl Absolute URL of the current page.
 * @returns {string|undefined} The target URL, or `undefined` when there is no
 *   current URL or the option is not a link.
 */
const getOptionHref = (value, currentUrl) => {
    if (!currentUrl) {
        return undefined;
    }

    if (value === 'viewAsMarkdown') {
        return getMarkdownUrl(currentUrl);
    }

    // The remaining link options share one shape: a chat base URL plus the encoded prompt.
    let chatBase;
    if (value === 'openInChatGPT') {
        chatBase = CHAT_GPT_BASE;
    } else if (value === 'openInClaude') {
        chatBase = CLAUDE_BASE;
    } else if (value === 'openInPerplexity') {
        chatBase = PERPLEXITY_BASE;
    } else {
        return undefined;
    }

    return chatBase + encodeURIComponent(getPrompt(currentUrl));
};

const Menu = ({
    className,
    components = {},
    onMenuOpen,
    onSelect,
    options = [],
}) => {
    const [isOpen, setIsOpen] = useState(false);
    const [focusedIndex, setFocusedIndex] = useState(0);
    const menuRef = useRef(null);
    const menuItemRefs = useRef([]);

    const MenuBaseComponent = components.MenuBase;

    const closeMenu = useCallback(() => {
        setIsOpen(false);
        setFocusedIndex(0);
    }, []);

    const toggleMenu = useCallback(() => {
        setIsOpen((prev) => {
            if (!prev) {
                setFocusedIndex(0);
            }
            return !prev;
        });
    }, []);

    const handleKeyDown = useCallback(
        (event) => {
            if (event.key === 'Enter' || event.key === ' ') {
                event.preventDefault();
                toggleMenu();
            } else if (event.key === 'ArrowDown') {
                event.preventDefault();
                if (!isOpen) {
                    toggleMenu();
                } else {
                    setFocusedIndex((prev) => (prev + 1) % options.length);
                }
            } else if (event.key === 'ArrowUp') {
                event.preventDefault();
                if (isOpen) {
                    setFocusedIndex((prev) => (prev - 1 + options.length) % options.length);
                }
            }
        },
        [toggleMenu, isOpen, options.length],
    );

    const handleOptionSelect = useCallback(
        (option, event) => {
            onSelect?.(option, event);
            closeMenu();
        },
        [closeMenu, onSelect],
    );

    const handleMenuItemKeyDown = useCallback(
        (event, option, index) => {
            if (event.key === 'Enter' || event.key === ' ') {
                event.preventDefault();
                event.currentTarget.click();
                return;
            }

            if (event.key === 'ArrowDown') {
                event.preventDefault();
                setFocusedIndex((index + 1) % options.length);
                return;
            }

            if (event.key === 'ArrowUp') {
                event.preventDefault();
                setFocusedIndex((index - 1 + options.length) % options.length);
                return;
            }

            if (event.key === 'Escape') {
                event.preventDefault();
                closeMenu();
            }
        },
        [options.length, closeMenu],
    );

    useEffect(() => {
        onMenuOpen?.(isOpen);
    }, [isOpen, onMenuOpen]);

    useEffect(() => {
        if (isOpen && menuItemRefs.current[focusedIndex]) {
            menuItemRefs.current[focusedIndex].focus();
        }
    }, [isOpen, focusedIndex]);

    useEffect(() => {
        if (!isOpen) {
            return undefined;
        }

        const handleClickOutside = (event) => {
            if (!menuRef.current?.contains(event.target)) {
                closeMenu();
            }
        };

        const handleEscape = (event) => {
            if (event.key === 'Escape') {
                closeMenu();
            }
        };

        document.addEventListener('mousedown', handleClickOutside);
        document.addEventListener('keydown', handleEscape);

        return () => {
            document.removeEventListener('mousedown', handleClickOutside);
            document.removeEventListener('keydown', handleEscape);
        };
    }, [closeMenu, isOpen]);

    return (
        <div className={clsx(styles.menu, className)} ref={menuRef}>
            <MenuBaseComponent
                onClick={toggleMenu}
                onKeyDown={handleKeyDown}
                aria-haspopup="menu"
                aria-expanded={isOpen}
                aria-controls="llm-menu"
            />
            {isOpen && (
                <div className={styles.menuDropdown} role="menu" id="llm-menu">
                    {options.map((option, index) => {
                        const WrapperComponent = option.href ? 'a' : 'button';

                        return (
                            <WrapperComponent
                                key={option.value}
                                ref={(el) => {
                                    menuItemRefs.current[index] = el;
                                }}
                                className={styles.menuOptionWrapper}
                                role="menuitem"
                                tabIndex={0}
                                href={option.href}
                                target={option.target}
                                rel={option.rel}
                                type={option.href ? undefined : 'button'}
                                onClick={(event) => {
                                    if (!option.href) {
                                        event.preventDefault();
                                    }
                                    handleOptionSelect(option, event);
                                }}
                                onKeyDown={(e) => handleMenuItemKeyDown(e, option, index)}
                            >
                                <Option {...option} />
                            </WrapperComponent>
                        );
                    })}
                </div>
            )}
        </div>
    );
};

/**
 * Maps the copy status to the label shown on the main button.
 *
 * @param {{status: string}} params
 * @returns {string} `Copying...` while loading, `Copied` after success and
 *   `Copy for LLM` for every other status.
 */
function getButtonText({ status }) {
    if (status === 'loading') {
        return 'Copying...';
    }

    if (status === 'copied') {
        return 'Copied';
    }

    return 'Copy for LLM';
}

/**
 * Copies the Markdown version of a page to the clipboard and drives the
 * copy-status indicator (`loading` -> `copied` -> `idle`).
 *
 * On failure the error is logged and the status returns to `idle` right away,
 * so the button does not keep showing "Copying..." after the attempt has
 * already failed.
 *
 * @param {object} params
 * @param {(status: string) => void} params.setCopyingStatus Setter for the copy status.
 * @param {string} [params.currentUrl] Page URL; falls back to `window.location.href`.
 * @returns {Promise<void>}
 */
const onCopyAsMarkdownClick = async ({ setCopyingStatus, currentUrl }) => {
    const sourceUrl = currentUrl || (typeof window !== 'undefined' ? window.location.href : '');

    if (!sourceUrl) {
        return;
    }

    trackClick('Copy for LLM', 'llm-buttons.copyForLLM');

    const markdownUrl = getMarkdownUrl(sourceUrl);

    const fetchMarkdown = async () => {
        const response = await fetch(markdownUrl);
        if (!response.ok) {
            throw new Error(`Failed to fetch markdown: ${response.status}`);
        }
        return response.text();
    };

    try {
        setCopyingStatus('loading');

        if (typeof ClipboardItem !== 'undefined' && navigator.clipboard?.write) {
            // Safari requires clipboard writes to be created synchronously inside the user gesture.
            // We therefore pass a Promise that resolves to a Blob into ClipboardItem instead of
            // awaiting fetch first — otherwise Safari would reject the clipboard operation.
            const markdownContent = new ClipboardItem({
                'text/plain': fetchMarkdown()
                    .then((content) => new Blob([content], { type: 'text/plain' })),
            });

            await navigator.clipboard.write([markdownContent]);
        } else {
            // Browsers without ClipboardItem: fall back to the plain-text clipboard API.
            await navigator.clipboard.writeText(await fetchMarkdown());
        }

        // Show success feedback, then return to the idle state.
        setCopyingStatus('copied');
        setTimeout(() => setCopyingStatus('idle'), 2000);
    } catch (error) {
        console.error('Failed to copy markdown content:', error);
        setCopyingStatus('idle');
    }
};

const COPYING_STATUS_ICON = {
    loading: <LoaderIcon size={16} />,
    copied: <CheckIcon size={16} />,
    idle: <CopyIcon size={16} />,
}

const MenuBase = React.forwardRef(({
    copyingStatus,
    setCopyingStatus,
    chevronIconRef,
    currentUrl,
    ...buttonProps
}, ref) => {
    const mergedButtonProps = {
        ...buttonProps,
        tabIndex: buttonProps.tabIndex ?? 0,
    };

    return (
        <div className={styles.llmButtonWrapper}>
            <div
                ref={ref}
                className={styles.llmButton}
                {...mergedButtonProps}
            >
                <div
                    className={styles.copyUpIconWrapper}
                    onClick={(event) => {
                        event.stopPropagation();
                        onCopyAsMarkdownClick({ setCopyingStatus, currentUrl });
                    }}
                >
                    {COPYING_STATUS_ICON[copyingStatus]}
                </div>
                <span
                    onClick={(event) => {
                        event.stopPropagation();
                        onCopyAsMarkdownClick({ setCopyingStatus, currentUrl });
                    }}
                    className={styles.llmButtonText}
                >
                    {getButtonText({ status: copyingStatus })}
                </span>
                <div className={styles.chevronIconWrapper}>
                    <ChevronDownIcon
                        size="16"
                        color="currentColor"
                        className={styles.chevronIcon}
                        ref={chevronIconRef}
                    />
                </div>
            </div>
        </div>
    );
});
MenuBase.displayName = 'MenuBase';

const Option = ({ label, description, showExternalIcon, icon }) => {
    const Icon = icon ?? CopyIcon;

    return (
        <div className={styles.menuOption}>
            <Icon size={16} className={styles.menuOptionIcon} />
            <div className={styles.menuOptionText}>
                <span className={styles.menuOptionLabel}>{label}</span>
                <span className={styles.menuOptionDescription}>{description}</span>
            </div>
            {showExternalIcon && (
                <ExternalLinkIcon
                    size={16}
                    className={styles.menuOptionExternalIcon}
                />
            )}
        </div>
    );
};

export default function LLMButtons() {
    const [copyingStatus, setCopyingStatus] = useState('idle');
    const [isMarkdownAvailable, setIsMarkdownAvailable] = useState(false);
    const chevronIconRef = useRef(null);
    const location = useLocation();

    const currentUrl = typeof window !== 'undefined'
        ? `${window.location.origin}${location.pathname}${location.search}${location.hash}`
        : '';

    useEffect(() => {
        if (!currentUrl) {
            // TODO: Feel free to tell me how to fix this 🤦‍♂️
            // eslint-disable-next-line react-hooks/set-state-in-effect
            setIsMarkdownAvailable(false);
            return undefined;
        }

        const controller = new AbortController();
        const markdownUrl = getMarkdownUrl(currentUrl);

        const checkMarkdownAvailability = async () => {
            try {
                const response = await fetch(markdownUrl, {
                    method: 'HEAD',
                    signal: controller.signal,
                });
                setIsMarkdownAvailable(response.ok);
            } catch (error) {
                if (error.name === 'AbortError') {
                    return;
                }
                setIsMarkdownAvailable(false);
            }
        };

        checkMarkdownAvailability();

        return () => {
            controller.abort();
        };
    }, [currentUrl]);

    const menuOptions = useMemo(
        () => DROPDOWN_OPTIONS.map((option) => {
            const href = getOptionHref(option.value, currentUrl);

            if (option.value === 'viewAsMarkdown') {
                if (!isMarkdownAvailable) {
                    return null;
                }
            }

            return {
                ...option,
                href,
                target: href ? '_blank' : undefined,
                rel: href ? 'noopener noreferrer' : undefined,
            };
        }).filter(Boolean),
        [isMarkdownAvailable, currentUrl],
    );

    const onMenuOptionClick = useCallback(
        (option, event) => {
            if (!option) {
                return;
            }

            if (option.analytics) {
                trackClick(option.analytics.buttonText, option.analytics.element);
            }

            if (option.value === 'copyForLLM') {
                event?.preventDefault();
                onCopyAsMarkdownClick({ setCopyingStatus, currentUrl });
            }
        },
        [setCopyingStatus, currentUrl],
    );

    return (
        <Menu
            className={styles.llmMenu}
            onMenuOpen={(isOpen) => chevronIconRef.current?.classList.toggle(
                styles.chevronIconOpen,
                isOpen,
            )}
            components={{
                MenuBase: (props) => (
                    <MenuBase
                        copyingStatus={copyingStatus}
                        setCopyingStatus={setCopyingStatus}
                        chevronIconRef={chevronIconRef}
                        currentUrl={currentUrl}
                        {...props}
                    />
                ),
            }}
            onSelect={onMenuOptionClick}
            options={menuOptions}
        />
    );
}


================================================
FILE: website/src/components/LLMButtons.module.css
================================================
/* Styles for the "Copy for LLM" split button and its dropdown menu. */

/* Root of the menu; right-aligned by default. */
.llmMenu {
    display: flex;
    justify-content: flex-end;
    flex: 0 0 auto;
}

/* Up to 996px wide the menu spans the full width and aligns to the start. */
@media (max-width: 996px) {
    .llmMenu {
        width: 100%;
        justify-content: flex-start;
    }
}

.llmButtonWrapper {
    display: flex;
    justify-content: flex-end;
    width: auto;
}

/* The split button itself: copy icon, label and chevron in one bordered row. */
.llmButton {
    display: flex;
    align-items: center;
    border-radius: 0.5rem;
    border: 1px solid var(--color-separator);
    background-color: var(--color-background-subtle);
    cursor: pointer;
    transition: background-color 0.2s ease-in-out, border-color 0.2s ease-in-out;
}

.copyUpIconWrapper {
    display: flex;
    align-items: center;
    justify-content: center;
    padding: 0.6rem 0.5rem 0.6rem 0.8rem;
}

/* Label; its right border separates it from the chevron. */
.llmButtonText {
    display: flex;
    align-items: center;
    padding-right: 0.8rem;
    border-right: 1px solid var(--color-separator);
    margin: 0;
    font: 400 0.875rem/1.4 Inter, sans-serif;
}

.chevronIconWrapper {
    display: flex;
    align-items: center;
    justify-content: center;
    padding-inline: 0.25rem;
}

.chevronIcon {
    transition: transform 0.2s ease-in-out;
}

/* Added to the chevron while the menu is open. */
.chevronIconOpen {
    transform: rotate(180deg);
}

/* Positioning context for the absolutely positioned dropdown. */
.menu {
    position: relative;
    width: fit-content;
}

/* Dropdown panel, anchored to the right edge below the button. */
.menuDropdown {
    position: absolute;
    right: 0;
    top: calc(100% + 0.5rem);
    padding: 0.375rem;
    border-radius: 0.75rem;
    border: 1px solid var(--color-separator);
    background-color: var(--color-background);
    box-shadow: 0 12px 32px rgb(10 11 36 / 20%);
    min-width: 17rem;
    max-width: min(20rem, calc(100vw - 1.5rem));
    z-index: 2;
    display: flex;
    flex-direction: column;
    gap: 0.25rem;
}

/* Up to 996px wide the dropdown is anchored to the left edge instead. */
@media (max-width: 996px) {
    .menuDropdown {
        left: 0;
        right: auto;
        width: min(20rem, calc(100vw - 1.5rem));
    }
}

.menuOption {
    display: flex;
    gap: 0.5rem;
    padding: 0.25rem 0.5rem;
    border-radius: 0.5rem;
    transition: background-color 0.15s ease-in-out;
}

.menuOption:hover {
    background: var(--color-hover);
}

/* Reset for the <a>/<button> that wraps each option. The native outline is
   removed here; keyboard focus is drawn on the inner .menuOption instead. */
.menuOptionWrapper {
    border: none;
    background: transparent;
    padding: 0;
    text-align: left;
    width: 100%;
    display: block;
    text-decoration: none;
    color: inherit;
    cursor: pointer;
    outline: none;
}

/* Keyboard focus indicator. An outline is declared explicitly: without it
   `outline-offset` has no effect and focus would look identical to hover. */
.menuOptionWrapper:focus-visible .menuOption {
    background: var(--color-hover);
    outline: 2px solid var(--ifm-color-primary);
    outline-offset: -2px;
}

.menuOptionIcon,
.menuOptionExternalIcon {
    flex-shrink: 0;
}

.menuOptionIcon {
    margin-top: 0.2rem;
}

.menuOptionText {
    flex: 1;
    display: flex;
    flex-direction: column;
    gap: 0.125rem;
    line-height: 1rem;
    padding: 4px 0;
}

.menuOptionLabel {
    margin: 0;
    font-size: 0.875rem;
    line-height: 1rem;
    font-weight: 400;
    color: var(--ifm-font-color-base);
}

.menuOptionDescription {
    margin: 0;
    font-size: 0.8rem;
    color: var(--color-text-subtle);
}

================================================
FILE: website/src/components/RunnableCodeBlock.jsx
================================================
import React from 'react';
import clsx from 'clsx';
import CodeBlock from '@theme/CodeBlock';
import Link from '@docusaurus/Link';
import styles from './RunnableCodeBlock.module.css';

const PYTHON_ACTOR_RUNNER = 'HH9rhkFXiZbheuq1V'

const RunnableCodeBlock = ({ children, actor, hash, ...props }) => {
    hash = hash ?? children.hash;

    if (!children.code) {
        throw new Error(`RunnableCodeBlock requires "code" and "hash" props
Make sure you are importing the code block contents with the roa-loader.`);
    }

    if (!hash) {
        return (
            <CodeBlock {...props}>
                { children.code }
            </CodeBlock>
        );
    }

    const href = `https://console.apify.com/actors/${actor ?? PYTHON_ACTOR_RUNNER}?runConfig=${hash}&asrc=run_on_apify`;

    return (
        <div className={clsx(styles.container, 'runnable-code-block')}>
            <Link href={href} className={styles.button} rel="follow">
                Run on
                <svg width="91" height="25" viewBox="0 0 91 25" fill="none" xmlns="http://www.w3.org/2000/svg" className="apify-logo-light alignMiddle_src-theme-Footer-index-module">
                    <path fill="#246DFF" d="M13.785 0h9.889c.201 0 .364.163.364.363v15.074c0 .361-.47.501-.669.2L13.48.561A.363.363 0 0 1 13.785 0Z"/><path fill="#20A34E" d="M10.253 0H.364A.364.364 0 0 0 0 .363v15.074c0 .361.47.501.669.2L10.558.561A.363.363 0 0 0 10.253 0Z"/><path fill="#F86606" d="M11.85 12.069.616 23.358a.363.363 0 0 0 .259.62h22.298a.363.363 0 0 0 .26-.618L12.37 12.07a.365.365 0 0 0-.52-.001Z"/><path className="apify-logo" fill="#000" d="M77.267 3.298H73.06c-1.317 0-1.881.657-1.881 1.853V6.3h6.13l3.503 8.066L84.315 6.3h3.056l-7.335 16.859h-3.009l2.257-5.206-4.195-9.12h-3.91v9.331H68.17V8.832h-3.268V6.3h3.268V4.565c0-2.298 1.27-3.658 3.973-3.658h5.124v2.391Z"/><path className="apify-logo" fill="#000" fill-rule="evenodd" d="M53.32 6.042c3.102 0 5.641 2.321 5.641 6.19 0 3.893-2.538 6.19-5.641 6.19-2.586 0-3.88-1.594-4.114-2.063v6.776h-2.962V6.3h2.985v1.876c.212-.446 1.505-2.134 4.09-2.134Zm-.776 2.626c-2.045 0-3.362 1.524-3.362 3.564 0 2.017 1.316 3.564 3.362 3.564 2.068 0 3.385-1.547 3.385-3.564 0-2.04-1.317-3.564-3.385-3.564ZM38.44 5.995c3.69 0 5.735 1.923 5.735 4.736v4.01c0 .704.259 1.032.94 1.079v2.415h-.94c-1.48-.024-2.445-.587-2.774-1.642-.587.844-1.81 1.83-3.855 1.83-2.797 0-4.913-1.595-4.913-4.01 0-2.392 1.81-3.682 4.748-3.682h3.903c0-1.43-1.105-2.344-2.845-2.344-1.645 0-2.303.89-2.468 1.195h-3.033c.236-1.266 1.764-3.587 5.501-3.587Zm-.565 6.776c-1.387 0-2.28.516-2.28 1.595 0 1.149 1.081 1.829 2.586 1.829 1.692 0 3.103-.844 3.103-2.415V12.77h-3.409Z" clip-rule="evenodd"/><path className="apify-logo" fill="#000" d="M63.47 18.164h-3.009V6.3h3.01v11.864ZM63.518 4.4H60.39V.837h3.127v3.565Z"/>
                </svg>
            </Link>
            <CodeBlock {...props} className={clsx(styles.codeBlock, 'code-block', props.title != null ? 'has-title' : 'no-title')}>
                { children.code }
            </CodeBlock>
        </div>
    );
};

export default RunnableCodeBlock;


================================================
FILE: website/src/components/RunnableCodeBlock.module.css
================================================
/* Fixed-width button pinned to the top-right corner of its positioned ancestor;
   semi-transparent until hovered. */
.button {
    display: inline-block;
    padding: 3px 10px;
    position: absolute;
    top: calc(var(--ifm-pre-padding) / 2);
    right: 9px;
    z-index: 1;
    font-size: 16px;
    line-height: 28px;
    background: var(--prism-background-color);
    color: var(--prism-color);
    border: 1px solid var(--ifm-color-emphasis-300);
    border-radius: var(--ifm-global-radius);
    opacity: 0.7;
    font-weight: 600;
    width: 155px;
}

@media screen and (max-width: 768px) {
    .button {
        display: none;
    }
}

.button svg {
    height: 20px;
    position: absolute;
    top: 7.5px;
    right: 0;
}

.button:hover {
    opacity: 1;
    color: var(--prism-color);
}

/* Positioning context for the absolutely positioned .button. */
.container {
    position: relative;
}


================================================
FILE: website/src/css/custom.css
================================================
@import url('https://fonts.googleapis.com/css2?family=Be+Vietnam+Pro:wght@400;600;700&display=swap');

/* Dark theme: overrides the --ifm-*, --docsearch-* and --color-* custom properties
   while <html> carries data-theme='dark'. */
html[data-theme='dark'] {
    --ifm-navbar-background-color: #1a1b23;
    --ifm-background-color: #1a1b21;
    --ifm-background-surface-color: #242736;

    --ifm-font-color-base: #f2f3fb;

    --ifm-pre-background: #242736;

    --ifm-color-primary: #5d9df1;
    --ifm-link-color: #5d9df1;
    --ifm-heading-color: #f2f3fb;
    --ifm-navbar-link-color: #f2f3fb;
    --ifm-menu-color-active: #b2b8cc;

    --docusaurus-highlighted-code-line-bg: rgba(255, 255, 255, 0.1);

    --docsearch-text-color: #8c93a8;
    --docsearch-highlight-color: #f3f4fa;

    /* Dark values for the --color-* palette; :root in this file defines the same set. */
    --color-background: #1a1b21;
    --color-background-subtle: #2a2d39;
    --color-background-muted: #252832;
    --color-field-background: #101114;
    --color-separator: #343847;
    --color-border: #414758;
    --color-card-background: #1e2027;
    --color-card-background-hover: #252832;
    --color-text: #f3f4fa;
    --color-text-subtle: #8c93a8;
    --color-text-muted: #b2b8cc;
    --color-text-on-primary: #1a1b21;
    --color-text-placeholder: #6e758a;
    --color-black-action: #fff;
    --color-icon: #b2b8cc;
    --color-hover: #2d313e;
    --color-primary-action-hover: #d1d5e4;
}

/* Default custom properties: --ifm-* and --docsearch-* overrides plus the
   --color-* palette. */
:root {
    /* use default system font based on https://devhints.io/css-system-font-stack */
    --ifm-font-family-base: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif;
    --ifm-heading-font-family: 'Lota Grotesque', sans-serif;
    --ifm-font-weight-semibold: 600;
    --ifm-font-color-base: #242736;

    --ifm-navbar-item-padding-horizontal: 0;
    --ifm-navbar-item-padding-vertical: 0;
    --ifm-navbar-sidebar-width: 100%;

    --ifm-navbar-link-color: #41465d;
    --ifm-navbar-shadow: none;

    --ifm-heading-margin-top: var(--ifm-heading-margin-bottom);
    --ifm-hero-background-color: transparent;

    --ifm-code-background: var(--ifm-pre-background) !important;
    --ifm-code-padding-horizontal: 0.4rem;
    --ifm-code-padding-vertical: 0.2rem;

    --ifm-color-primary-lightest: #5d9df1;
    --ifm-color-primary-lighter: #3a87ee;
    --ifm-color-primary-light: #2e80ed;
    --ifm-color-primary: #1672eb;
    --ifm-color-primary-dark: #1266d5;
    --ifm-color-primary-darker: #1161c9;
    --ifm-color-primary-darkest: #0e50a6;

    --ifm-link-color: hsl(214, 84%, 50%);
    --ifm-link-hover-color: hsl(214, 84%, 65%);
    --ifm-link-hover-decoration: none;
    --ifm-pre-padding: 1.6rem;

    --ifm-footer-background-color: #272c3d;
    --ifm-footer-title-color: #f2f3fb;
    --ifm-footer-link-color: #f2f3fb;
    --ifm-menu-color-active: #555d76;
    --max-layout-width: 1680px;

    --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1);

    --ifm-heading-color: #242736;

    /* --docsearch-highlight-color was declared twice with the same value;
       only this declaration is kept. */
    --docsearch-text-color: #6c7590;
    --docsearch-highlight-color: #242836;

    --color-background: #fff;
    --color-background-subtle: #f3f4fa;
    --color-background-muted: #f8f9fc;
    --color-field-background: #f8f9fc;
    --color-separator: #e0e3f2;
    --color-border: #d0d5e9;
    --color-card-background: #fff;
    --color-card-background-hover: #f8f9fc;

    --color-text: #242836;
    --color-text-subtle: #6c7590;
    --color-text-muted: #555d76;
    --color-text-on-primary: #fff;
    --color-text-placeholder: #969eb8;
    --color-black-action: #272d3e;
    --color-icon: #555d76;
    --color-hover: #eef0f8;
    --color-primary-action-hover: #2b3143;
}

footer,
nav {
    --max-layout-width: 1200px;
}

@font-face {
    font-family: 'Lota Grotesque';
    src: url('/font/lota.woff2') format('woff2'),
         url('/font/lota.woff') format('woff');
    font-weight: 600;
}

.footer__title {
    font-size: 1.25rem;
    font-weight: 600;
}

html .DocSearch-Button {
    border-radius: 6px !important;
    font-weight: 400 !important;
    background: var(--color-field-background) !important;
    border: 1px solid var(--color-border) !important;
    width: 256px;
    height: 40px;
    padding: 0;
    padding-inline: 4px;

    /* Annoying, but needed */
    /* https://stackoverflow.com/questions/26140050/why-is-font-family-not-inherited-in-button-tags-automatically/26140154 */
    font-family: inherit;

    color: var(--color-text-placeholder);

    &:hover {
        color: var(--color-text-muted);
        box-shadow: none !important;
        background: var(--color-field-background) !important;
    }
}
.DocSearch-Button-Placeholder {
    display: block !important;
    font-size: 16px !important;
}

.DocSearch-Search-Icon {
    display: none;
}

div[class*="navbarSearchContainer"] {
    position: static;
}

/* NOTE(review): the `html .DocSearch-Button` rule earlier in this file sets `background`
   and `border` with !important, so both declarations below look overridden — confirm
   whether this rule is still needed. */
html[data-theme="dark"] .DocSearch-Button {
    background: none;
    border: 1px solid var(--docsearch-muted-color);
}

html[data-theme="dark"] .DocSearch-Button .DocSearch-Search-Icon {
    color: var(--docsearch-muted-color);
}

/* Clips horizontal overflow of the main wrapper when <html> has the
   plugin-pages class. */
html.plugin-pages .main-wrapper {
    overflow-x: hidden;
}

/* Caps direct <div> children of the main wrapper at the layout width.
   Assumes --max-layout-width is defined on an ancestor of .main-wrapper — TODO confirm. */
.main-wrapper > div {
    max-width: var(--max-layout-width);
}

/* Link that is a grandchild of <aside>: left padding, and its <b> child is hidden. */
aside > div > a {
    padding-left: 16px;
}

aside > div > a > b {
    display: none;
}

/* Viewports up to 1200px wide: show the navbar toggle and hide navbar items. */
@media (max-width: 1200px) {
    .navbar__toggle {
        display: inherit;
    }
    .navbar__item {
        display: none;
    }
}

/* Viewports up to 767px wide: also hide direct <div>/<a> children of the
   right-hand navbar group. */
@media (max-width: 767px) {
    .navbar__items--right > div,
    .navbar__items--right > a {
        display: none;
    }
}

/* Navbar toggle button and the 20x20 icon inside it. */
.navbar__toggle {
    margin: 0;
    padding: 8px !important;

    svg {
        color: var(--color-icon);
        width: 20px;
        height: 20px;
    }
}

.navbar__title {
    /* Replaced by SVG */
    display: none;
}

/* Centres the navbar content and caps it at the layout width. */
.navbar__inner {
    /* .container */
    max-width: var(--max-layout-width);
    margin: auto;
    width: 100%;
}

/* Item row height: 28px by default, 40px from 768px viewport width upwards. */
.navbar__items {
    height: 28px;
    @media (min-width: 768px) {
        height: 40px;
    }
}

.navbar__items--right {
    gap: 16px;
}

/* Shared typography for navbar items and links, with a muted
   hover/focus state. */
.navbar__item, .navbar__link {
    font-size: 16px;
    font-weight: 500;
    line-height: 24px; /* 150% */
    padding: 0;
    color: var(--color-text);
    border-radius: 12px;

    &:hover,
    &:focus {
        color: var(--color-text-muted);
        background: var(--color-background-muted);
    }
}

/* Re-applies padding to items only; the shared rule above zeroes it for both
   items and links. */
.navbar__item {
    padding: 4px 8px;
}

/* Dropdown items get a wider right padding than plain items. */
.navbar__item.dropdown {
    padding: 4px 16px 4px 8px;
    a {
        display: inline-flex;
    }
}

/* Active link uses the same colours as the hover/focus state. */
.navbar__link--active {
    color: var(--color-text-muted);
    background: var(--color-background-muted);
}

/* Dropdown caret: a 0.3em box with only top and right borders, rotated 135deg. */
.dropdown > .navbar__link::after {
    border-color: currentColor;
    border-style: solid;
    border-width: 0.1em 0.1em 0 0;
    content: '';
    display: inline-block;
    height: 0.3em;
    left: 0.3em;
    position: relative;
    vertical-align: top;
    width: 0.3em;
    top: 8px;
    transform: rotate(135deg);
}

/* Navbar shell. Padding grows with the viewport: 16px, then 20px 40px from
   768px, then 20px 64px from 1024px. */
.navbar {
    border-bottom: 1px solid var(--color-separator);
    height: auto;
    background: var(--color-background);

    padding: 16px;

    @media (min-width: 768px) {
        padding: 20px 40px;
    }
    @media (min-width: 1024px) {
        padding: 20px 64px;
    }
}

/* When the <nav> has a class containing "navbarHidden", hide the element whose
   class contains "navbarLogo". */
nav[class*='navbarHidden'] {
    div[class*='navbarLogo'] {
        display: none;
    }
}

/* Icon-only navbar links: text is collapsed (font-size/line-height 0) and a
   24x24 ::before box is rendered instead. No background-image is set in these
   two rules. */
.navbar .icon {
    font-size: 0;
    padding: 4px;
    margin-left: 20px;
    line-height: 0;
}

.navbar .icon::before {
    content: '';
    display: block;
    width: 24px;
    height: 24px;
    background-size: cover;
}

/* Hides external-link indicator SVGs in the navbar and in <aside>. */
.navbar svg[class*="iconExternalLink"],
aside svg[class*="iconExternalLink"] {
    display: none;
}

header.hero div[class^="heroButtons"] {
    justify-content: inherit;
}

article .card h2 {
    margin-top: 0;
}

/* Single-line truncation with an ellipsis for these link/icon classes. */
.tsd-kind-icon,
.menu__link,
.table-of-contents__link {
    text-overflow: ellipsis;
    width: 100%;
    overflow: hidden;
    white-space: nowrap;
}

.tsd-flag {
    user-select: none;
}

/* Floats the menu caret / sublist marker pseudo-elements to the right. */
.menu__caret:before,
.menu__link--sublist:after {
    float: right;
}

.table-of-contents__link {
    height: 20px;
}

/* Navbar dropdown panel: offset from the top, card background and border. */
nav.navbar .dropdown__menu {
    top: 32px;

    min-width: 6rem;
    background: var(--color-card-background);
    border: 1px solid var(--color-border);
}

.dropdown__menu .dropdown__link {
    width: 100%;
    border-radius: 8px;
}

/* Active and hovered dropdown links share the same muted colours. */
.dropdown__menu .dropdown__link--active {
    color: var(--color-text-muted);
    background: var(--color-background-muted);
}

.dropdown__menu .dropdown__link:hover,
.dropdown__menu .dropdown__link--active:hover {
    background: var(--color-background-muted);
    color: var(--color-text-muted);
}

.navbar__logo {
    height: 2rem;
}

/* Text appended next to the logo; the negative margin pulls it left by 30px. */
.navbar__logo_appendix {
    margin-left: -30px;
    font-weight: bold;
}

/* Sidebar variant of the appendix, absolutely positioned with fixed offsets. */
.navbar__logo_appendix_sidebar {
    display: block;
    position: absolute;
    top: 18px;
    left: 213px;
}

.main-wrapper {
    align-items: safe center;
}

/* Width of the wrapper's direct <div> children: the smaller of 100% and the
   layout width, enforced with !important. */
.main-wrapper > div {
    width: calc(min(100%, var(--max-layout-width))) !important;
}

/* Sidebar logo link: hides its <b> child and sizes/offsets the image. */
.main-wrapper a[class*="sidebarLogo"] {
    margin: 0;

    b {
        display: none;
    }

    img {
        height: 28px;
        margin-top: 4px;
        margin-bottom: 24px;
        margin-left: 24px;
    }
}

div[class*="sidebarViewport"] {
    top: 22px;
}

/* Base type size for pages rendered with the plugin-pages class on <html>. */
html.plugin-pages {
    font-size: 18px;
    line-height: 32px;
}

html.plugin-pages h2 {
    font-size: 36px;
    line-height: 48px;
}

/* Markdown body type scale when <html> has the plugin-docs class. */
html.plugin-docs .theme-doc-markdown {
    font-size: 18px;
    line-height: 32px;
}

html.plugin-docs .theme-doc-markdown h1 {
    font-weight: 600;
    font-size: 48px;
    line-height: 64px;
    color: #000;
}

/* Dark theme: h1 switches from black to white. */
html[data-theme="dark"].plugin-docs .theme-doc-markdown h1 {
    color: #fff;
}

/* Same h1 colour pair when <html> has the plugin-typedoc-api class. */
html.plugin-typedoc-api .theme-doc-markdown h1 {
    color: #000;
}

html[data-theme="dark"].plugin-typedoc-api .theme-doc-markdown h1 {
    color: #fff;
}

html.plugin-docs .theme-doc-markdown h2 {
    font-size: 36px;
    line-height: 48px;
}

html.plugin-docs .theme-doc-markdown h3 {
    font-size: 28px;
    line-height: 36px;
    /*color: #242736;*/
}

.theme-doc-toc-desktop .table-of-contents {
    font-size: 16px;
    line-height: 24px;
}

/* Resting state of sidebar links and .toc-highlight entries: grey text, no
   background, automatic height. */
.theme-doc-sidebar-menu .menu__link,
.theme-doc-toc-desktop .table-of-contents .toc-highlight {
    height: auto;
    color: #6f7490;
    background: none;
}

.theme-doc-sidebar-menu .menu__link:hover {
    background: inherit;
}

.theme-doc-sidebar-menu .menu__link {
    font-weight: 400;
}

/* NOTE(review): the `color` below is overridden by the later rule in this file
   that sets #242736 on the same selector (equal specificity, later wins), and
   by the dark-theme rule in dark mode — confirm which colour is intended. */
.theme-doc-sidebar-menu .menu__link--active {
    font-weight: 700;
    color: var(--color-text-muted);
}

.theme-doc-sidebar-menu .menu__list-item-collapsible,
.theme-doc-sidebar-menu .menu__list-item-collapsible--active {
    background: none;
}

.theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active {
    font-weight: 700;
}

/* Dark theme colours for the resting and active states. These selectors carry
   the extra html[data-theme] part, so they outrank the light-theme rules. */
html[data-theme='dark'] .theme-doc-sidebar-menu .menu__link,
html[data-theme='dark'] .theme-doc-toc-desktop .table-of-contents .toc-highlight {
    color: #b3b8d2;
}

html[data-theme='dark'] .theme-doc-sidebar-menu .menu__link--active,
html[data-theme='dark'] .theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active {
    color: #f2f3fb;
}

/* Hover and active text colour for sidebar and table-of-contents links. */
.theme-doc-sidebar-menu .menu__link:hover,
.theme-doc-sidebar-menu .menu__link--active,
.theme-doc-toc-desktop .table-of-contents .table-of-contents__link:hover,
.theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active {
    color: #242736;
}

.hero {
    position: relative;
}

/* Hides an <h1> that is the first child of .tsd-readme inside .apiItemContainer. */
.apiItemContainer .tsd-readme h1:first-child {
    display: none;
}

/* Removes the border of the docs sidebar container and of buttons inside it. */
html .theme-doc-sidebar-container {
    border: 0;
}

html .theme-doc-sidebar-container button {
    border: 0;
    border-radius: 10px;
}

/* Table of contents: no border on the container, a 2px left rule on its lists. */
html .table-of-contents {
    border-left: 0;
}

html .table-of-contents ul {
    border-left: 2px solid #dfe2f5;
}

/* Typography for ::before labels on the 1st and 6th sidebar entries. This rule
   sets no `content`; without one set elsewhere the pseudo-elements are not
   generated. */
html.plugin-typedoc-api .theme-doc-sidebar-menu > li:first-child::before,
html.plugin-typedoc-api .theme-doc-sidebar-menu > li:nth-child(6)::before {
    text-transform: uppercase;
    font-size: 18px;
    line-height: 28px;
    color: #6f7490;
    padding: 20px 12px;
}

/*
html.plugin-typedoc-api .theme-doc-sidebar-menu > li:first-child::before {
    display: block;
    content: 'Core';
}

html.plugin-typedoc-api .theme-doc-sidebar-menu > li:nth-child(6)::before {
    display: block;
    content: 'Advanced';
    padding-top: 60px;
}
 */

/* Container with id giscus-comments: block display with space above. */
#giscus-comments {
    display: block;
    margin-top: 50px;
}

/* Video wrapper: centred, at most 560px wide, rounded corners clipped via
   overflow: hidden. */
.video-container {
    margin: 85px auto 0;
    max-width: 560px;
    overflow: hidden;
    position: relative;
    width: 100%;
    border-radius: 10px;
}

.yt-lite > .lty-playbtn {
    border: 0;
    cursor: pointer;
}

/* From 768px viewport width: absolutely positions the <div> that directly
   follows the <pre> in untitled runnable code blocks. */
@media screen and (min-width: 768px) {
    .runnable-code-block .code-block.no-title pre + div {
        position: absolute;
        right: 170px;
        line-height: 28px;
    }
}

.runnable-code-block .code-block button {
    height: 36px;
    margin-top: 1px;
}

/* Buttons inside the code block are shown at 40% opacity while the runnable
   block is hovered. */
.runnable-code-block:hover .code-block button {
    opacity: 0.4;
}

/* Dark theme: fills the .apify-logo shape white. */
html[data-theme='dark'] .runnable-code-block svg .apify-logo {
    fill: #fff;
}

/*
 * Reset the line-number counter for each .prism-code scope
 */
.prism-code {
    counter-reset: line-number;
}

/*
 * Line-number gutter: every .token-line gets a ::before pseudo-element that
 * increments and prints the `line-number` counter.
 *
 * Only the languages listed here get numbers. To add one, chain another
 * .language-* class onto .prism-code. Keep each language listed once.
 */
.prism-code.language-ts .token-line::before,
.prism-code.language-typescript .token-line::before,
.prism-code.language-javascript .token-line::before,
.prism-code.language-json .token-line::before,
.prism-code.language-json5 .token-line::before,
.prism-code.language-python .token-line::before,
.prism-code.language-dockerfile .token-line::before,
.prism-code.language-XML .token-line::before,
.prism-code.language-js .token-line::before {
    counter-increment: line-number;
    content: counter(line-number);
    margin-right: calc(var(--ifm-pre-padding) * 0.8);
    text-align: right;
    min-width: 1.5rem;
    display: inline-block;
    opacity: .3;
    /* Sticky so the number stays in view when the code scrolls horizontally. */
    position: sticky;
    left: var(--ifm-pre-padding);
}

/* Announcement bar (class name starts with "announcementBar_"): blue
   background with white text and white button. */
div[class^="announcementBar_"] {
    background: #4585b6;
    color: #fff;
}

div[class^="announcementBar_"] button {
    color: #fff;
}

/* Styles markdown blockquotes using the --ifm-alert-* / --ifm-color-info-*
   variables: the alert variables are redefined locally from the info palette
   and then applied to background, border, radius, shadow and padding. */
.markdown blockquote {
    --ifm-alert-background-color: var(--ifm-color-info-contrast-background);
    --ifm-alert-background-color-highlight: rgba(84,199,236,.15);
    --ifm-alert-foreground-color: var(--ifm-color-info-contrast-foreground);
    --ifm-alert-border-color: var(--ifm-color-info-dark);
    --ifm-code-background: var(--ifm-alert-background-color-highlight);
    --ifm-link-color: var(--ifm-alert-foreground-color);
    --ifm-link-hover-color: var(--ifm-alert-foreground-color);
    --ifm-link-decoration: underline;
    --ifm-tabs-color: var(--ifm-alert-foreground-color);
    --ifm-tabs-color-active: var(--ifm-alert-foreground-color);
    --ifm-tabs-color-active-border: var(--ifm-alert-border-color);
    background-color: var(--ifm-alert-background-color);
    border: var(--ifm-alert-border-width) solid var(--ifm-alert-border-color);
    border-left-width: var(--ifm-alert-border-left-width);
    border-radius: var(--ifm-alert-border-radius);
    box-shadow: var(--ifm-alert-shadow);
    padding: var(--ifm-alert-padding-vertical) var(--ifm-alert-padding-horizontal);
}

/* Spacing and heading sizes for the tsd-* parameter and return sections. */
.tsd-parameters li {
    margin-bottom: 16px;
}

.tsd-parameters-title {
    font-size: 16px;
    margin-bottom: 16px !important;
}

.tsd-returns-title {
    font-size: 16px;
}

/* Keyboard-shortcut key caps inside the search button: flat (no shadow),
   bordered, no padding. */
.DocSearch-Button-Key {
    background: var(--color-background-subtle) !important;
    box-shadow: none !important;
    border: 1px solid var(--color-border) !important;
    padding: 0 !important;
    color: var(--color-text-muted) !important;
}

/* Brand area of the navbar sidebar: stacked vertically with a bottom separator. */
.navbar-sidebar__brand {
    border-bottom: 1px solid var(--color-separator);
    flex-direction: column;
    height: auto;
    padding: 0;
}

/* Primary menu list. Each list item is a flex row separated from the next by
   a bottom border; links inside get their own margin and padding.
   Declarations are kept ahead of nested rules so the outcome does not depend
   on how a CSS engine orders declarations that follow a nested rule. */
.menu-primary {
    padding: 0;
    .menu__list-item {
        display: flex;
        border-bottom: 1px solid var(--color-separator);
        margin: 0px 24px !important;
        a {
            margin: 8px 0px 4px;
            padding: 8px;
        }
    }
    .menu__link {
        font-size: 16px;
        font-weight: 500;
        line-height: 24px;
    }
}

/* Close button of the navbar sidebar: 32x32 icon box (8px padding included in
   the svg rule) whose strokes use the icon colour. */
.navbar-sidebar__close {
    margin-left: 16px;
    svg {
        width: 32px;
        height: 32px;
        padding: 8px;
        g {
            stroke: var(--color-icon);
        }
    }
}

/* Restyles the DocSearch modal. Every rule is nested under .DocSearch-Modal,
   so nothing here affects the navbar search button. Colours come from the
   site's --color-* variables. */
.DocSearch-Modal {
    font-family: var(--ifm-font-family-base);

    border-radius: 8px !important;
    border: 1px solid var(--color-border) !important;
    background: var(--color-card-background) !important;
    box-shadow: none !important;

    button {
        font-family: var(--ifm-font-family-base);
    }

    .DocSearch-Logo {
        display: none;
    }

    /* ----- Footer and keyboard hints ----- */

    .DocSearch-Footer {
        flex-direction: row;
        border-top: 1px solid var(--color-border);
        background: var(--color-background);
        box-shadow: none;
    }

    .DocSearch-Label {
        color: var(--color-text-subtle);
        font-size: 14px;
        font-weight: 400;
        line-height: 20px;
    }

    .DocSearch-Commands-Key {
        border-radius: 4px;
        border: 1px solid var(--color-border);
        background: var(--color-background-subtle);
        box-shadow: none;
        g {
            stroke: var(--color-text-subtle);
        }
    }

    .DocSearch-Clear {
        color: var(--color-text-subtle);
    }

    /* ----- Search form ----- */

    .DocSearch-Form {
        /* Plain value first, presumably as a fallback for the var() form below. */
        border-radius: 6px;
        border-radius: var(--Radius-6, 6px);
        border: 1px solid var(--color-border);
        background: var(--color-background);
        box-shadow: none;
        height: 40px;
        padding: 8px 12px;
    }

    .DocSearch-Input {
        color: var(--color-text);
        font-size: 14px;
        line-height: 20px;
        padding: 0;
    }

    .DocSearch-Input::placeholder {
        color: var(--color-text-placeholder);
        font-style: italic;
    }

    .DocSearch-Search-Icon {
        width: 16px;
        height: 16px;
        path {
            stroke: var(--color-text-muted);
        }
    }

    .DocSearch-Reset {
        display: none;
    }

    /* ----- Hits ----- */

    .DocSearch-Hit-source {
        color: var(--color-text-subtle);
        font-size: 14px;
        font-weight: 400;
        line-height: 20px;
        padding-bottom: 4px;
        padding-left: 12px;
        background: var(--color-card-background);
    }

    /* Hit rows are transparent at rest; hovered or keyboard-selected rows get
       the hover background. */
    .DocSearch-Hit {
        background: transparent;
        a {
            background: transparent !important;
            padding: 0;
            box-shadow: none;
        }
        a:hover {
            background: var(--color-hover) !important;
        }
    }

    .DocSearch-Hit[aria-selected='true'] a {
        background: var(--color-hover) !important;
    }

    .DocSearch-Hit-Container {
        background: transparent;
        height: 50px;
    }

    .DocSearch-Screen-Icon {
        display: none;
    }

    .DocSearch-NoResults {
        margin: 0;
        display: flex;
        flex-direction: column;
        width: 100%;
        padding: 16px 8px;
        gap: 24px;

        .DocSearch-Title {
            color: var(--color-text);
            font-size: 16px;
            font-weight: 500;
            line-height: 24px;
            width: fit-content;
            margin: 0;
        }
    }

    /* Selected and unselected hits are listed together so both states render
       with the same text colours. */
    .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-title,
    .DocSearch-Hit-title {
        color: var(--color-text) !important;
        font-size: 16px;
        font-style: normal;
        font-weight: 500;
        line-height: 24px; /* 150% */
    }

    .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-path,
    .DocSearch-Hit-path,
    .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-action,
    .DocSearch-Hit-action,
    .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-icon,
    .DocSearch-Hit-icon,
    .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-Tree,
    .DocSearch-Hit-Tree {
        color: var(--color-text-muted) !important;
    }

    .DocSearch-Hit[aria-selected='true'] mark,
    .DocSearch-Hit mark {
        color: var(--color-text-subtle) !important;
    }

    /* Single definition of the help text style (an earlier colour-only
       duplicate of this rule was folded into it). */
    .DocSearch-Help {
        color: var(--color-text-subtle);
        font-size: 14px;
        font-weight: 400;
        line-height: 16px;
    }

    .DocSearch-NoResults-Prefill-List {
        padding: 0;
        li {
            list-style-type: none;
            margin-top: 4px;
        }
    }

    .DocSearch-Prefill {
        color: var(--color-text);
        font-size: 14px;
        font-weight: 500;
        line-height: 20px;
        &:hover {
            color: var(--color-text-subtle);
            text-decoration: none;
        }
    }

    .DocSearch-HitsFooter {
        color: var(--color-text-subtle);
        font-size: 14px;
        font-weight: 400;
        line-height: 16px;

        a {
            border: none;
        }

        a:hover {
            color: var(--color-text);
        }
    }

    .DocSearch-Hit-icon {
        margin-left: 8px;
        width: auto;
        height: auto;
        svg {
            width: 16px;
            height: 16px;
        }
    }

    /* List items whose id contains "recentSearches" show no hit icon. */
    li[id*='recentSearches'] {
        .DocSearch-Hit-icon {
            display: none;
        }
    }

    /* ----- Spacing and misc ----- */

    .DocSearch-SearchBar {
        padding: 16px 16px 8px;
    }

    .DocSearch-Hit-Select-Icon {
        display: none !important;
    }

    .DocSearch-Dropdown {
        padding: 0 8px;
    }

    .DocSearch-Cancel {
        color: var(--color-text-subtle);
        font-size: 14px;
        font-weight: 500;
        line-height: 20px;
        &:hover {
            color: var(--color-text);
        }
    }

    .DocSearch-NoResults-Prefill-List ul {
        padding: 0;
    }
}


================================================
FILE: website/src/pages/home_page_example.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl from https://crawlee.dev with Playwright, store each page's URL and title, then export the results."""
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
        headless=True,  # Run in headless mode (set to False to see the browser).
        browser_type='firefox',  # Use Firefox browser.
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page using Playwright API.
        data = {
            'url': context.request.url,
            'title': await context.page.title(),
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

        # Extract all links on the page and enqueue them.
        await context.enqueue_links()

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])

    # Export the entire dataset to a CSV file.
    await crawler.export_data('results.csv')

    # Or access the data directly.
    data = await crawler.get_data()
    crawler.log.info(f'Extracted data: {data.items}')


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: website/src/pages/index.js
================================================
/* eslint-disable max-len */
import Link from '@docusaurus/Link';
import useDocusaurusContext from '@docusaurus/useDocusaurusContext';
import CodeBlock from '@theme/CodeBlock';
import Layout from '@theme/Layout';
import ThemedImage from '@theme/ThemedImage';
import clsx from 'clsx';
import React from 'react';

import styles from './index.module.css';
import Button from '../components/Button';
import HomepageCliExample from '../components/Homepage/HomepageCliExample';
import HomepageCtaSection from '../components/Homepage/HomepageCtaSection';
import HomepageHeroSection from '../components/Homepage/HomepageHeroSection';
import LanguageInfoWidget from '../components/Homepage/LanguageInfoWidget';
import RiverSection from '../components/Homepage/RiverSection';
import RunnableCodeBlock from '../components/RunnableCodeBlock';
import ThreeCardsWithIcon from '../components/Homepage/ThreeCardsWithIcon';

import HomePageExample from '!!raw-loader!roa-loader!./home_page_example.py';

function GetStartedSection() {
    return (
        <section className={styles.languageGetStartedSection}>
            <LanguageInfoWidget
                language="Python"
                githubUrl="https://github.com/apify/crawlee-python"
                to="/python/docs/quick-start"
            />
        </section>
    );
}

function CodeExampleSection() {
    return (
        <section className={styles.codeExampleSection}>
            <div className={styles.decorativeRow} />
            <div className={styles.codeBlockContainer}>
                <RunnableCodeBlock className="language-python" language="python">
                    {HomePageExample}
                </RunnableCodeBlock>
            </div>
            <div className={styles.dashedSeparator} />
            <div className={styles.decorativeRow} />
        </section>
    );
}

// Python snippet rendered by BenefitsSection under "Unblock websites by
// default": builds a fingerprint generator with explicit header options.
const benefitsCodeBlockCrawler = `fingerprint_generator = DefaultFingerprintGenerator(
    header_options=HeaderGeneratorOptions(
        browsers=['chromium', 'firefox'],
        devices=['mobile'],
        locales=['en-US']
    ),
)`;

// Python snippet rendered by BenefitsSection under "One API for headless and
// HTTP": an adaptive Playwright crawler with a default request handler.
// TODO: the original note was left empty — state what is still pending for
// this snippet, or drop the marker.
const benefitsCodeBlockHeadless = `crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

@crawler.router.default_handler
async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    prices = await context.query_selector_all('span.price')
    await context.enqueue_links()`;

function BenefitsSection() {
    return (
        <section className={styles.benefitsSection}>
            <h2>What are the benefits?</h2>
            <RiverSection
                title="Unblock websites by default"
                description="Crawlee crawls stealthily with zero configuration, but you can customize its behavior to overcome any protection. Real-world fingerprints included."
                content={
                    <CodeBlock className="code-block" language="python">
                        {benefitsCodeBlockCrawler}
                    </CodeBlock>
                }
                to="/docs/guides/avoid-blocking"
            />
            <div className={styles.trianglesSeparator} />
            <RiverSection
                title="Work with your favorite tools"
                description="Crawlee integrates BeautifulSoup, Cheerio, Puppeteer, Playwright, and other popular open-source tools. No need to learn new syntax."
                content={
                    <ThemedImage
                        alt="Work with your favorite tools"
                        sources={{
                            light: '/python/img/favorite-tools-light.webp',
                            dark: '/python/img/favorite-tools-dark.webp',
                        }}
                    />
                }
                reversed
                to="/docs/quick-start#choose-your-crawler"
            />
            <div className={styles.trianglesSeparator} />
            <RiverSection
                title="One API for headless and HTTP"
                description="Switch between HTTP and headless without big rewrites thanks to a shared API. Or even let Adaptive crawler decide if JS rendering is needed."
                content={
                    <CodeBlock className="code-block" language="python">
                        {benefitsCodeBlockHeadless}
                    </CodeBlock>
                }
                to="/api"
            />
        </section>
    );
}

function OtherFeaturesSection() {
    return (
        <section className={styles.otherFeaturesSection}>
            <h2>What else is in Crawlee?</h2>
            <div className={styles.cardsWithContentContainer}>
                <div className={styles.cardsWithImageContainer}>
                    <Link className={styles.cardWithImage} to="/docs/guides/scaling-crawlers">
                        <ThemedImage
                            sources={{
                                light: '/python/img/auto-scaling-light.webp',
                                dark: '/python/img/auto-scaling-dark.webp',
                            }}
                            alt=""
                        />
                        <div className={styles.cardWithImageText}>
                            <h3 className={styles.cardWithImageTitle}>
                                Auto scaling
                            </h3>
                            <div className={styles.cardWithImageDescription}>
                                Crawlers automatically adjust concurrency based
                                on available system resources. Avoid memory
                                errors in small containers and run faster in
                                large ones.
                            </div>
                        </div>
                    </Link>
                    <Link className={styles.cardWithImage} to="/docs/guides/proxy-management">
                        <ThemedImage
                            sources={{
                                light: '/python/img/smart-proxy-light.webp',
                                dark: '/python/img/smart-proxy-dark.webp',
                            }}
                            alt=""
                        />
                        <div className={styles.cardWithImageText}>
                            <h3 className={styles.cardWithImageTitle}>
                                Smart proxy rotation
                            </h3>
                            <div className={styles.cardWithImageDescription}>
                                Crawlee uses a pool of sessions represented by
                                different proxies to maintain the proxy
                                performance and keep IPs healthy. Blocked
                                proxies are removed from the pool automatically.
                            </div>
                        </div>
                    </Link>
                </div>
                <ThreeCardsWithIcon
                    cards={[
                        {
                            icon: (
                                <ThemedImage
                                    sources={{
                                        light: '/python/img/queue-light-icon.svg',
                                        dark: '/python/img/queue-dark-icon.svg',
                                    }}
                                    alt=""
                                />
                            ),
                            title: 'Queue and storage',
                            description:
                                'Pause and resume crawlers thanks to a persistent queue of URLs and storage for structured data.',
                            to: '/docs/guides/storages',
                        },
                        {
                            icon: (
                                <ThemedImage
                                    sources={{
                                        light: '/python/img/scraping-utils-light-icon.svg',
                                        dark: '/python/img/scraping-utils-dark-icon.svg',
                                    }}
                                    alt=""
                                />
                            ),
                            title: 'Handy scraping utils',
                            description:
                                'Sitemaps, infinite scroll, contact extraction, large asset blocking and many more utils included.',
                            to: '/docs/guides/avoid-blocking',

                        },
                        {
                            icon: (
                                <ThemedImage
                                    sources={{
                                        light: '/python/img/routing-light-icon.svg',
                                        dark: '/python/img/routing-dark-icon.svg',
                                    }}
                                    alt=""
                                />
                            ),
                            title: 'Routing & middleware',
                            description:
                                'Keep your code clean and organized while managing complex crawls with a built-in router that streamlines the process.',
                            to: '/api/class/Router',
                        },
                    ]}
                />
            </div>
        </section>
    );
}

function DeployToCloudSection() {
    return (
        <section className={styles.deployToCloudSection}>
            <div className={styles.deployToCloudLeftSide}>
                <h2>Deploy to cloud </h2>
                <div className={styles.deployToCloudDescription}>
                    Crawlee, by Apify, works anywhere, but Apify offers the best
                    experience. Easily turn your project into an{' '}
                    <Link to="https://apify.com/actors" rel="dofollow">
                        Actor
                    </Link>
                    —a serverless micro-app with built-in infra, proxies, and
                    storage.
                </div>
                <Button
                    withIcon
                    to="https://docs.apify.com/platform/actors/development/deployment"
                >
                    Deploy to Apify
                </Button>
            </div>
            <div className={styles.deployToCloudRightSide}>
                <div
                    className={styles.dashedSeparatorVertical}
                    id={styles.verticalStepLine}
                />
                <div className={styles.deployToCloudStep}>
                    <div className={styles.deployToCloudStepNumber}>
                        <div>1</div>
                    </div>
                    <div className={styles.deployToCloudStepText}>
                        Install Apify SDK and Apify CLI.
                    </div>
                </div>
                <div className={styles.deployToCloudStep}>
                    <div className={styles.deployToCloudStepNumber}>
                        <div>2</div>
                    </div>
                    <div className={styles.deployToCloudStepText}>
                        Add <pre>Actor.init()</pre> to the beginning and{' '}
                        <pre>Actor.exit()</pre> to the end of your code.
                    </div>
                </div>
                <div className={styles.deployToCloudStep}>
                    <div className={styles.deployToCloudStepNumber}>
                        <div>3</div>
                    </div>
                    <div className={styles.deployToCloudStepText}>
                        Use the Apify CLI to push the code to the Apify
                        platform.
                    </div>
                </div>
            </div>
        </section>
    );
}

function BuildFastScrapersSection() {
    return (
        <section className={styles.buildFastScrapersSection}>
            <div className={styles.dashedDecorativeCircle} />
            <div className={styles.dashedSeparator} />
            <h2>Crawlee helps you build scrapers faster</h2>
            <ThreeCardsWithIcon
                cards={[
                    {
                        icon: (
                            <ThemedImage
                                sources={{
                                    light: '/python/img/zero-setup-light-icon.svg',
                                    dark: '/python/img/zero-setup-dark-icon.svg',
                                }}
                                alt=""
                            />
                        ),
                        title: 'Zero setup required',
                        description:
                            'Copy code example, install Crawlee and go. No CLI required, no complex file structure, no boilerplate.',
                        actionLink: {
                            text: 'Get started',
                            href: '/docs/quick-start',
                        },
                    },
                    {
                        icon: (
                            <ThemedImage
                                sources={{
                                    light: '/python/img/defaults-light-icon.svg',
                                    dark: '/python/img/defaults-dark-icon.svg',
                                }}
                                alt=""
                            />
                        ),
                        title: 'Reasonable defaults',
                        description:
                            'Unblocking, proxy rotation and other core features are already turned on. But also very configurable.',
                        actionLink: {
                            text: 'Learn more',
                            href: '/docs/examples',
                        },
                    },
                    {
                        icon: (
                            <ThemedImage
                                sources={{
                                    light: '/python/img/community-light-icon.svg',
                                    dark: '/python/img/community-dark-icon.svg',
                                }}
                                alt=""
                            />
                        ),
                        title: 'Helpful community',
                        description:
                            'Join our Discord community of over 10k developers and get fast answers to your web scraping questions.',
                        actionLink: {
                            text: 'Join Discord',
                            href: 'https://discord.gg/jyEM2PRvMU',
                        },
                    },
                ]}
            />
        </section>
    );
}

export default function JavascriptHomepage() {
    const { siteConfig } = useDocusaurusContext();
    return (
        <Layout description={siteConfig.description}>
            <div id={styles.homepageContainer}>
                <HomepageHeroSection />
                <GetStartedSection />
                <div className={clsx(styles.dashedSeparator, styles.codeExampleTopSeparator)} />
                <CodeExampleSection />
                <HomepageCliExample />
                <div className={styles.dashedSeparator}>
                    <div
                        className={styles.dashedDecorativeCircle}
                        id={styles.ctaDecorativeCircle}
                    />
                </div>
                <BenefitsSection />
                <div className={styles.dashedSeparator} />
                <OtherFeaturesSection />
                <div className={styles.dashedSeparator} />
                <DeployToCloudSection />
                <div className={styles.dashedSeparator} />
                <BuildFastScrapersSection />
                <HomepageCtaSection />
            </div>
        </Layout>
    );
}


================================================
FILE: website/src/pages/index.module.css
================================================
/************* PAGE LAYOUT *************/

/* Outer homepage column: spans the viewport minus 24px gutters (capped at
   1200px) and draws solid vertical rules on both edges. */
#homepageContainer {
    width: calc(100% - 48px) !important;
    max-width: 1200px !important;
    border-left: 1px solid var(--color-separator);
    border-right: 1px solid var(--color-separator);
    margin: 0 24px;
}

/* Full-width horizontal dashed rule; relatively positioned so that absolutely
   positioned decorations can be anchored to it. */
.dashedSeparator {
    position: relative;
    width: 100%;
    border-bottom: 1px dashed var(--color-separator);
}

/* Vertical counterpart of .dashedSeparator (dashed right edge). */
.dashedSeparatorVertical {
    position: relative;
    border-right: 1px dashed var(--color-separator);
}

/* 120px dashed ring; the translate shifts it by half its own size so that its
   centre lands on the element's top/left offset. */
.dashedDecorativeCircle {
    width: 120px;
    height: 120px;
    border: 1px dashed var(--color-separator);
    border-radius: 50%;
    position: absolute;
    transform: translate(-50%, -50%);
}

/* 1px-high horizontal rule drawn with two stacked gradients instead of a border:
   - top layer (6px-wide tile): transparent first half, page background second
     half, which cuts the line underneath into dashes;
   - bottom layer (full width): separator colour at both ends, fading to
     transparent at the centre. */
.fadedOutSeparator {
    border: none;
    height: 1px;
    background-image:
        linear-gradient(
            90deg,
            transparent,
            transparent 50%,
            var(--color-background) 50%,
            var(--color-background) 100%
        ),
        linear-gradient(
            90deg,
            var(--color-separator) 0%,
            transparent 50%,
            var(--color-separator) 100%
        );
    /* one size per gradient layer, in the same order as background-image */
    background-size:
        6px 1px,
        100% 1px;
}

/* Vertical variant of .fadedOutSeparator: same two layers rotated to 180deg,
   with a 6px-tall dash tile and a full-height fade. */
.fadedOutSeparatorVertical {
    border: none;
    width: 1px;
    background-image:
        linear-gradient(
            180deg,
            transparent,
            transparent 50%,
            var(--color-background) 50%,
            var(--color-background) 100%
        ),
        linear-gradient(
            180deg,
            var(--color-separator) 0%,
            transparent 50%,
            var(--color-separator) 100%
        );
    /* one size per gradient layer, in the same order as background-image */
    background-size:
        1px 6px,
        1px 100%;
}

/* Decorative band filled with the triangles pattern. On mobile it is a single
   centred row (32px high, repeated horizontally only). */
.trianglesSeparator {
    width: 100%;
    height: 32px;
    background-position: center;
    background-repeat: repeat-x;
    background-image: url("../../static/img/triangles_light.svg");

    /* swap to the dark artwork when the html element carries data-theme="dark" */
    html[data-theme="dark"] & {
        background-image: url("../../static/img/triangles_dark.svg");
    }

    /* TABLET */
    /* taller band; the pattern tiles in both directions from the default position */
    @media (min-width: 768px) {
        background-position: unset;
        background-repeat: repeat;
        height: 52px;
    }
}

/* most separators and decorations are not displayed on mobile */
.dashedSeparatorVertical,
.dashedDecorativeCircle,
.fadedOutSeparator,
.fadedOutSeparatorVertical {
    display: none;
}

/* TABLET (min-width queries also match every wider viewport, desktop included) */
@media (min-width: 768px) {
    /* bring the decorations back from tablet width upwards */
    .dashedSeparatorVertical,
    .dashedDecorativeCircle,
    .fadedOutSeparator,
    .fadedOutSeparatorVertical {
        display: block;
    }

    /* 40px gutters; the width calc subtracts both of them */
    #homepageContainer {
        width: calc(100% - 80px) !important;
        margin: 0 40px;
    }
}

/* DESKTOP */
@media (min-width: 1024px) {
    /* The decorations need no rule here: the 768px query above already applies
       to desktop widths and sets them to display: block. */

    /* 64px gutters; the width calc subtracts both of them */
    #homepageContainer {
        width: calc(100% - 128px) !important;
        margin: 0 64px;
    }
}

/************* LANGUAGE GET STARTED SECTION *************/

/* Mobile: the get-started containers are stacked vertically with a 32px gap. */
.languageGetStartedSection {
    display: flex;
    flex-direction: column;
    gap: 32px;
    margin: 0 0 32px 0;

    /* Matches child divs whose class attribute starts with
       "languageGetStartedContainer" and lets them share the space equally. */
    div[class^="languageGetStartedContainer"] {
        flex: 1;
    }
}

/* TABLET */
/* containers sit side by side, stretched to equal height, without gap or outer margin */
@media (min-width: 768px) {
    .languageGetStartedSection {
        flex-direction: row;
        align-items: stretch;
        justify-content: space-around;
        gap: 0;
        margin: 0;
    }
}

/************* CODE EXAMPLE SECTION *************/

/* Separator above the code example: hidden on mobile, shown from 768px up. */
.codeExampleTopSeparator {
    display: none;
}
@media (min-width: 768px) {
    .codeExampleTopSeparator {
        display: block;
    }
}

/* Mobile placement of the language switch: centred, with vertical spacing.
   (Repositioned absolutely in the tablet media query further below.) */
.languageSwitchContainer {
    place-self: center;
    margin: 32px 0 16px 0;
}

/* Restyles the Docusaurus code block (global class .theme-code-block) when it
   is rendered inside the homepage code example. */
.codeBlockContainer {
    :global(.theme-code-block) {
        /* flat block framed by dashed rules at the top and bottom (mobile look) */
        margin-bottom: 32px;
        border-radius: 0;
        box-shadow: none;
        border-bottom: 1px dashed var(--color-separator);
        border-top: 1px dashed var(--color-separator);
        code {
            font-size: 14px;
            background: var(--color-background-muted);
            padding: 16px 8px 16px 4px;

            /* ::before pseudo-elements of the code spans (presumably the line
               numbers — confirm against the theme markup); !important is used
               to win over the theme's own rules */
            span::before {
                margin-right: 16px !important;
                left: unset !important;
                color: var(--color-text-subtle) !important;
                opacity: 1 !important;
            }
        }
    }
}

/* TABLET */
@media (min-width: 768px) {
    /* the mobile top/bottom dashed frame of the code block is dropped here;
       .codeBlockContainer below draws left/right rules instead */
    .codeBlockContainer :global(.theme-code-block) {
        margin-bottom: 0;
        border-bottom: none;
        border-top: none;
    }

    /* positioning context for the absolutely placed language switch */
    .codeExampleSection {
        position: relative;
    }

    /* 46px matches the .decorativeRow height below; the 18px / 90px offsets are
       presumably half the switch's height / width so that it is centred on the
       row's bottom edge — confirm against the switch component's size */
    .languageSwitchContainer {
        margin: 0;
        position: absolute;
        top: calc(46px - 18px);
        left: calc(50% - 90px);
    }

    /* Empty 46px row with a dashed bottom edge and two dashed vertical ticks
       inset 40px from each side (same inset as .codeBlockContainer's margin). */
    .decorativeRow {
        position: relative;
        height: 46px;
        border-bottom: 1px dashed var(--color-separator);

        &::before {
            content: " ";
            position: absolute;
            left: 40px;
            height: 100%;
            border-right: 1px dashed var(--color-separator);
        }

        &::after {
            content: " ";
            position: absolute;
            right: 40px;
            height: 100%;
            border-left: 1px dashed var(--color-separator);
        }
    }

    .codeBlockContainer {
        margin: 0 40px;
        border-left: 1px dashed var(--color-separator);
        border-right: 1px dashed var(--color-separator);
    }
}

/* DESKTOP: widen the inset of the ticks and of the code block from 40px to 60px */
@media (min-width: 1024px) {
    .decorativeRow {
        &::before {
            left: 60px;
        }

        &::after {
            right: 60px;
        }
    }
    .codeBlockContainer {
        margin: 0 60px;
    }
}

/* Size of the decorative circle placed on the separator above the benefits
   section. NOTE(review): these values equal the .dashedDecorativeCircle
   defaults (120px x 120px), so the rule currently looks redundant — confirm
   before removing, since an id selector also raises specificity. */
#ctaDecorativeCircle {
    width: 120px;
    height: 120px;
}

/************** BENEFITS SECTION ***********/

/* Benefits section wrapper with a centred heading. */
.benefitsSection {
    margin-bottom: 60px;

    h2 {
        margin: 32px 0;
        text-align: center;
        padding: 0 12px;

        /* TABLET */
        /* larger vertical spacing around the heading from 768px up */
        @media (min-width: 768px) {
            margin: 80px 0;
        }
    }
}

/************** OTHER FEATURES SECTION ***********/

/* "Other features" section: a vertical stack with a large centred heading. */
.otherFeaturesSection {
    display: flex;
    flex-direction: column;
    /* kept with the section's own declarations, ahead of the nested rules, so
       that every plain declaration of this rule is read in one place */
    margin-bottom: 40px;

    h2 {
        padding: 32px 12px;

        text-align: center;
        color: var(--color-text);
        font-weight: 400;

        /* !important wins over heading sizes defined elsewhere */
        line-height: 46px !important;
        font-size: 36px !important;

        /* TABLET: bigger type and more breathing room */
        @media (min-width: 768px) {
            line-height: 56px !important;
            font-size: 48px !important;
            margin: 80px 0 64px;
            padding: 32px 24px;
        }
    }

    /* TABLET: doubled bottom spacing for the whole section */
    @media (min-width: 768px) {
        margin-bottom: 80px;
    }
}

/* Vertical stack of card groups laid over the triangles pattern background. */
.cardsWithContentContainer {
    display: flex;
    flex-direction: column;
    gap: 20px;
    background-position-x: 5px;
    background-image: url("../../static/img/triangles_light.svg");

    /* dark theme uses the dark variant of the pattern */
    html[data-theme="dark"] & {
        background-image: url("../../static/img/triangles_dark.svg");
    }

    @media (min-width: 768px) {
        gap: 48px;
    }
}

/* Row of image cards: stacked on mobile, side by side from 768px up. */
.cardsWithImageContainer {
    display: flex;
    flex-direction: column;
    gap: 20px;
    width: 100%;

    @media (min-width: 768px) {
        gap: 32px;
        flex-direction: row;
    }
}

/* Single image card: equal-width flex item with the image on top and the text
   below; the background changes on hover. */
.cardWithImage {
    flex: 1;
    display: flex;
    flex-direction: column;
    overflow: hidden;
    background: var(--color-card-background);
    /* mobile: only top and bottom borders */
    border-block: 1px solid var(--color-separator);
    transition: background 0.1s ease;

    /* tablet and up: border on all four sides */
    @media (min-width: 768px) {
        border: 1px solid var(--color-separator);
    }

    /* the outermost cards drop their outer-facing border */
    &:first-child {
        border-left: 0;
    }
    &:last-child {
        border-right: 0;
    }

    &:hover {
        background: var(--color-card-background-hover);
    }
}

/* Card image: fixed 250px height, cropped to fill the card's width. */
.cardWithImage img {
    width: 100%;
    height: 250px;
    object-fit: cover;
}

/* The last card's image is cropped from a different focal point. */
.cardWithImage:last-child img {
    object-position: left 90%;
}

/* Text area of the card, separated from the image by a solid rule. */
.cardWithImageText {
    padding: 40px 24px;
    border-top: 1px solid var(--color-separator);
}

.cardWithImageTitle {
    margin: 0;

    color: var(--color-text);
    font-size: 26px;
    font-style: normal;
    font-weight: 400;
    line-height: 34px;
}

.cardWithImageDescription {
    margin-top: 12px;
    color: var(--color-text-muted);
    font-family: var(--ifm-font-family-base);
    font-size: 16px;
    font-style: normal;
    font-weight: 400;
    line-height: 24px;
}

/************** DEPLOY TO CLOUD SECTION ***********/

/* Mobile layout: the two sides are stacked and centred.
   (Switched to a row in the tablet media query below.) */
.deployToCloudSection {
    padding: 32px 16px;
    display: flex;
    flex-direction: column;
    align-items: center;
    gap: 48px;
}

/* Left side: heading, description and call-to-action link, centred on mobile. */
.deployToCloudLeftSide {
    display: flex;
    flex-direction: column;
    flex-basis: 50%;
    gap: 24px;
    text-align: center;
    font-style: normal;
    font-weight: 400;

    /* the link shrinks to its content and is centred by the auto margins */
    a {
        width: fit-content;
        margin: auto;
    }

    h2 {
        color: var(--color-text);
        font-family: "Lota Grotesque";
        font-size: 38px;
        line-height: 46px;
    }
}

.deployToCloudDescription {
    color: var(--color-text-muted);
    font-size: 16px;
    line-height: 24px;

    /* inline links keep the paragraph colour and are marked by an underline */
    a {
        color: inherit;
        text-decoration: underline;
    }
}

/* Right side: vertical list of steps; relatively positioned so that
   #verticalStepLine can be placed absolutely inside it. */
.deployToCloudRightSide {
    display: flex;
    flex-direction: column;
    gap: 24px;
    flex-basis: 50%;
    position: relative;
}

/* One step: number badge followed by its text. */
.deployToCloudStep {
    display: flex;
    flex-direction: row;
    gap: 16px;
    align-items: center;
}

/* 72px square badge; the nested div is a 40px dashed circle that holds the
   number. z-index: 1 keeps the badge above #verticalStepLine (z-index: 0). */
.deployToCloudStepNumber {
    display: flex;
    justify-content: center;
    align-items: center;
    width: 72px;
    height: 72px;
    padding: 16px;
    border-radius: 8px;
    border: 1px solid var(--color-separator);
    background: var(--color-background);
    color: var(--color-text-muted);
    font-size: 16px;
    font-style: normal;
    font-weight: 400;
    line-height: 24px;
    z-index: 1;
    div {
        display: flex;
        justify-content: center;
        align-items: center;
        height: 40px;
        width: 40px;
        border-radius: 50%;
        border: 1px dashed var(--color-separator);
        flex-shrink: 0;
    }
}

/* Step text; wraps when needed and aligns inline pieces on the baseline. */
.deployToCloudStepText {
    display: inline-flex;
    align-items: baseline;
    flex-wrap: wrap;
    gap: 4px;
    color: var(--color-text);
    font-size: 14px;
    font-style: normal;
    font-weight: 500;
    line-height: 20px;

    /* strip the default box styling from embedded <pre> snippets */
    pre {
        margin: 0;
        padding: 0;
        background-color: transparent;
    }
}

/* Full-height vertical line behind the badges; left: 36px is half of the 72px
   badge width, i.e. the line runs through the badges' horizontal centre. */
#verticalStepLine {
    position: absolute;
    left: 36px;
    height: 100%;
    z-index: 0;
}

/* TABLET */
@media (min-width: 768px) {
    /* the two sides sit next to each other, with more generous padding */
    .deployToCloudSection {
        padding: 96px 40px;
        flex-direction: row;
    }
    .deployToCloudLeftSide {
        text-align: left;

        /* undo the mobile centring of the call-to-action link */
        a {
            margin: 0;
        }

        /* only the size changes here; colour and font family come from the
           base `.deployToCloudLeftSide h2` rule, which this query does not override */
        h2 {
            font-size: 48px;
            line-height: 58px;
        }
    }
    .deployToCloudDescription {
        font-size: 18px;
        line-height: 28px;
    }
}

/************** BUILD SCRAPERS FAST SECTION ***********/

/* "Build fast scrapers" section: centred heading above the content block. */
.buildFastScrapersSection {
    position: relative;

    padding: 40px 0 32px;

    border-bottom: 1px solid var(--color-separator);

    h2 {
        margin: 0;
        /* shorthand first, then the inline override: a later `padding`
           shorthand would reset the inline padding back to 0 */
        padding: 32px 0;
        padding-inline: 12px;
        text-align: center;
        color: var(--color-text);
        font-weight: 400;

        line-height: 46px !important;
        font-size: 36px !important;

        /* TABLET */
        @media (min-width: 768px) {
            /* Same ordering rule as above. Previously `padding-inline: 24px`
               came BEFORE this shorthand and was silently overridden, leaving
               the heading with no inline padding from tablet width upwards. */
            padding: 80px 0 64px;
            padding-inline: 24px;

            line-height: 56px !important;
            font-size: 48px !important;
        }
    }

    /* decorative circles inside this section are hidden below desktop width;
       the attribute selector matches the hashed CSS-module class name */
    div[class*="dashedDecorativeCircle"] {
        display: none;
    }

    /* DESKTOP */
    @media (min-width: 1024px) {
        padding: 80px 0 60px;
        div[class*="dashedDecorativeCircle"] {
            display: block;
        }
    }
}

/* Content block framed by solid rules at the top and bottom. */
.buildFastScrapersContent {
    border-block: 1px solid var(--color-separator);
}


================================================
FILE: website/src/plugins/docusaurus-plugin-segment/index.js
================================================
const path = require('path');

/**
 * Docusaurus plugin that wires Segment analytics into the site.
 *
 * It always registers the `./segment` client module and, when enabled, injects
 * the Segment loader snippet into the page `<head>`.
 *
 * @param {object} context Docusaurus load context (not used by this plugin).
 * @param {object} options Plugin options.
 * @param {string} options.writeKey Segment write key interpolated into the snippet.
 * @param {boolean} [options.allowedInDev=false] Inject the snippet even when
 *     `NODE_ENV` is not `production`.
 * @returns {object} Plugin object with `name`, `getClientModules` and `injectHtmlTags`.
 */
module.exports = function (context, options) {
    const { writeKey, allowedInDev = false } = options;

    return {
        name: 'docusaurus-plugin-segment',

        // Client-side module, resolved relative to this file.
        getClientModules() {
            return [path.resolve(__dirname, './segment')];
        },

        injectHtmlTags() {
            // Outside production nothing is injected unless explicitly allowed.
            if (process.env.NODE_ENV !== 'production' && !allowedInDev) {
                return {};
            }

            // Without a write key the snippet cannot be built: warn and inject nothing.
            if (!writeKey) {
                console.warn('You need to specify a Segment writeKey in the plugin options');
                return {};
            }

            return {
                headTags: [
                    {
                        tagName: 'script',
                        // Minified Segment loader snippet (SNIPPET_VERSION 5.2.0). The write key
                        // is interpolated twice, and `analytics.load` is given a custom
                        // Segment.io apiHost (analytics.apify.com/v1). Keep the text byte-exact.
                        innerHTML: `
            !function(){var i="analytics",analytics=window[i]=window[i]||[];if(!analytics.initialize)if(analytics.invoked)window.console&&console.error&&console.error("Segment snippet included twice.");else{analytics.invoked=!0;analytics.methods=["trackSubmit","trackClick","trackLink","trackForm","pageview","identify","reset","group","track","ready","alias","debug","page","screen","once","off","on","addSourceMiddleware","addIntegrationMiddleware","setAnonymousId","addDestinationMiddleware","register"];analytics.factory=function(e){return function(){if(window[i].initialized)return window[i][e].apply(window[i],arguments);var n=Array.prototype.slice.call(arguments);if(["track","screen","alias","group","page","identify"].indexOf(e)>-1){var c=document.querySelector("link[rel='canonical']");n.push({__t:"bpc",c:c&&c.getAttribute("href")||void 0,p:location.pathname,u:location.href,s:location.search,t:document.title,r:document.referrer})}n.unshift(e);analytics.push(n);return analytics}};for(var n=0;n<analytics.methods.length;n++){var key=analytics.methods[n];analytics[key]=analytics.factory(key)}analytics.load=function(key,n){var t=document.createElement("script");t.type="text/javascript";t.async=!0;t.setAttribute("data-global-segment-analytics-key",i);t.src="https://cdn.segment.com/analytics.js/v1/" + key + "/analytics.min.js";var r=document.getElementsByTagName("script")[0];r.parentNode.insertBefore(t,r);analytics._loadOptions=n};analytics._writeKey="${writeKey}";;analytics.SNIPPET_VERSION="5.2.0";
            analytics.load("${writeKey}", { integrations: { "Segment.io": { apiHost: "analytics.apify.com/v1" } } });
            }}();
            `,
                    },
                ],
            };
        },
    };
};


================================================
FILE: website/src/plugins/docusaurus-plugin-segment/segment.js
================================================
import ExecutionEnvironment from '@docusaurus/ExecutionEnvironment';

/**
 * Client module that reports a Segment page view after every route update.
 */
const segmentClientModule = {
    onRouteUpdate() {
        // Deferring to a zero-delay timeout ensures `window.location` is in sync
        // with the new route by the time it is read.
        setTimeout(() => {
            // Page views are not tracked in development, nor when the Segment
            // snippet has not put `analytics` on `window`.
            const isProduction = process.env.NODE_ENV === 'production';
            if (!isProduction || !window.analytics) {
                return;
            }

            const { pathname, href, search } = window.location;
            window.analytics.page({
                app: 'crawlee',
                path: pathname,
                url: href,
                search,
            });
        }, 0);
    },
};

// Only exported when a DOM is available; otherwise the module exports null.
export default ExecutionEnvironment.canUseDOM ? segmentClientModule : null;


================================================
FILE: website/src/theme/ColorModeToggle/index.js
================================================
import { translate } from '@docusaurus/Translate';
import useIsBrowser from '@docusaurus/useIsBrowser';
import clsx from 'clsx';
import React from 'react';

import IconDarkMode from './dark-mode-icon.svg';
import IconLightMode from './light-mode-icon.svg';
import styles from './styles.module.css';

function ColorModeToggle({
    className,
    value,
    onChange,
}) {
    const isBrowser = useIsBrowser();
    const title = translate(
        {
            message: 'Switch between dark and light mode (currently {mode})',
            id: 'theme.colorToggle.ariaLabel',
            description: 'The ARIA label for the navbar color mode toggle',
        },
        {
            mode:
                value === 'dark'
                    ? translate({
                        message: 'dark mode',
                        id: 'theme.colorToggle.ariaLabel.mode.dark',
                        description: 'The name for the dark color mode',
                    })
                    : translate({
                        message: 'light mode',
                        id: 'theme.colorToggle.ariaLabel.mode.light',
                        description: 'The name for the light color mode',
                    }),
        },
    );
    return (
        <div className={className}>
            <button
                className={clsx(
                    'clean-btn',
                    styles.toggleButton,
                    !isBrowser && styles.toggleButtonDisabled,
                )}
                type="button"
                onClick={() => onChange(value === 'dark' ? 'light' : 'dark')}
                disabled={!isBrowser}
                title={title}
                aria-label={title}>
                <IconLightMode
                    className={clsx(styles.toggleIcon, styles.lightToggleIcon)}

                />
                <IconDarkMode
                    className={clsx(styles.toggleIcon, styles.darkToggleIcon)}
                />
                <span />
            </button>
        </div>
    );
}

export default React.memo(ColorModeToggle);


================================================
FILE: website/src/theme/ColorModeToggle/styles.module.css
================================================
/* Pill-shaped track that holds both mode icons; relatively positioned so the
   sliding knob (.toggleButton span) can be placed absolutely inside it. */
.toggleButton {
    padding: 4px;
    display: flex;
    gap: 4px;
    align-items: center;
    transition: all var(--ifm-transition-fast);
    position: relative;
    border-radius: 150px;
    background-color: var(--color-background-subtle);
}

/* Sliding knob of the toggle: a 44x36px pill that sits under the active icon
   (the icons are lifted above it with z-index) and animates between positions. */
.toggleButton span {
    width: 44px;
    height: 36px;
    position: absolute;
    transition: all var(--ifm-transition-fast);
    /* light-mode position; the dark-theme rule moves the knob to the right */
    left: 0;
    margin: 4px;

    /* The earlier `border-radius: 50%` and `background: #fff` declarations were
       dead code: both were overridden further down in this same rule. */
    border-radius: 150px;
    background: var(--color-background);

    /* Light/L1 */
    box-shadow:
        0px 0.5px 1.5px 0px rgba(63, 71, 93, 0.15),
        0.4px 0.8px 1px -1.2px rgba(63, 71, 93, 0.14),
        1px 2px 2.5px -2.5px rgba(63, 71, 93, 0.13);
}

/* Mode icons: 20px glyphs drawn above the knob (z-index) and coloured through
   the stroke of their paths. */
.toggleButton svg {
    z-index: 1;
    margin: 8px 12px;
    width: 20px;
    height: 20px;
    path {
        stroke: var(--color-icon);
    }
}

/* In dark theme the knob slides to the right, under the second icon. */
[data-theme='dark'] .toggleButton span {
    left: 48px;
}

/* Applied by the component while the button is disabled. */
.toggleButtonDisabled {
    cursor: not-allowed;
}


================================================
FILE: website/src/theme/DocItem/Content/index.js
================================================
import { useDoc } from '@docusaurus/plugin-content-docs/client';
import LLMButtons from '@site/src/components/LLMButtons';
import Heading from '@theme/Heading';
import MDXContent from '@theme/MDXContent';
import clsx from 'clsx';
import React from 'react';

import styles from './styles.module.css';

/**
 * Returns the title to render as a synthetic `<h1>`, or `null` when none is needed.
 *
 * A synthetic title is used only when the front matter does not set
 * `hide_title` and the document has no `contentTitle` of its own.
 */
function useSyntheticTitle() {
    const { metadata, frontMatter, contentTitle } = useDoc();

    const titleHidden = Boolean(frontMatter.hide_title);
    const hasContentTitle = typeof contentTitle !== 'undefined';

    return titleHidden || hasContentTitle ? null : metadata.title;
}

export default function DocItemContent({ children }) {
    const syntheticTitle = useSyntheticTitle();

    return (
        <div className={clsx('markdown')}>
            {syntheticTitle && (
                <div className={styles.docItemContent}>
                    {syntheticTitle && <Heading as="h1">{syntheticTitle}</Heading>}
                    <LLMButtons />
                </div>
            )}
            <MDXContent>{children}</MDXContent>
        </div>
    );
}

================================================
FILE: website/src/theme/DocItem/Content/styles.module.css
================================================
/* Header row of a doc page: title on the left, action buttons on the right. */
.docItemContent {
    display: flex;
    flex-wrap: wrap;
    align-items: center;
    justify-content: space-between;
    gap: 1rem;
    /* bottom spacing derived from the theme's h1 rhythm variables */
    padding-bottom: calc(var(--ifm-h1-vertical-rhythm-bottom) * var(--ifm-leading));

    /* the row owns the spacing, so the heading's own margin is removed */
    h1 {
        margin: 0 !important;
        flex: 1 1 auto;
        min-width: 12rem;
    }

    /* MOBILE: stack the title and the buttons, aligned to the start */
    @media (max-width: 767px) {
        flex-direction: column;
        align-items: flex-start;
        gap: 0.75rem;
    }
}

================================================
FILE: website/src/theme/DocItem/Layout/index.js
================================================
import { useDoc } from '@docusaurus/plugin-content-docs/client';
import { useWindowSize, useColorMode } from '@docusaurus/theme-common';
import Giscus from '@giscus/react';
import DocBreadcrumbs from '@theme/DocBreadcrumbs';
import DocItemContent from '@theme/DocItem/Content';
import DocItemFooter from '@theme/DocItem/Footer';
import DocItemPaginator from '@theme/DocItem/Paginator';
import DocItemTOCDesktop from '@theme/DocItem/TOC/Desktop';
import DocItemTOCMobile from '@theme/DocItem/TOC/Mobile';
import DocVersionBadge from '@theme/DocVersionBadge';
import DocVersionBanner from '@theme/DocVersionBanner';
import clsx from 'clsx';
import React from 'react';

import styles from './styles.module.css';

/**
 * Decide if the toc should be rendered, on mobile or desktop viewports
 */
function useDocTOC() {
    const {
        frontMatter,
        toc,
    } = useDoc();
    const windowSize = useWindowSize();
    const hidden = frontMatter.hide_table_of_contents;
    const canRender = !hidden && toc.length > 0;
    const mobile = canRender ? <DocItemTOCMobile/> : undefined;
    const desktop = canRender && (windowSize === 'desktop' || windowSize === 'ssr') ? (
        <DocItemTOCDesktop/>
    ) : undefined;
    return {
        hidden,
        mobile,
        desktop,
    };
}

export default function DocItemLayout({ children }) {
    const docTOC = useDocTOC();
    const { colorMode } = useColorMode();
    return (
        <div className="row">
            <div className={clsx('col', !docTOC.hidden && styles.docItemCol)}>
                <DocVersionBanner/>
                <div className={styles.docItemContainer}>
                    <article>
                        <DocBreadcrumbs/>
                        <DocVersionBadge/>
                        {docTOC.mobile}
                        <DocItemContent>{children}</DocItemContent>
                        <DocItemFooter/>
                    </article>
                    <DocItemPaginator/>

                    <Giscus
                        id="giscus-comments"
                        repo="apify/crawlee-python"
                        repoId="R_kgDOLDBXgA"
                        category="Comments"
                        categoryId="DIC_kwDOLDBXgM4CgQI1"
                        mapping="pathname"
                        reactionsEnabled="1"
                        emitMetadata="0"
                        inputPosition="top"
                        theme={colorMode}
                        lang="en"
                        strict="1"
                    />
                </div>
            </div>

            {docTOC.desktop && <div className="col col--3">{docTOC.desktop}</div>}
        </div>
    );
}


================================================
FILE: website/src/theme/DocItem/Layout/styles.module.css
================================================
/* Wrapper around the article, paginator and comments of a doc page. */
.docItemContainer {
    margin-bottom: 50px;
}

/* Remove the top margin of whatever directly follows a header and of the
   article's first child. */
.docItemContainer header + *,
.docItemContainer article > *:first-child {
  margin-top: 0;
}

/* From 997px up the main column is capped at 75% of the row width. */
@media (min-width: 997px) {
  .docItemCol {
    max-width: 75% !important;
  }
}


================================================
FILE: website/src/theme/Footer/LinkItem/index.js
================================================
import isInternalUrl from '@docusaurus/isInternalUrl';
import Link from '@docusaurus/Link';
import useBaseUrl from '@docusaurus/useBaseUrl';
import clsx from 'clsx';
import React from 'react';

import styles from './index.module.css';

export default function FooterLinkItem({ item }) {
    const ExternalLinkIcon = require('../../../../static/img/external-link.svg').default;

    const { to, href, label, prependBaseUrlToHref, className, ...props } = item;
    const toUrl = useBaseUrl(to);
    const normalizedHref = useBaseUrl(href, { forcePrependBaseUrl: true });

    return (
        <Link
            className={clsx('footer__link-item', className, styles.footerLink)}
            {...(href
                ? {
                    href: prependBaseUrlToHref ? normalizedHref : href,
                }
                : {
                    to: toUrl,
                })}
            {...props}>
            {label}
            {href && !isInternalUrl(href) && <ExternalLinkIcon className={styles.externalLinkIcon} />}
        </Link>
    );
}


================================================
FILE: website/src/theme/Footer/LinkItem/index.module.css
================================================
/* Footer link text; on hover both the text and any svg path inside the link
   switch to the subtle text colour. */
.footerLink {
    color: var(--color-text);
    cursor: pointer;
    font-size: 14px;
    line-height: 20px;
    &:hover {
        color: var(--color-text-subtle);
        path {
            fill: var(--color-text-subtle);
        }
    }
}

/* Icon appended to links whose href is not an internal URL. */
.externalLinkIcon {
    margin-left: 5px;
    path {
        fill: var(--color-text);
    }
}


================================================
FILE: website/src/theme/Footer/index.js
================================================
import Link from '@docusaurus/Link';
import { useThemeConfig } from '@docusaurus/theme-common';
import useBaseUrl from '@docusaurus/useBaseUrl';
import LinkItem from '@theme/Footer/LinkItem';
import NavbarColorModeToggle from '@theme/Navbar/ColorModeToggle';
import ThemedImage from '@theme/ThemedImage';
import clsx from 'clsx';
import React from 'react';

import styles from './index.module.css';

function FooterLinksColumn({ column }) {
    return (
        <div>
            <div className={styles.footerTitle}>{column.title}</div>
            <ul className={clsx(styles.footerList, 'clean-list')}>
                {column.items.map((item, i) => (
                    <li key={i}>
                        <LinkItem item={item} />
                    </li>
                ))}
            </ul>
        </div>
    );
}

function Footer() {
    const { footer } = useThemeConfig();

    const { links, style } = footer;

    const HearthIcon = require('../../../static/img/hearth.svg').default;
    const logoSources = {
        light: useBaseUrl('/img/crawlee-light.svg'),
        dark: useBaseUrl('/img/crawlee-dark.svg'),
    };

    if (!footer) {
        return null;
    }

    return (
        <footer className={clsx(styles.footer, style)
        } >
            <div className={styles.footerTop}>
                <div className={styles.footerTopRow}>
                    <div className={styles.footerTopRowLeft}>
                        <Link href="https://crawlee.dev" width="120" className={styles.footerLogo} target="_self" rel="dofollow">
                            <ThemedImage
                                width="120"
                                alt="Docusaurus themed image"
                                sources={logoSources}
                            />
                        </Link>
                        <NavbarColorModeToggle />
                    </div>
                    <div className={styles.footerTopRowRight}>
                        <FooterLinksColumn column={links[0]} />
                        <FooterLinksColumn column={links[1]} />
                        <FooterLinksColumn column={links[2]} />
                    </div>
                </div>
            </div>

            <div className={styles.footerBottom}>
                <div className={styles.footerBottomRow}>
                    <div>
                        <HearthIcon className={styles.hearthIcon} />
                        Crawlee is forever free and open source
                    </div>
                    <div>© {new Date().getFullYear()} Apify</div>
                </div>
            </div>
        </footer >
    );
}

export default React.memo(Footer);


================================================
FILE: website/src/theme/Footer/index.module.css
================================================
/* Footer root: page background and default text colour. */
.footer {
    background: var(--color-background);
    color: var(--color-text);
}

/* Both footer bands start with a solid rule; horizontal padding is only added
   from tablet width up (on mobile the inner rows carry their own padding). */
.footerBottom,
.footerTop {
    border-top: 1px solid var(--color-separator);

    @media (min-width: 768px) {
        padding: 40px 40px;
    }
    @media (min-width: 1024px) {
        padding: 40px 64px;
    }
}

/* Top row: centred, width-capped; stacked on mobile, a row from 768px up. */
.footerTopRow {
    max-width: var(--max-layout-width);
    margin: auto;

    display: flex;
    flex-direction: column;
    @media (min-width: 768px) {
        flex-direction: row;
    }
}

/* Link columns: stacked with a 32px gap on mobile, spread across the row from
   768px up. Takes 3 parts of the row width versus 2 for the left side. */
.footerTopRowRight {
    flex-direction: column;
    display: flex;
    flex: 3;
    gap: 32px;
    padding: 16px 40px 40px;

    @media (min-width: 768px) {
        gap: 0;
        flex-direction: row;
        padding: 0;
        justify-content: space-between;
    }
}

/* Logo and colour-mode toggle; on mobile it is separated from the link
   columns by a bottom rule, which is removed from 768px up. */
.footerTopRowLeft {
    display: flex;
    flex-direction: column;
    justify-content: space-between;
    flex: 2;
    padding: 32px 40px 24px;
    gap: 32px;

    border-bottom: 1px solid var(--color-separator);

    img {
        display: block !important;
    }

    @media (min-width: 768px) {
        padding: 0;
        border: 0;
        gap: 0;
    }
}

/* Bottom row: centred, stacked lines on mobile; from 768px up the two items
   sit at opposite ends of a single row. */
.footerBottomRow {
    max-width: var(--max-layout-width);
    margin: auto;

    display: flex;
    flex-direction: column;
    align-items: center;
    gap: 24px;
    padding: 24px 40px;

    font-size: 14px;
    line-height: 20px;
    text-align: center;

    @media (min-width: 768px) {
        gap: 0;
        padding: 0;
        flex-direction: row;
        justify-content: space-between;
    }
}

/* Icon shown before the open-source note; filled with the muted text colour. */
.hearthIcon {
    margin-right: 8px;
    path {
        fill: var(--color-text-muted);
    }
}

/* Heading of a link column. */
.footerTitle {
    font-size: 16px;
    font-weight: 700;
    line-height: 24px;
}

/* Link list of a column; every entry has a fixed height and top spacing. */
.footerList {
    margin: 0;
    li {
        margin-top: 16px;
        height: 28px;
    }
}

/* The logo link shrinks to the size of the image. */
.footerLogo {
    width: fit-content;
}


================================================
FILE: website/src/theme/MDXComponents/A.js
================================================
/* eslint-disable react/prop-types */
import Link from '@docusaurus/Link';
import useDocusaurusContext from '@docusaurus/useDocusaurusContext';
import React from 'react';

export default function MDXA(props) {
    const { siteConfig } = useDocusaurusContext();
    if (props.href?.startsWith(siteConfig.url)) {
        const { href, ...rest } = props;
        rest.to = props.href.replace(siteConfig.url + siteConfig.baseUrl, '/');
        props = rest;
    }

    return <Link {...props} />;
}


================================================
FILE: website/src/theme/Navbar/Content/index.js
================================================
import Link from '@docusaurus/Link';
import { useLocation } from '@docusaurus/router';
import { useThemeConfig } from '@docusaurus/theme-common';
import {
    splitNavbarItems,
    useNavbarMobileSidebar,
} from '@docusaurus/theme-common/internal';
import NavbarLogo from '@theme/Navbar/Logo';
import NavbarMobileSidebarToggle from '@theme/Navbar/MobileSidebar/Toggle';
import NavbarSearch from '@theme/Navbar/Search';
import NavbarItem from '@theme/NavbarItem';
import SearchBar from '@theme/SearchBar';
import clsx from 'clsx';
import React from 'react';

import styles from './styles.module.css';

/** Returns the navbar items declared in the theme config. */
function useNavbarItems() {
    const { navbar } = useThemeConfig();
    return navbar.items;
}

function NavbarItems({ items, className }) {
    return (
        <div className={clsx(styles.navbarItems, className)}>
            {items.map((item, i) => (
                <NavbarItem {...item} key={i} />
            ))}
        </div>
    );
}

function NavbarContentLayout({ left, right }) {
    return (
        <div className="navbar__inner">
            <div className="navbar__items">{left}</div>
            <div className="navbar__items navbar__items--right">{right}</div>
        </div>
    );
}

// Extra navbar item config (a `docsVersionDropdown` placed on the left) that
// NavbarContent appends to the configured items on every page except the
// `/python` landing page.
// NOTE(review): the two `dropdownItemsAfter` entries link to sdk.apify.com
// docs labelled 2.2 and 1.3 - presumably carried over from another site;
// confirm they are intended for this project.
const VERSIONS_ITEM = {
    type: 'docsVersionDropdown',
    position: 'left',
    label: 'Versions',
    dropdownItemsAfter: [
        {
            href: 'https://sdk.apify.com/docs/guides/getting-started',
            label: '2.2',
        },
        {
            href: 'https://sdk.apify.com/docs/1.3.1/guides/getting-started',
            label: '1.3',
        },
    ],
    dropdownItemsBefore: [],
};

export default function NavbarContent() {
    const location = useLocation();
    const mobileSidebar = useNavbarMobileSidebar();
    const items = useNavbarItems();
    const effectiveItems = location.pathname?.endsWith('/python/')
        || location.pathname?.endsWith('/python')
        ? items
        : [...items, VERSIONS_ITEM];
    const [leftItems, rightItems] = splitNavbarItems(effectiveItems);
    const searchBarItem = items.find((item) => item.type === 'search');
    return (
        <NavbarContentLayout
            left={
                <>
                    <NavbarLogo />
                    <NavbarItems items={leftItems} />
                </>
            }
            right={
                <>
                    {rightItems?.length > 0 && (
                        <NavbarItems items={rightItems} />
                    )}
                    {!searchBarItem && (
                        <NavbarSearch>
                            <SearchBar />
                        </NavbarSearch>
                    )}
                    <Link
                        className={styles.getStartedButton}
                        to="/docs/quick-start"
                    >
                        Get started
                    </Link>
                    {!mobileSidebar.disabled && <NavbarMobileSidebarToggle />}
                </>
            }
        />
    );
}


================================================
FILE: website/src/theme/Navbar/Content/styles.module.css
================================================
/* Flex wrapper for a group of navbar items: vertically centered, 16px apart,
   with automatic inline margins. */
.navbarItems {
    display: flex;
    align-items: center;
    margin-inline: auto;
    gap: 16px;
}

/* Adds a 40px left margin.
   NOTE(review): no usage of this class is visible in the accompanying
   component - confirm it is still needed. */
.navbarItems__leftMargin {
    margin-left: 40px;
}

/* "Get started" call-to-action button; the background transitions to the
   primary hover color while the text color stays the same on hover. */
.getStartedButton {
    color: var(--color-text-on-primary);
    background: var(--color-black-action);
    border-radius: 8px;
    font-size: 16px;
    font-weight: 500;
    line-height: 24px;
    padding: 8px 16px !important;
    border: none;
    transition: background-color 0.2s;

    &:hover {
        color: var(--color-text-on-primary);
        background-color: var(--color-primary-action-hover);
    }
}


================================================
FILE: website/src/theme/Navbar/Logo/index.js
================================================
import Link from '@docusaurus/Link';
import { useThemeConfig } from '@docusaurus/theme-common';
import useBaseUrl from '@docusaurus/useBaseUrl';
import Logo from '@theme/Logo';
import ThemedImage from '@theme/ThemedImage';
import React from 'react';

import styles from './index.module.css';

export default function LogoWrapper(props) {
    const ArrowsIcon = require('../../../../static/img/menu-arrows.svg').default;
    const CheckIcon = require('../../../../static/img/check.svg').default;
    const { navbar: { logo } } = useThemeConfig();
    const javascriptLogo = {
        light: useBaseUrl('img/crawlee-javascript-light.svg'),
        dark: useBaseUrl('img/crawlee-javascript-dark.svg'),
    };
    const languageAgnosticLogo = {
        light: useBaseUrl('img/crawlee-light.svg'),
        dark: useBaseUrl('img/crawlee-dark.svg'),
    };
    const pythonLogo = {
        light: useBaseUrl(logo.src),
        dark: useBaseUrl(logo.srcDark || logo.src),
    };
    return (
        <div className={styles.navbarLogo}>
            <div className={styles.logoWithArrows}>
                <Logo titleClassName="navbar__title" />
                <ArrowsIcon />
            </div>
            <div className={styles.menuWrapper}>
                <div className={styles.menu}>
                    <Link className={styles.menuItem} href="https://crawlee.dev/js" target="_self" rel="dofollow">
                        <ThemedImage sources={javascriptLogo} alt="Crawlee JavaScript" />
                    </Link>
                    <Link className={styles.menuItem} to="/" >
                        <ThemedImage sources={pythonLogo} alt="Crawlee Python" />
                        <CheckIcon />
                    </Link>
                    <Link className={styles.menuItem} href="https://crawlee.dev" target="_self" rel="dofollow">
                        <ThemedImage sources={languageAgnosticLogo} alt="Crawlee" />
                    </Link>
                </div>
            </div>
        </div >
    );
}


================================================
FILE: website/src/theme/Navbar/Logo/index.module.css
================================================
/* Logo container; positioned relatively so the absolutely positioned menu
   wrapper is placed against it. */
.navbarLogo {
    position: relative;
    cursor: pointer;

    /* do not display the other theme logo when loading */
    a {
        img:nth-child(2) {
            display: none !important;
        }
    }
}

/* Fixed-width row holding the logo image and the arrows icon. */
.logoWithArrows {
    display: flex;
    align-items: center;
    width: 220px;

    svg {
        margin: 0 2px;
        g {
            stroke: var(--color-icon);
        }
    }

    img {
        display: block !important;
        height: 28px;
    }
}

/* Anchors the menu directly below the logo; the top padding adds a 6px gap
   inside the wrapper's own box. */
.menuWrapper {
    position: absolute;
    left: 0;
    top: 100%;

    z-index: 100;
    padding-top: 6px;
}

/* Language switcher menu card. Hidden by default (`display: none`) and switched
   to `display: flex` by the `.navbarLogo:hover` rule; the flex-direction and
   align-items declarations only take effect while it is shown. */
.menu {
    width: 230px;
    border-radius: 8px;
    border: 1px solid var(--color-border);
    box-shadow:
        0px 4px 8px 0px rgba(36, 39, 54, 0.12),
        0px 2px 4px 0px rgba(36, 39, 54, 0.08),
        0px 0px 1px 0px rgba(36, 39, 54, 0.24);

    background: var(--color-card-background);
    /* A former `padding: 8px 0` here was dead: the later `padding: 8px` in this
       same rule always overrode it, so only the effective value is kept. */
    padding: 8px;
    overflow: hidden;
    transition: all 0.3s;

    flex-direction: column;
    align-items: flex-start;

    display: none;

    /* Menu logos: fixed height, natural width, always displayed. */
    img {
        height: 24px;
        width: auto;
        display: block !important;
    }
}

/* Reveal the menu while the logo container is hovered. */
.navbarLogo:hover {
    .menu {
        display: flex;
    }
}

/* One full-width row of the menu: content is spread to both ends and the row
   gets a background highlight on hover. */
.menuItem {
    padding: 8px;
    width: 100%;
    border-radius: 12px;
    display: flex;
    justify-content: space-between;
    align-items: center;
    path {
        fill: var(--color-icon);
    }
    &:hover {
        background: var(--color-hover);
    }
}


================================================
FILE: website/src/theme/Navbar/MobileSidebar/Header/index.js
================================================
import Link from '@docusaurus/Link';
import { useLocation } from '@docusaurus/router';
import { useNavbarMobileSidebar } from '@docusaurus/theme-common/internal';
import { translate } from '@docusaurus/Translate';
import IconClose from '@theme/Icon/Close';
import NavbarLogo from '@theme/Navbar/Logo';
import SearchBar from '@theme/SearchBar';
import clsx from 'clsx';
import React from 'react';

import styles from './index.module.css';

function CloseButton() {
    const mobileSidebar = useNavbarMobileSidebar();
    return (
        <button
            type="button"
            aria-label={translate({
                id: 'theme.docs.sidebar.closeSidebarButtonAriaLabel',
                message: 'Close navigation bar',
                description: 'The ARIA label for close button of mobile sidebar',
            })}
            className="clean-btn navbar-sidebar__close"
            onClick={() => mobileSidebar.toggle()}>
            <IconClose color="var(--ifm-color-emphasis-600)" />
        </button>
    );
}
export default function NavbarMobileSidebarHeader() {
    const { toggle, shown } = useNavbarMobileSidebar();
    const closeSidebar = () => shown && toggle();

    return (
        <div className="navbar-sidebar__brand">
            <div className={styles.navbarHeader}>
                <NavbarLogo />
                <div className={clsx(styles.navbarButtonsWrapper, styles.navbarButtonsWrapperDesktop)} >
                    <div onClick={closeSidebar} >
                        <SearchBar />
                    </div>
                    <Link className={styles.getStartedButton} to="/docs/quick-start" onClick={closeSidebar} >
                        Get started
                    </Link>
                </div>
                <CloseButton />
            </div>
            <div className={clsx(styles.navbarButtonsWrapper, styles.navbarButtonsWrapperMobile)} >
                <Link className={styles.getStartedButton} to="/docs/quick-start" onClick={closeSidebar}>
                    Get started
                </Link>
                <div onClick={closeSidebar} >
                    <SearchBar />
                </div>
            </div>
        </div>
    );
}


================================================
FILE: website/src/theme/Navbar/MobileSidebar/Header/index.module.css
================================================
/* "Get started" button inside the mobile sidebar header; text is centered and
   keeps its color on hover. */
.getStartedButton {
    color: var(--color-text-on-primary);
    background: var(--color-black-action);
    border-radius: 8px;
    font-size: 16px;
    font-weight: 500;
    line-height: 24px;
    padding: 8px 16px !important;
    border: none;
    &:hover {
        color: var(--color-text-on-primary);
    }
    text-align: center;
}

/* Header row (logo, buttons, close button); padding grows at 768px and 1024px. */
.navbarHeader {
    display: flex;
    width: 100%;
    align-items: center;
    justify-content: space-between;
    padding: 16px;

    @media (min-width: 768px) {
        padding: 20px 40px;
    }
    @media (min-width: 1024px) {
        padding: 20px 64px;
    }
}

/* Shared base for both button groups: a flex row pushed to the right. */
.navbarButtonsWrapper {
    display: flex;
    gap: 16px;
    margin-left: auto;
}

/* Button group variant that is hidden below 768px. */
.navbarButtonsWrapperDesktop {
    display: flex;
    @media (max-width: 767px) {
        display: none;
    }
}
/* Button group variant that is shown only below 768px: a full-width padded
   column with a top separator and full-width buttons. */
.navbarButtonsWrapperMobile {
    border-top: 1px solid var(--color-separator);
    display: none;
    @media (max-width: 767px) {
        display: flex;
    }
    width: 100%;
    margin: 0;
    flex-direction: column;
    gap: 16px;
    button {
        width: 100%;
    }
    padding: 16px 24px;
}


================================================
FILE: website/src/theme/Navbar/MobileSidebar/Layout/index.js
================================================
import { useNavbarSecondaryMenu } from '@docusaurus/theme-common/internal';
import clsx from 'clsx';
import React from 'react';

export default function NavbarMobileSidebarLayout({
    header,
    primaryMenu,
    secondaryMenu,
}) {
    const { shown: secondaryMenuShown } = useNavbarSecondaryMenu();
    return (
        <div className="navbar-sidebar">
            {header}
            <div
                className={clsx('navbar-sidebar__items', {
                    'navbar-sidebar__items--show-secondary': secondaryMenuShown,
                })}>
                <div className="navbar-sidebar__item menu menu-primary">{primaryMenu}</div>
                <div className="navbar-sidebar__item menu menu-secondary">{secondaryMenu}</div>
            </div>
        </div>
    );
}


================================================
FILE: website/src/theme/Navbar/MobileSidebar/PrimaryMenu/index.js
================================================
import { useThemeConfig } from '@docusaurus/theme-common';
import { useNavbarMobileSidebar } from '@docusaurus/theme-common/internal';
import NavbarItem from '@theme/NavbarItem';
import React from 'react';

/** Returns the navbar items declared in the theme config. */
function useNavbarItems() {
    const { navbar } = useThemeConfig();
    return navbar.items;
}
// The primary menu displays the navbar items
export default function NavbarMobilePrimaryMenu() {
    const mobileSidebar = useNavbarMobileSidebar();
    const items = useNavbarItems();

    return (
        <ul className="menu__list">
            {items.map((item, i) => (
                <NavbarItem
                    mobile
                    {...item}
                    onClick={() => mobileSidebar.toggle()}
                    key={i}
                />
            ))}
        </ul>
    );
}


================================================
FILE: website/src/theme/Navbar/MobileSidebar/index.js
================================================
import {
    useLockBodyScroll,
    useNavbarMobileSidebar,
    useWindowSize,
} from '@docusaurus/theme-common/internal';
import NavbarMobileSidebarHeader from '@theme/Navbar/MobileSidebar/Header';
import NavbarMobileSidebarLayout from '@theme/Navbar/MobileSidebar/Layout';
import NavbarMobileSidebarPrimaryMenu from '@theme/Navbar/MobileSidebar/PrimaryMenu';
import NavbarMobileSidebarSecondaryMenu from '@theme/Navbar/MobileSidebar/SecondaryMenu';
import React from 'react';

export default function NavbarMobileSidebar() {
    const mobileSidebar = useNavbarMobileSidebar();
    const windowSize = useWindowSize({
        desktopBreakpoint: 1200,
    });

    useLockBodyScroll(mobileSidebar.shown);
    const shouldRender = !mobileSidebar.disabled && windowSize === 'mobile';
    if (!shouldRender) {
        return null;
    }
    return (
        <NavbarMobileSidebarLayout
            header={<NavbarMobileSidebarHeader />}
            primaryMenu={<NavbarMobileSidebarPrimaryMenu />}
            secondaryMenu={<NavbarMobileSidebarSecondaryMenu />}
        />
    );
}


================================================
FILE: website/src/theme/NavbarItem/ComponentTypes.js
================================================
import { useActiveDocContext, useLayoutDoc } from '@docusaurus/plugin-content-docs/client';
import DefaultNavbarItem from '@theme/NavbarItem/DefaultNavbarItem';
import DocSidebarNavbarItem from '@theme/NavbarItem/DocSidebarNavbarItem';
import DocsVersionDropdownNavbarItem from '@theme/NavbarItem/DocsVersionDropdownNavbarItem';
import DocsVersionNavbarItem from '@theme/NavbarItem/DocsVersionNavbarItem';
import DropdownNavbarItem from '@theme/NavbarItem/DropdownNavbarItem';
import HtmlNavbarItem from '@theme/NavbarItem/HtmlNavbarItem';
import LocaleDropdownNavbarItem from '@theme/NavbarItem/LocaleDropdownNavbarItem';
import SearchNavbarItem from '@theme/NavbarItem/SearchNavbarItem';
import React from 'react';

// const versions = require('../../../versions.json');
// const stable = versions[0];

function DocNavbarItem({
    docId,
    label: staticLabel,
    docsPluginId,
    ...props
}) {
    const { activeDoc } = useActiveDocContext(docsPluginId);
    const doc = useLayoutDoc(docId, docsPluginId);
    // Draft items are not displayed in the navbar.
    if (doc === null) {
        return null;
    }
    return (
        <DefaultNavbarItem
            exact
            {...props}
            isActive={() => activeDoc?.path.startsWith(doc.path)}
            label={staticLabel ?? doc.id}
            to={doc.path}
        />
    );
}

function ApiNavbarItem(ctx) {
    return (
        <DefaultNavbarItem
            exact
            {...ctx}
            label={ctx.label}
            to={`api/${ctx.to}`}
        />
    );

    // let version = {};
    //
    // try {
    //     // eslint-disable-next-line react-hooks/rules-of-hooks
    //     version = useDocsVersion();
    // } catch {
    //     version.version = stable;
    // }
    //
    // const { siteConfig } = useDocusaurusContext();
    //
    // if (siteConfig.presets[0][1].docs.disableVersioning || version.version === stable) {
    //     return (
    //         <DefaultNavbarItem
    //             exact
    //             {...ctx}
    //             label={ctx.label}
    //             to={`api/${ctx.to}`}
    //         />
    //     );
    // }
    //
    // return (
    //     <DefaultNavbarItem
    //         exact
    //         {...ctx}
    //         label={ctx.label}
    //         to={`api/${version.version === 'current' ? 'next' : version.version}/${ctx.to}`}
    //     />
    // );
}

// Maps a navbar item `type` to the component that renders it.
// `custom-api` and `doc` use the components defined in this file.
const ComponentTypes = {
    default: DefaultNavbarItem,
    localeDropdown: LocaleDropdownNavbarItem,
    search: SearchNavbarItem,
    dropdown: DropdownNavbarItem,
    html: HtmlNavbarItem,
    'custom-api': ApiNavbarItem,
    doc: DocNavbarItem,
    docSidebar: DocSidebarNavbarItem,
    docsVersion: DocsVersionNavbarItem,
    docsVersionDropdown: DocsVersionDropdownNavbarItem,
};

export default ComponentTypes;


================================================
FILE: website/static/.nojekyll
================================================


================================================
FILE: website/static/js/custom.js
================================================
/**
 * Binds click handlers to the links of the navbar version dropdown so that,
 * while the visitor is on an `/api` page, choosing a version navigates to the
 * matching API reference path instead of the docs path.
 *
 * The target is derived per link:
 *  - the first two links are mapped by position to `/docs/next` and `/docs`;
 *  - any further link is mapped from the `/docs/<version>` part of its href;
 *  - links with neither a positional slot nor a version in the href are skipped.
 * In every case `/docs` is replaced with `/api` to form the target.
 *
 * Links that were already bound carry the `api-version-bound` class and are
 * not bound again, so the function is safe to call repeatedly.
 */
function load() {
    const versions = document.querySelectorAll('.navbar .dropdown ul a');
    const basePath = '';
    const types = [`${basePath}/docs/next`, `${basePath}/docs`];
    let i = 0;

    for (const el of versions) {
        // Consume exactly one positional slot per link. Reading `types[i++]` twice
        // (once in the guard, once for the value) advanced the index by two, which
        // bound the first link to `/api` and left the second one unbound.
        const positional = types[i++];
        const match = el.href.match(/\/docs\/(\d+\.\d+(\.\d+)?)$/) || el.href.match(/\/docs\/(\d+\.\d+(\.\d+)?)/);

        if (!positional && !match) {
            continue;
        }

        const version = (positional || match[0]).replace('/docs', '/api');

        if (el.classList.contains('api-version-bound')) {
            continue;
        }

        el.addEventListener('click', (e) => {
            // Only hijack the click while browsing the API reference.
            if (version && window.location.pathname.startsWith(`${basePath}/api`)) {
                window.location.href = version;
                e.preventDefault();
            }
        });
        el.classList.add('api-version-bound');
    }
}

// Every 500ms, (re)bind the version dropdown links once they exist in the DOM.
setInterval(() => {
    const dropdownLinks = document.querySelectorAll('.navbar .dropdown ul a');
    if (dropdownLinks.length > 0) {
        load();
    }
}, 500);

// Redirect visitors arriving on a legacy address to the same page under
// https://crawlee.dev/python.
for (const legacyPrefix of [
    'https://apify.github.io/crawlee-python',
    'https://crawlee.dev/crawlee-python',
]) {
    if (window.location.href.startsWith(legacyPrefix)) {
        window.location.href = window.location.href.replace(legacyPrefix, 'https://crawlee.dev/python');
    }
}


================================================
FILE: website/static/robots.txt
================================================
User-agent: *
Sitemap: https://crawlee.dev/python/sitemap.xml


================================================
FILE: website/tools/docs-prettier.config.js
================================================
/**
 * Prettier options for formatting the docs; files are parsed with the
 * `markdown` parser.
 *
 * @type {import('prettier').Options}
 */
module.exports = {
    parser: 'markdown',
    arrowParens: 'avoid',
    trailingComma: 'all',
    singleQuote: true,
    tabWidth: 4,
    printWidth: 150,
    // NOTE(review): with `always`, prose is presumably re-wrapped at `printWidth` - confirm against the Prettier docs.
    proseWrap: 'always',
};


================================================
FILE: website/tools/utils/externalLink.js
================================================
const { parse } = require('url');

const visit = import('unist-util-visit').then((m) => m.visit);

// Hosts whose links are treated as internal.
const internalUrls = ['crawlee.dev'];

/**
 * Tells whether a parsed link points inside the site: either its host is one
 * of `internalUrls`, or it is a relative reference (no protocol, no host, and
 * a pathname or a hash).
 *
 * @param {import('url').UrlWithStringQuery} href
 * @returns {boolean}
 */
function isInternal(href) {
    const isRelative = !href.protocol && !href.host && Boolean(href.pathname || href.hash);

    return internalUrls.some((candidate) => isRelative || href.host === candidate);
}

/**
 * @type {import('unified').Plugin}
 */
exports.externalLinkProcessor = () => {
    return async (tree) => {
        (await visit)(tree, 'element', (node) => {
            if (
                node.tagName === 'a'
                && node.properties
                && typeof node.properties.href === 'string'
            ) {
                const href = parse(node.properties.href);

                if (!isInternal(href)) {
                    node.properties.target = '_blank';
                    node.properties.rel = 'noopener';
                } else {
                    node.properties.target = null;
                    node.properties.rel = null;
                }
            }
        });
    };
};


================================================
FILE: website/tools/website_gif/website_gif.mjs
================================================
/**
 * How to generate the gifs:
 *
 * 1. Set a breakpoint on the marked line
 * 2. Run the crawler with the debugger
 * 3. Setup your chrome and recording
 * 4. Resume, record, ???, profit!
 */

import { PuppeteerCrawler, sleep } from 'crawlee';

// Headed, single-concurrency crawler with long timeouts so a manual recording
// session can be prepared. It only waits on each page: 5s on requests labelled
// `start`, 250ms on all others.
const crawler = new PuppeteerCrawler({
    headless: false,
    maxConcurrency: 1,
    navigationTimeoutSecs: 100000,
    requestHandlerTimeoutSecs: 10000,
    browserPoolOptions: {
        closeInactiveBrowserAfterSecs: 100000,
        operationTimeoutSecs: 100000,
    },
    async requestHandler({ request }) {
        const isStartPage = request.userData.label === 'start';

        if (!isStartPage) {
            await sleep(250);
            return;
        }

        console.log('Waiting 5s, prepare recording!');
        await sleep(5000); // <--- Set breakpoint here
    },
});

// The same six pages are visited twice, once per theme recording. The two
// passes differ only in `uniqueKey`, which keeps the second visit of each URL
// from being treated as a duplicate of the first.
const darkThemeRequests = [
    { url: 'https://crawlee.dev', userData: { label: 'start' }, uniqueKey: 'dark-start' },
    { url: 'https://crawlee.dev/docs/quick-start', uniqueKey: 'dark-1' },
    { url: 'https://crawlee.dev/docs/introduction/setting-up', uniqueKey: 'dark-2' },
    { url: 'https://crawlee.dev/docs/introduction/first-crawler', uniqueKey: 'dark-3' },
    { url: 'https://crawlee.dev/docs/introduction/adding-urls', uniqueKey: 'dark-4' },
    { url: 'https://crawlee.dev/docs/introduction/real-world-project', uniqueKey: 'dark-5' },
];

// Light theme
const lightThemeRequests = [
    { url: 'https://crawlee.dev', userData: { label: 'start' }, uniqueKey: 'light th-start' },
    { url: 'https://crawlee.dev/docs/quick-start', uniqueKey: 'light th-1' },
    { url: 'https://crawlee.dev/docs/introduction/setting-up', uniqueKey: 'light th-2' },
    { url: 'https://crawlee.dev/docs/introduction/first-crawler', uniqueKey: 'light th-3' },
    { url: 'https://crawlee.dev/docs/introduction/adding-urls', uniqueKey: 'light th-4' },
    { url: 'https://crawlee.dev/docs/introduction/real-world-project', uniqueKey: 'light th-5' },
];

await crawler.run([...darkThemeRequests, ...lightThemeRequests]);


================================================
FILE: website/tsconfig.eslint.json
================================================
{
	"extends": "@apify/tsconfig",
	"compilerOptions": {
		"jsx": "preserve"
	},
	"include": [
		"src/**/*.js",
		"src/**/*.ts",
		"src/**/*.jsx",
		"src/**/*.tsx"
	]
}