Repository: apify/crawlee-python Branch: master Commit: 9becf12908f8 Files: 635 Total size: 2.6 MB Directory structure: gitextract_o1cy5s8w/ ├── .editorconfig ├── .github/ │ ├── CODEOWNERS │ ├── pull_request_template.md │ └── workflows/ │ ├── _check_code.yaml │ ├── _check_docs.yaml │ ├── _release_docs.yaml │ ├── _tests.yaml │ ├── manual_release_stable.yaml │ ├── on_issue.yaml │ ├── on_master.yaml │ ├── on_pull_request.yaml │ └── on_schedule_tests.yaml ├── .gitignore ├── .markdownlint.yaml ├── .pre-commit-config.yaml ├── .rules.md ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── codecov.yaml ├── docs/ │ ├── deployment/ │ │ ├── apify_platform.mdx │ │ ├── aws_lambda.mdx │ │ ├── code_examples/ │ │ │ ├── apify/ │ │ │ │ ├── crawler_as_actor_example.py │ │ │ │ ├── get_public_url.py │ │ │ │ ├── log_with_config_example.py │ │ │ │ ├── proxy_advanced_example.py │ │ │ │ └── proxy_example.py │ │ │ ├── aws/ │ │ │ │ ├── beautifulsoup_crawler_lambda.py │ │ │ │ ├── playwright_crawler_lambda.py │ │ │ │ └── playwright_dockerfile │ │ │ └── google/ │ │ │ ├── cloud_run_example.py │ │ │ └── google_example.py │ │ ├── google_cloud.mdx │ │ └── google_cloud_run.mdx │ ├── examples/ │ │ ├── add_data_to_dataset.mdx │ │ ├── beautifulsoup_crawler.mdx │ │ ├── capture_screenshot_using_playwright.mdx │ │ ├── capturing_page_snapshots_with_error_snapshotter.mdx │ │ ├── code_examples/ │ │ │ ├── adaptive_playwright_crawler.py │ │ │ ├── add_data_to_dataset_bs.py │ │ │ ├── add_data_to_dataset_dataset.py │ │ │ ├── add_data_to_dataset_pw.py │ │ │ ├── beautifulsoup_crawler.py │ │ │ ├── beautifulsoup_crawler_keep_alive.py │ │ │ ├── beautifulsoup_crawler_stop.py │ │ │ ├── capture_screenshot_using_playwright.py │ │ │ ├── configure_json_logging.py │ │ │ ├── crawl_all_links_on_website_bs.py │ │ │ ├── crawl_all_links_on_website_pw.py │ │ │ ├── crawl_multiple_urls_bs.py │ │ │ ├── crawl_multiple_urls_pw.py │ │ │ ├── crawl_specific_links_on_website_bs.py │ │ │ ├── 
crawl_specific_links_on_website_pw.py │ │ │ ├── crawl_website_with_relative_links_all_links.py │ │ │ ├── crawl_website_with_relative_links_same_domain.py │ │ │ ├── crawl_website_with_relative_links_same_hostname.py │ │ │ ├── crawl_website_with_relative_links_same_origin.py │ │ │ ├── export_entire_dataset_to_file_csv.py │ │ │ ├── export_entire_dataset_to_file_json.py │ │ │ ├── extract_and_add_specific_links_on_website_bs.py │ │ │ ├── extract_and_add_specific_links_on_website_pw.py │ │ │ ├── fill_and_submit_web_form_crawler.py │ │ │ ├── fill_and_submit_web_form_request.py │ │ │ ├── parsel_crawler.py │ │ │ ├── parsel_crawler_with_error_snapshotter.py │ │ │ ├── playwright_block_requests.py │ │ │ ├── playwright_crawler.py │ │ │ ├── playwright_crawler_with_camoufox.py │ │ │ ├── playwright_crawler_with_error_snapshotter.py │ │ │ ├── playwright_crawler_with_fingerprint_generator.py │ │ │ ├── respect_robots_on_skipped_request.py │ │ │ ├── respect_robots_txt_file.py │ │ │ ├── resuming_paused_crawl.py │ │ │ ├── run_parallel_crawlers.py │ │ │ ├── using_browser_profiles_chrome.py │ │ │ ├── using_browser_profiles_firefox.py │ │ │ └── using_sitemap_request_loader.py │ │ ├── crawl_all_links_on_website.mdx │ │ ├── crawl_multiple_urls.mdx │ │ ├── crawl_specific_links_on_website.mdx │ │ ├── crawl_website_with_relative_links.mdx │ │ ├── crawler_keep_alive.mdx │ │ ├── crawler_stop.mdx │ │ ├── export_entire_dataset_to_file.mdx │ │ ├── fill_and_submit_web_form.mdx │ │ ├── json_logging.mdx │ │ ├── parsel_crawler.mdx │ │ ├── playwright_crawler.mdx │ │ ├── playwright_crawler_adaptive.mdx │ │ ├── playwright_crawler_with_block_requests.mdx │ │ ├── playwright_crawler_with_camoufox.mdx │ │ ├── playwright_crawler_with_fingerprint_generator.mdx │ │ ├── respect_robots_txt_file.mdx │ │ ├── resuming_paused_crawl.mdx │ │ ├── run_parallel_crawlers.mdx │ │ ├── using_browser_profile.mdx │ │ └── using_sitemap_request_loader.mdx │ ├── guides/ │ │ ├── architecture_overview.mdx │ │ ├── avoid_blocking.mdx │ 
│ ├── code_examples/ │ │ │ ├── avoid_blocking/ │ │ │ │ ├── default_fingerprint_generator_with_args.py │ │ │ │ └── playwright_with_fingerprint_generator.py │ │ │ ├── creating_web_archive/ │ │ │ │ ├── manual_archiving_parsel_crawler.py │ │ │ │ ├── manual_archiving_playwright_crawler.py │ │ │ │ └── simple_pw_through_proxy_pywb_server.py │ │ │ ├── error_handling/ │ │ │ │ ├── change_handle_error_status.py │ │ │ │ ├── disable_retry.py │ │ │ │ └── handle_proxy_error.py │ │ │ ├── http_clients/ │ │ │ │ ├── parsel_curl_impersonate_example.py │ │ │ │ ├── parsel_httpx_example.py │ │ │ │ └── parsel_impit_example.py │ │ │ ├── http_crawlers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── beautifulsoup_example.py │ │ │ │ ├── custom_crawler_example.py │ │ │ │ ├── http_example.py │ │ │ │ ├── lexbor_parser.py │ │ │ │ ├── lxml_parser.py │ │ │ │ ├── lxml_saxonche_parser.py │ │ │ │ ├── parsel_example.py │ │ │ │ ├── pyquery_parser.py │ │ │ │ ├── scrapling_parser.py │ │ │ │ ├── selectolax_adaptive_run.py │ │ │ │ ├── selectolax_context.py │ │ │ │ ├── selectolax_crawler.py │ │ │ │ ├── selectolax_crawler_run.py │ │ │ │ └── selectolax_parser.py │ │ │ ├── login_crawler/ │ │ │ │ ├── http_login.py │ │ │ │ └── playwright_login.py │ │ │ ├── playwright_crawler/ │ │ │ │ ├── browser_configuration_example.py │ │ │ │ ├── browser_pool_page_hooks_example.py │ │ │ │ ├── multiple_launch_example.py │ │ │ │ ├── navigation_hooks_example.py │ │ │ │ └── plugin_browser_configuration_example.py │ │ │ ├── playwright_crawler_adaptive/ │ │ │ │ ├── handler.py │ │ │ │ ├── init_beautifulsoup.py │ │ │ │ ├── init_parsel.py │ │ │ │ ├── init_prediction.py │ │ │ │ └── pre_nav_hooks.py │ │ │ ├── playwright_crawler_stagehand/ │ │ │ │ ├── __init__.py │ │ │ │ ├── browser_classes.py │ │ │ │ ├── stagehand_run.py │ │ │ │ └── support_classes.py │ │ │ ├── proxy_management/ │ │ │ │ ├── inspecting_bs_example.py │ │ │ │ ├── inspecting_pw_example.py │ │ │ │ ├── integration_bs_example.py │ │ │ │ ├── integration_pw_example.py │ │ │ │ ├── 
quick_start_example.py │ │ │ │ ├── session_bs_example.py │ │ │ │ ├── session_pw_example.py │ │ │ │ ├── tiers_bs_example.py │ │ │ │ └── tiers_pw_example.py │ │ │ ├── request_loaders/ │ │ │ │ ├── rl_basic_example.py │ │ │ │ ├── rl_basic_example_with_persist.py │ │ │ │ ├── rl_tandem_example.py │ │ │ │ ├── rl_tandem_example_explicit.py │ │ │ │ ├── sitemap_basic_example.py │ │ │ │ ├── sitemap_example_with_persist.py │ │ │ │ ├── sitemap_tandem_example.py │ │ │ │ └── sitemap_tandem_example_explicit.py │ │ │ ├── request_router/ │ │ │ │ ├── adaptive_crawler_handlers.py │ │ │ │ ├── basic_request_handlers.py │ │ │ │ ├── custom_router_default_only.py │ │ │ │ ├── error_handler.py │ │ │ │ ├── failed_request_handler.py │ │ │ │ ├── http_pre_navigation.py │ │ │ │ ├── playwright_pre_navigation.py │ │ │ │ └── simple_default_handler.py │ │ │ ├── running_in_web_server/ │ │ │ │ ├── __init__.py │ │ │ │ ├── crawler.py │ │ │ │ └── server.py │ │ │ ├── scaling_crawlers/ │ │ │ │ ├── max_tasks_per_minute_example.py │ │ │ │ └── min_and_max_concurrency_example.py │ │ │ ├── service_locator/ │ │ │ │ ├── service_conflicts.py │ │ │ │ ├── service_crawler_configuration.py │ │ │ │ ├── service_crawler_event_manager.py │ │ │ │ ├── service_crawler_storage_client.py │ │ │ │ ├── service_locator_configuration.py │ │ │ │ ├── service_locator_event_manager.py │ │ │ │ ├── service_locator_storage_client.py │ │ │ │ ├── service_storage_configuration.py │ │ │ │ └── service_storage_storage_client.py │ │ │ ├── session_management/ │ │ │ │ ├── multi_sessions_http.py │ │ │ │ ├── one_session_http.py │ │ │ │ ├── sm_basic.py │ │ │ │ ├── sm_beautifulsoup.py │ │ │ │ ├── sm_http.py │ │ │ │ ├── sm_parsel.py │ │ │ │ ├── sm_playwright.py │ │ │ │ └── sm_standalone.py │ │ │ ├── storage_clients/ │ │ │ │ ├── custom_storage_client_example.py │ │ │ │ ├── file_system_storage_client_basic_example.py │ │ │ │ ├── file_system_storage_client_configuration_example.py │ │ │ │ ├── memory_storage_client_basic_example.py │ │ │ │ ├── 
redis_storage_client_basic_example.py │ │ │ │ ├── redis_storage_client_configuration_example.py │ │ │ │ ├── registering_storage_clients_example.py │ │ │ │ ├── sql_storage_client_basic_example.py │ │ │ │ └── sql_storage_client_configuration_example.py │ │ │ ├── storages/ │ │ │ │ ├── cleaning_do_not_purge_example.py │ │ │ │ ├── cleaning_purge_explicitly_example.py │ │ │ │ ├── dataset_basic_example.py │ │ │ │ ├── dataset_with_crawler_example.py │ │ │ │ ├── dataset_with_crawler_explicit_example.py │ │ │ │ ├── helper_add_requests_example.py │ │ │ │ ├── helper_enqueue_links_example.py │ │ │ │ ├── kvs_basic_example.py │ │ │ │ ├── kvs_with_crawler_example.py │ │ │ │ ├── kvs_with_crawler_explicit_example.py │ │ │ │ ├── opening.py │ │ │ │ ├── rq_basic_example.py │ │ │ │ ├── rq_with_crawler_example.py │ │ │ │ └── rq_with_crawler_explicit_example.py │ │ │ └── trace_and_monitor_crawlers/ │ │ │ └── instrument_crawler.py │ │ ├── crawler_login.mdx │ │ ├── creating_web_archive.mdx │ │ ├── error_handling.mdx │ │ ├── http_clients.mdx │ │ ├── http_crawlers.mdx │ │ ├── playwright_crawler.mdx │ │ ├── playwright_crawler_adaptive.mdx │ │ ├── playwright_crawler_stagehand.mdx │ │ ├── proxy_management.mdx │ │ ├── request_loaders.mdx │ │ ├── request_router.mdx │ │ ├── running_in_web_server.mdx │ │ ├── scaling_crawlers.mdx │ │ ├── service_locator.mdx │ │ ├── session_management.mdx │ │ ├── storage_clients.mdx │ │ ├── storages.mdx │ │ └── trace_and_monitor_crawlers.mdx │ ├── introduction/ │ │ ├── 01_setting_up.mdx │ │ ├── 02_first_crawler.mdx │ │ ├── 03_adding_more_urls.mdx │ │ ├── 04_real_world_project.mdx │ │ ├── 05_crawling.mdx │ │ ├── 06_scraping.mdx │ │ ├── 07_saving_data.mdx │ │ ├── 08_refactoring.mdx │ │ ├── 09_running_in_cloud.mdx │ │ ├── code_examples/ │ │ │ ├── 02_bs.py │ │ │ ├── 02_bs_better.py │ │ │ ├── 02_request_queue.py │ │ │ ├── 03_enqueue_strategy.py │ │ │ ├── 03_finding_new_links.py │ │ │ ├── 03_globs.py │ │ │ ├── 03_original_code.py │ │ │ ├── 03_transform_request.py │ │ │ ├── 
04_sanity_check.py │ │ │ ├── 05_crawling_detail.py │ │ │ ├── 05_crawling_listing.py │ │ │ ├── 06_scraping.py │ │ │ ├── 07_final_code.py │ │ │ ├── 07_first_code.py │ │ │ ├── 08_main.py │ │ │ ├── 08_routes.py │ │ │ ├── 09_apify_sdk.py │ │ │ ├── __init__.py │ │ │ └── routes.py │ │ └── index.mdx │ ├── pyproject.toml │ ├── quick-start/ │ │ ├── code_examples/ │ │ │ ├── beautifulsoup_crawler_example.py │ │ │ ├── parsel_crawler_example.py │ │ │ ├── playwright_crawler_example.py │ │ │ └── playwright_crawler_headful_example.py │ │ └── index.mdx │ └── upgrading/ │ ├── upgrading_to_v0x.md │ └── upgrading_to_v1.md ├── pyproject.toml ├── renovate.json ├── src/ │ └── crawlee/ │ ├── __init__.py │ ├── _autoscaling/ │ │ ├── __init__.py │ │ ├── _types.py │ │ ├── autoscaled_pool.py │ │ ├── py.typed │ │ ├── snapshotter.py │ │ └── system_status.py │ ├── _cli.py │ ├── _consts.py │ ├── _log_config.py │ ├── _request.py │ ├── _service_locator.py │ ├── _types.py │ ├── _utils/ │ │ ├── __init__.py │ │ ├── blocked.py │ │ ├── byte_size.py │ │ ├── console.py │ │ ├── context.py │ │ ├── crypto.py │ │ ├── docs.py │ │ ├── file.py │ │ ├── globs.py │ │ ├── html_to_text.py │ │ ├── models.py │ │ ├── raise_if_too_many_kwargs.py │ │ ├── recoverable_state.py │ │ ├── recurring_task.py │ │ ├── requests.py │ │ ├── robots.py │ │ ├── sitemap.py │ │ ├── system.py │ │ ├── time.py │ │ ├── try_import.py │ │ ├── urls.py │ │ ├── wait.py │ │ └── web.py │ ├── browsers/ │ │ ├── __init__.py │ │ ├── _browser_controller.py │ │ ├── _browser_plugin.py │ │ ├── _browser_pool.py │ │ ├── _playwright_browser.py │ │ ├── _playwright_browser_controller.py │ │ ├── _playwright_browser_plugin.py │ │ ├── _types.py │ │ └── py.typed │ ├── configuration.py │ ├── crawlers/ │ │ ├── __init__.py │ │ ├── _abstract_http/ │ │ │ ├── __init__.py │ │ │ ├── _abstract_http_crawler.py │ │ │ ├── _abstract_http_parser.py │ │ │ ├── _http_crawling_context.py │ │ │ └── py.typed │ │ ├── _adaptive_playwright/ │ │ │ ├── __init__.py │ │ │ ├── 
_adaptive_playwright_crawler.py │ │ │ ├── _adaptive_playwright_crawler_statistics.py │ │ │ ├── _adaptive_playwright_crawling_context.py │ │ │ ├── _rendering_type_predictor.py │ │ │ ├── _result_comparator.py │ │ │ └── _utils.py │ │ ├── _basic/ │ │ │ ├── __init__.py │ │ │ ├── _basic_crawler.py │ │ │ ├── _basic_crawling_context.py │ │ │ ├── _context_pipeline.py │ │ │ ├── _context_utils.py │ │ │ ├── _logging_utils.py │ │ │ └── py.typed │ │ ├── _beautifulsoup/ │ │ │ ├── __init__.py │ │ │ ├── _beautifulsoup_crawler.py │ │ │ ├── _beautifulsoup_crawling_context.py │ │ │ ├── _beautifulsoup_parser.py │ │ │ ├── _utils.py │ │ │ └── py.typed │ │ ├── _http/ │ │ │ ├── __init__.py │ │ │ ├── _http_crawler.py │ │ │ └── _http_parser.py │ │ ├── _parsel/ │ │ │ ├── __init__.py │ │ │ ├── _parsel_crawler.py │ │ │ ├── _parsel_crawling_context.py │ │ │ ├── _parsel_parser.py │ │ │ └── _utils.py │ │ ├── _playwright/ │ │ │ ├── __init__.py │ │ │ ├── _playwright_crawler.py │ │ │ ├── _playwright_crawling_context.py │ │ │ ├── _playwright_http_client.py │ │ │ ├── _playwright_post_nav_crawling_context.py │ │ │ ├── _playwright_pre_nav_crawling_context.py │ │ │ ├── _types.py │ │ │ └── _utils.py │ │ ├── _types.py │ │ └── py.typed │ ├── errors.py │ ├── events/ │ │ ├── __init__.py │ │ ├── _event_manager.py │ │ ├── _local_event_manager.py │ │ ├── _types.py │ │ └── py.typed │ ├── fingerprint_suite/ │ │ ├── __init__.py │ │ ├── _browserforge_adapter.py │ │ ├── _consts.py │ │ ├── _fingerprint_generator.py │ │ ├── _header_generator.py │ │ ├── _types.py │ │ └── py.typed │ ├── http_clients/ │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── _curl_impersonate.py │ │ ├── _httpx.py │ │ └── _impit.py │ ├── otel/ │ │ ├── __init__.py │ │ └── crawler_instrumentor.py │ ├── project_template/ │ │ ├── cookiecutter.json │ │ ├── hooks/ │ │ │ ├── post_gen_project.py │ │ │ └── pre_gen_project.py │ │ ├── templates/ │ │ │ ├── main.py │ │ │ ├── main_beautifulsoup.py │ │ │ ├── main_parsel.py │ │ │ ├── main_playwright.py │ │ │ ├── 
main_playwright_camoufox.py │ │ │ ├── main_playwright_chrome.py │ │ │ ├── main_playwright_firefox.py │ │ │ ├── main_playwright_webkit.py │ │ │ ├── routes_beautifulsoup.py │ │ │ ├── routes_parsel.py │ │ │ └── routes_playwright.py │ │ └── {{cookiecutter.project_name}}/ │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── pyproject.toml │ │ ├── requirements.txt │ │ └── {{cookiecutter.__package_name}}/ │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── main.py │ │ └── routes.py │ ├── proxy_configuration.py │ ├── py.typed │ ├── request_loaders/ │ │ ├── __init__.py │ │ ├── _request_list.py │ │ ├── _request_loader.py │ │ ├── _request_manager.py │ │ ├── _request_manager_tandem.py │ │ └── _sitemap_request_loader.py │ ├── router.py │ ├── sessions/ │ │ ├── __init__.py │ │ ├── _cookies.py │ │ ├── _models.py │ │ ├── _session.py │ │ ├── _session_pool.py │ │ └── py.typed │ ├── statistics/ │ │ ├── __init__.py │ │ ├── _error_snapshotter.py │ │ ├── _error_tracker.py │ │ ├── _models.py │ │ └── _statistics.py │ ├── storage_clients/ │ │ ├── __init__.py │ │ ├── _base/ │ │ │ ├── __init__.py │ │ │ ├── _dataset_client.py │ │ │ ├── _key_value_store_client.py │ │ │ ├── _request_queue_client.py │ │ │ ├── _storage_client.py │ │ │ └── py.typed │ │ ├── _file_system/ │ │ │ ├── __init__.py │ │ │ ├── _dataset_client.py │ │ │ ├── _key_value_store_client.py │ │ │ ├── _request_queue_client.py │ │ │ ├── _storage_client.py │ │ │ ├── _utils.py │ │ │ └── py.typed │ │ ├── _memory/ │ │ │ ├── __init__.py │ │ │ ├── _dataset_client.py │ │ │ ├── _key_value_store_client.py │ │ │ ├── _request_queue_client.py │ │ │ ├── _storage_client.py │ │ │ └── py.typed │ │ ├── _redis/ │ │ │ ├── __init__.py │ │ │ ├── _client_mixin.py │ │ │ ├── _dataset_client.py │ │ │ ├── _key_value_store_client.py │ │ │ ├── _request_queue_client.py │ │ │ ├── _storage_client.py │ │ │ ├── _utils.py │ │ │ ├── lua_scripts/ │ │ │ │ ├── atomic_bloom_add_requests.lua │ │ │ │ ├── atomic_fetch_request.lua │ │ │ │ ├── 
atomic_set_add_requests.lua │ │ │ │ └── reclaim_stale_requests.lua │ │ │ └── py.typed │ │ ├── _sql/ │ │ │ ├── __init__.py │ │ │ ├── _client_mixin.py │ │ │ ├── _dataset_client.py │ │ │ ├── _db_models.py │ │ │ ├── _key_value_store_client.py │ │ │ ├── _request_queue_client.py │ │ │ ├── _storage_client.py │ │ │ └── py.typed │ │ ├── models.py │ │ └── py.typed │ └── storages/ │ ├── __init__.py │ ├── _base.py │ ├── _dataset.py │ ├── _key_value_store.py │ ├── _request_queue.py │ ├── _storage_instance_manager.py │ ├── _utils.py │ └── py.typed ├── tests/ │ ├── __init__.py │ ├── e2e/ │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── project_template/ │ │ ├── test_static_crawlers_templates.py │ │ └── utils.py │ └── unit/ │ ├── README.md │ ├── __init__.py │ ├── _autoscaling/ │ │ ├── test_autoscaled_pool.py │ │ ├── test_snapshotter.py │ │ └── test_system_status.py │ ├── _statistics/ │ │ ├── test_error_tracker.py │ │ ├── test_periodic_logging.py │ │ ├── test_persistence.py │ │ ├── test_request_max_duration.py │ │ └── test_request_processing_record.py │ ├── _utils/ │ │ ├── test_byte_size.py │ │ ├── test_console.py │ │ ├── test_crypto.py │ │ ├── test_file.py │ │ ├── test_globs.py │ │ ├── test_html_to_text.py │ │ ├── test_measure_time.py │ │ ├── test_raise_if_too_many_kwargs.py │ │ ├── test_recurring_task.py │ │ ├── test_requests.py │ │ ├── test_robots.py │ │ ├── test_shared_timeout.py │ │ ├── test_sitemap.py │ │ ├── test_system.py │ │ ├── test_timedelta_ms.py │ │ └── test_urls.py │ ├── browsers/ │ │ ├── test_browser_pool.py │ │ ├── test_playwright_browser.py │ │ ├── test_playwright_browser_controller.py │ │ └── test_playwright_browser_plugin.py │ ├── conftest.py │ ├── crawlers/ │ │ ├── _adaptive_playwright/ │ │ │ ├── test_adaptive_playwright_crawler.py │ │ │ ├── test_adaptive_playwright_crawler_statistics.py │ │ │ ├── test_adaptive_playwright_crawling_context.py │ │ │ └── test_predictor.py │ │ ├── _basic/ │ │ │ ├── test_basic_crawler.py │ │ │ └── test_context_pipeline.py │ │ ├── 
_beautifulsoup/ │ │ │ └── test_beautifulsoup_crawler.py │ │ ├── _http/ │ │ │ └── test_http_crawler.py │ │ ├── _parsel/ │ │ │ └── test_parsel_crawler.py │ │ └── _playwright/ │ │ ├── test_playwright_crawler.py │ │ └── test_utils.py │ ├── events/ │ │ ├── test_event_manager.py │ │ └── test_local_event_manager.py │ ├── fingerprint_suite/ │ │ ├── test_adapters.py │ │ └── test_header_generator.py │ ├── http_clients/ │ │ ├── test_http_clients.py │ │ └── test_httpx.py │ ├── otel/ │ │ └── test_crawler_instrumentor.py │ ├── proxy_configuration/ │ │ ├── test_new_proxy_info.py │ │ └── test_tiers.py │ ├── request_loaders/ │ │ ├── test_request_list.py │ │ └── test_sitemap_request_loader.py │ ├── server.py │ ├── server_endpoints.py │ ├── server_static/ │ │ └── test.js │ ├── sessions/ │ │ ├── test_cookies.py │ │ ├── test_models.py │ │ ├── test_session.py │ │ └── test_session_pool.py │ ├── storage_clients/ │ │ ├── _file_system/ │ │ │ ├── test_fs_dataset_client.py │ │ │ ├── test_fs_kvs_client.py │ │ │ └── test_fs_rq_client.py │ │ ├── _memory/ │ │ │ ├── test_memory_dataset_client.py │ │ │ ├── test_memory_kvs_client.py │ │ │ └── test_memory_rq_client.py │ │ ├── _redis/ │ │ │ ├── test_redis_dataset_client.py │ │ │ ├── test_redis_kvs_client.py │ │ │ └── test_redis_rq_client.py │ │ └── _sql/ │ │ ├── test_sql_dataset_client.py │ │ ├── test_sql_kvs_client.py │ │ └── test_sql_rq_client.py │ ├── storages/ │ │ ├── conftest.py │ │ ├── test_dataset.py │ │ ├── test_key_value_store.py │ │ ├── test_request_manager_tandem.py │ │ ├── test_request_queue.py │ │ └── test_storage_instance_manager.py │ ├── test_cli.py │ ├── test_configuration.py │ ├── test_log_config.py │ ├── test_router.py │ ├── test_service_locator.py │ └── utils.py ├── typos.toml └── website/ ├── .eslintrc.json ├── .yarnrc.yml ├── babel.config.js ├── build_api_reference.sh ├── docusaurus.config.js ├── generate_module_shortcuts.py ├── package.json ├── patches/ │ ├── @docusaurus+core+3.4.0.patch │ └── @docusaurus+core+3.5.2.patch ├── 
roa-loader/ │ ├── index.js │ └── package.json ├── sidebars.js ├── src/ │ ├── components/ │ │ ├── ApiLink.jsx │ │ ├── Button.jsx │ │ ├── Button.module.css │ │ ├── CopyButton.jsx │ │ ├── CopyButton.module.css │ │ ├── Gradients.jsx │ │ ├── Highlights.jsx │ │ ├── Highlights.module.css │ │ ├── Homepage/ │ │ │ ├── HomepageCliExample.jsx │ │ │ ├── HomepageCliExample.module.css │ │ │ ├── HomepageCtaSection.jsx │ │ │ ├── HomepageCtaSection.module.css │ │ │ ├── HomepageHeroSection.jsx │ │ │ ├── HomepageHeroSection.module.css │ │ │ ├── LanguageInfoWidget.jsx │ │ │ ├── LanguageInfoWidget.module.css │ │ │ ├── LanguageSwitch.jsx │ │ │ ├── LanguageSwitch.module.css │ │ │ ├── RiverSection.jsx │ │ │ ├── RiverSection.module.css │ │ │ ├── ThreeCardsWithIcon.jsx │ │ │ └── ThreeCardsWithIcon.module.css │ │ ├── LLMButtons.jsx │ │ ├── LLMButtons.module.css │ │ ├── RunnableCodeBlock.jsx │ │ └── RunnableCodeBlock.module.css │ ├── css/ │ │ └── custom.css │ ├── pages/ │ │ ├── home_page_example.py │ │ ├── index.js │ │ └── index.module.css │ ├── plugins/ │ │ └── docusaurus-plugin-segment/ │ │ ├── index.js │ │ └── segment.js │ └── theme/ │ ├── ColorModeToggle/ │ │ ├── index.js │ │ └── styles.module.css │ ├── DocItem/ │ │ ├── Content/ │ │ │ ├── index.js │ │ │ └── styles.module.css │ │ └── Layout/ │ │ ├── index.js │ │ └── styles.module.css │ ├── Footer/ │ │ ├── LinkItem/ │ │ │ ├── index.js │ │ │ └── index.module.css │ │ ├── index.js │ │ └── index.module.css │ ├── MDXComponents/ │ │ └── A.js │ ├── Navbar/ │ │ ├── Content/ │ │ │ ├── index.js │ │ │ └── styles.module.css │ │ ├── Logo/ │ │ │ ├── index.js │ │ │ └── index.module.css │ │ └── MobileSidebar/ │ │ ├── Header/ │ │ │ ├── index.js │ │ │ └── index.module.css │ │ ├── Layout/ │ │ │ └── index.js │ │ ├── PrimaryMenu/ │ │ │ └── index.js │ │ └── index.js │ └── NavbarItem/ │ └── ComponentTypes.js ├── static/ │ ├── .nojekyll │ ├── js/ │ │ └── custom.js │ └── robots.txt ├── tools/ │ ├── docs-prettier.config.js │ ├── utils/ │ │ └── externalLink.js │ └── 
website_gif/ │ └── website_gif.mjs └── tsconfig.eslint.json ================================================ FILE CONTENTS ================================================ ================================================ FILE: .editorconfig ================================================ root = true [*] indent_style = space indent_size = 4 charset = utf-8 trim_trailing_whitespace = true insert_final_newline = true end_of_line = lf [Makefile] indent_style = tab [{*.yaml, *.yml}] indent_size = 2 ================================================ FILE: .github/CODEOWNERS ================================================ # Documentation codeowner /docs/*.md @TC-MO /docs/*.mdx @TC-MO ================================================ FILE: .github/pull_request_template.md ================================================ ### Description - TODO ### Issues - Closes: #TODO ### Testing - TODO ### Checklist - [ ] CI passed ================================================ FILE: .github/workflows/_check_code.yaml ================================================ name: Code checks on: # Runs when manually triggered from the GitHub UI. workflow_dispatch: # Runs when invoked by another workflow. 
workflow_call: permissions: contents: read jobs: actions_lint_check: name: Actions lint check runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v6 - name: Run actionlint uses: rhysd/actionlint@v1.7.11 spell_check: name: Spell check runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v6 - name: Check spelling with typos uses: crate-ci/typos@v1 lint_check: name: Lint check uses: apify/workflows/.github/workflows/python_lint_check.yaml@main with: python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]' type_check: name: Type check uses: apify/workflows/.github/workflows/python_type_check.yaml@main with: python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]' ================================================ FILE: .github/workflows/_check_docs.yaml ================================================ name: Doc checks on: # Runs when manually triggered from the GitHub UI. workflow_dispatch: # Runs when invoked by another workflow. workflow_call: permissions: contents: read jobs: doc_checks: name: Doc checks uses: apify/workflows/.github/workflows/python_docs_check.yaml@main ================================================ FILE: .github/workflows/_release_docs.yaml ================================================ name: Doc release on: # Runs when manually triggered from the GitHub UI. workflow_dispatch: # Runs when invoked by another workflow. 
workflow_call: inputs: ref: required: true type: string permissions: contents: read env: NODE_VERSION: 22 PYTHON_VERSION: 3.14 CHECKOUT_REF: ${{ github.event_name == 'workflow_call' && inputs.ref || github.ref }} jobs: release_docs: name: Doc release environment: name: github-pages permissions: contents: write pages: write id-token: write runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v6 with: token: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }} ref: ${{ env.CHECKOUT_REF }} - name: Set up Node uses: actions/setup-node@v6 with: node-version: ${{ env.NODE_VERSION }} - name: Set up Python uses: actions/setup-python@v6 with: python-version: ${{ env.PYTHON_VERSION }} - name: Set up uv package manager uses: astral-sh/setup-uv@v7 with: python-version: ${{ env.PYTHON_VERSION }} - name: Install Python dependencies run: uv run poe install-dev - name: Build Docusaurus docs run: uv run poe build-docs env: APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }} SEGMENT_TOKEN: ${{ secrets.SEGMENT_TOKEN }} - name: Set up GitHub Pages uses: actions/configure-pages@v5 - name: Upload GitHub Pages artifact uses: actions/upload-pages-artifact@v4 with: path: ./website/build - name: Deploy artifact to GitHub Pages uses: actions/deploy-pages@v4 - name: Invalidate CloudFront cache run: | gh workflow run invalidate-cloudfront.yml \ --repo apify/apify-docs-private \ --field deployment=crawlee-web echo "✅ CloudFront cache invalidation workflow triggered successfully" env: GITHUB_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }} ================================================ FILE: .github/workflows/_tests.yaml ================================================ name: Tests on: # Runs when manually triggered from the GitHub UI. workflow_dispatch: # Runs when invoked by another workflow. 
workflow_call: permissions: contents: read jobs: unit_tests: name: Unit tests uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main secrets: inherit with: python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]' operating_systems: '["ubuntu-latest", "windows-latest", "macos-latest"]' python_version_for_codecov: "3.14" operating_system_for_codecov: ubuntu-latest tests_concurrency: "8" ================================================ FILE: .github/workflows/manual_release_stable.yaml ================================================ name: Stable release on: # Runs when manually triggered from the GitHub UI, with options to specify the type of release. workflow_dispatch: inputs: release_type: description: Release type required: true type: choice default: auto options: - auto - custom - patch - minor - major custom_version: description: The custom version to bump to (only for "custom" type) required: false type: string default: "" concurrency: group: release cancel-in-progress: false permissions: contents: read jobs: code_checks: name: Code checks uses: ./.github/workflows/_check_code.yaml release_prepare: name: Release prepare needs: [code_checks] runs-on: ubuntu-latest outputs: version_number: ${{ steps.release_prepare.outputs.version_number }} tag_name: ${{ steps.release_prepare.outputs.tag_name }} changelog: ${{ steps.release_prepare.outputs.changelog }} release_notes: ${{ steps.release_prepare.outputs.release_notes }} steps: - uses: apify/workflows/git-cliff-release@main name: Release prepare id: release_prepare with: release_type: ${{ inputs.release_type }} custom_version: ${{ inputs.custom_version }} existing_changelog_path: CHANGELOG.md changelog_update: name: Changelog update needs: [release_prepare] permissions: contents: write uses: apify/workflows/.github/workflows/python_bump_and_update_changelog.yaml@main with: version_number: ${{ needs.release_prepare.outputs.version_number }} changelog: ${{ needs.release_prepare.outputs.changelog }} 
secrets: inherit github_release: name: GitHub release needs: [release_prepare, changelog_update] runs-on: ubuntu-latest permissions: contents: write env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - name: GitHub release uses: softprops/action-gh-release@v2 with: tag_name: ${{ needs.release_prepare.outputs.tag_name }} name: ${{ needs.release_prepare.outputs.version_number }} target_commitish: ${{ needs.changelog_update.outputs.changelog_commitish }} body: ${{ needs.release_prepare.outputs.release_notes }} pypi_publish: name: PyPI publish needs: [release_prepare, changelog_update] runs-on: ubuntu-latest permissions: contents: write id-token: write # Required for OIDC authentication. environment: name: pypi url: https://pypi.org/project/crawlee steps: - name: Prepare distribution uses: apify/workflows/prepare-pypi-distribution@main with: package_name: crawlee is_prerelease: "" version_number: ${{ needs.release_prepare.outputs.version_number }} ref: ${{ needs.changelog_update.outputs.changelog_commitish }} # Publishes the package to PyPI using the official PyPA GitHub action with OIDC authentication. - name: Publish package to PyPI uses: pypa/gh-action-pypi-publish@release/v1 # TODO: add a job for publishing the package to Conda # https://github.com/apify/crawlee-python/issues/104 doc_release: name: Doc release needs: [changelog_update, pypi_publish] permissions: contents: write pages: write id-token: write uses: ./.github/workflows/_release_docs.yaml with: # Use the ref from the changelog update to include the updated changelog. ref: ${{ needs.changelog_update.outputs.changelog_commitish }} secrets: inherit ================================================ FILE: .github/workflows/on_issue.yaml ================================================ name: CI (issue) on: # Runs when a new issue is opened. 
issues: types: - opened permissions: contents: read jobs: label_issues: name: Add labels runs-on: ubuntu-latest permissions: issues: write steps: # Add the "t-tooling" label to all new issues - uses: actions/github-script@v8 with: script: | github.rest.issues.addLabels({ issue_number: context.issue.number, owner: context.repo.owner, repo: context.repo.repo, labels: ["t-tooling"] }) ================================================ FILE: .github/workflows/on_master.yaml ================================================ name: CI (master) on: push: branches: - master tags-ignore: - "**" # Ignore all tags to avoid duplicate executions triggered by tag pushes. concurrency: group: release cancel-in-progress: false permissions: contents: read jobs: doc_checks: name: Doc checks uses: ./.github/workflows/_check_docs.yaml doc_release: # Skip this for non-"docs" commits. if: startsWith(github.event.head_commit.message, 'docs') name: Doc release needs: [doc_checks] permissions: contents: write pages: write id-token: write uses: ./.github/workflows/_release_docs.yaml with: # Use the same ref as the one that triggered the workflow. ref: ${{ github.ref }} secrets: inherit code_checks: name: Code checks uses: ./.github/workflows/_check_code.yaml tests: # Skip this for "docs" commits. if: "!startsWith(github.event.head_commit.message, 'docs')" name: Tests uses: ./.github/workflows/_tests.yaml secrets: inherit release_prepare: # Run this only for "feat", "fix", "perf", "refactor" and "style" commits. 
if: >- startsWith(github.event.head_commit.message, 'feat') || startsWith(github.event.head_commit.message, 'fix') || startsWith(github.event.head_commit.message, 'perf') || startsWith(github.event.head_commit.message, 'refactor') || startsWith(github.event.head_commit.message, 'style') name: Release prepare needs: [code_checks, tests] runs-on: ubuntu-latest outputs: version_number: ${{ steps.release_prepare.outputs.version_number }} tag_name: ${{ steps.release_prepare.outputs.tag_name }} changelog: ${{ steps.release_prepare.outputs.changelog }} steps: - uses: apify/workflows/git-cliff-release@main id: release_prepare name: Release prepare with: release_type: prerelease existing_changelog_path: CHANGELOG.md changelog_update: name: Changelog update needs: [release_prepare] permissions: contents: write uses: apify/workflows/.github/workflows/python_bump_and_update_changelog.yaml@main with: version_number: ${{ needs.release_prepare.outputs.version_number }} changelog: ${{ needs.release_prepare.outputs.changelog }} secrets: inherit pypi_publish: name: PyPI publish needs: [release_prepare, changelog_update] runs-on: ubuntu-latest permissions: contents: write id-token: write # Required for OIDC authentication. environment: name: pypi url: https://pypi.org/project/crawlee steps: - name: Prepare distribution uses: apify/workflows/prepare-pypi-distribution@main with: package_name: crawlee is_prerelease: "yes" version_number: ${{ needs.release_prepare.outputs.version_number }} ref: ${{ needs.changelog_update.outputs.changelog_commitish }} - name: Publish package to PyPI uses: pypa/gh-action-pypi-publish@release/v1 doc_release_post_publish: name: Doc release post publish needs: [changelog_update, pypi_publish] permissions: contents: write pages: write id-token: write uses: ./.github/workflows/_release_docs.yaml with: # Use the ref from the changelog update to include the updated changelog. 
ref: ${{ needs.changelog_update.outputs.changelog_commitish }} secrets: inherit ================================================ FILE: .github/workflows/on_pull_request.yaml ================================================ name: CI (PR) on: # Runs whenever a pull request is opened or updated. pull_request: permissions: contents: read pull-requests: read jobs: pr_title_check: name: PR title check runs-on: ubuntu-latest steps: - uses: amannn/action-semantic-pull-request@v6.1.1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} doc_checks: name: Doc checks uses: ./.github/workflows/_check_docs.yaml code_checks: name: Code checks uses: ./.github/workflows/_check_code.yaml tests: name: Tests uses: ./.github/workflows/_tests.yaml secrets: inherit ================================================ FILE: .github/workflows/on_schedule_tests.yaml ================================================ name: Scheduled tests on: # Runs when manually triggered from the GitHub UI. workflow_dispatch: # Runs on a daily schedule at 06:00 UTC. 
schedule: - cron: '0 6 * * *' concurrency: group: scheduled-tests cancel-in-progress: false permissions: contents: read env: NODE_VERSION: 22 PYTHON_VERSION: 3.14 TESTS_CONCURRENCY: 1 jobs: end_to_end_tests: name: End-to-end tests strategy: fail-fast: false max-parallel: 12 matrix: crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup"] http-client: ["httpx", "curl_impersonate"] package-manager: ["pip", "uv", "poetry"] runs-on: "ubuntu-latest" steps: - name: Checkout repository uses: actions/checkout@v6 - name: Setup node uses: actions/setup-node@v6 with: node-version: ${{ env.NODE_VERSION }} - name: Install dependencies run: npm install -g apify-cli - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v6 with: python-version: ${{ env.PYTHON_VERSION }} # installed to be able to patch crawlee in the poetry.lock with custom wheel file for poetry based templates - name: Install poetry run: pipx install poetry - name: Set up uv package manager uses: astral-sh/setup-uv@v7 with: python-version: ${{ env.PYTHON_VERSION }} # Sync the project, but no need to install the browsers into the test runner environment. 
- name: Install Python dependencies run: uv run poe install-sync - name: Run templates end-to-end tests run: uv run poe e2e-templates-tests -m "${{ matrix.http-client }} and ${{ matrix.crawler-type }} and ${{ matrix.package-manager }}" env: APIFY_TEST_USER_API_TOKEN: ${{ secrets.APIFY_TEST_USER_API_TOKEN }} ================================================ FILE: .gitignore ================================================ # AI assistant files .agent .agents .ai .aider .claude .codeium .continue .copilot .cursor .gemini .llm .llms .openai .serena .windsurf .zed-ai AGENTS.local.md CLAUDE.local.md GEMINI.local.md # Cache __pycache__ .pytest_cache .ruff_cache .ty_cache .uv-cache # Virtual envs .direnv .env .envrc .python-version .venv # Other Python tools .ropeproject # Mise mise.toml .mise.toml # Egg and build artifacts *.egg-info/ *.egg dist/ build/ # Coverage reports .coverage* htmlcov coverage-unit.xml coverage-integration.xml # IDE, editors *~ .DS_Store .idea .nvim.lua .vscode .zed Session.vim # Docs docs/changelog.md # Website build artifacts, node dependencies website/build website/node_modules website/.yarn website/.docusaurus website/api-typedoc-generated.json website/apify-shared-docspec-dump.jsonl website/docspec-dump.jsonl website/module_shortcuts.json website/typedoc-types* # npm lockfile (we use yarn) website/package-lock.json # Default directory for memory storage storage/ # Tmp dir tmp/ ================================================ FILE: .markdownlint.yaml ================================================ default: true line-length: line_length: 120 MD007: indent: 4 MD004: style: dash no-inline-html: false ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: local hooks: - id: lint-check name: Lint check entry: uv run poe lint language: system pass_filenames: false - id: type-check name: Type check entry: uv run poe type-check language: system pass_filenames: false 
================================================ FILE: .rules.md ================================================ # Coding guidelines This file provides guidance to programming agents when working with code in this repository. ## Development Commands All commands use `uv` (package manager) and `poe` (task runner): ```bash # Install all dependencies (dev + extras + pre-commit + playwright) uv run poe install-dev # Run full check suite (lint + type-check + unit tests) uv run poe check-code # Linting (ruff format check + ruff check) uv run poe lint # Auto-fix formatting uv run poe format # Type checking (ty) uv run poe type-check # Run all unit tests uv run poe unit-tests # Run a single test file uv run pytest tests/unit/path/to/test_file.py # Run a single test by name uv run pytest tests/unit/path/to/test_file.py::test_name -v # Run tests with coverage XML report uv run poe unit-tests-cov # Build package uv run poe build # Clean build artifacts uv run poe clean ``` Note: `uv run poe unit-tests` first runs tests marked `@pytest.mark.run_alone` in isolation, then runs the rest with `-x` (fail-fast) and parallelism via `pytest-xdist`. ## Code Style - **Linter/formatter**: Ruff with `select = ["ALL"]` and specific ignores - **Line length**: 120 characters - **Quotes**: Single quotes (double for docstrings) - **Docstrings**: Google format (enforced by Ruff) - **Type checker**: ty (Astral's type checker), target Python 3.10 - **Async mode**: pytest-asyncio in `auto` mode (no need for `@pytest.mark.asyncio`) - **Commit format**: Conventional Commits (`feat:`, `fix:`, `docs:`, `refactor:`, `test:`, etc.) 
## Architecture ### Crawler Hierarchy ``` BasicCrawler[TCrawlingContext, TStatisticsState] ├── AbstractHttpCrawler → HttpCrawler, BeautifulSoupCrawler, ParselCrawler ├── PlaywrightCrawler └── AdaptivePlaywrightCrawler (extends PlaywrightCrawler) ``` - **BasicCrawler** (`src/crawlee/crawlers/_basic/`): Core request lifecycle, autoscaling pool, retries, session management, router dispatch. Generic over `TCrawlingContext`. - **AbstractHttpCrawler** (`src/crawlee/crawlers/_abstract_http/`): Adds HTTP client integration, response parsing, pre-navigation hooks. Generic over parser result type. - **PlaywrightCrawler** (`src/crawlee/crawlers/_playwright/`): Browser-based crawling with Playwright. ### Context Pipeline (Middleware Pattern) Contexts are progressively enhanced through `ContextPipeline` middleware: ``` BasicCrawlingContext → HttpCrawlingContext → ParsedHttpCrawlingContext → BeautifulSoupCrawlingContext ``` Each middleware is an async generator that wraps the next handler, enabling setup/teardown around request processing. ### Storage Layer Three-tier design: - **High-level**: `Dataset`, `KeyValueStore`, `RequestQueue` in `src/crawlee/storages/` - **Storage clients** (`src/crawlee/storage_clients/`): `FileSystemStorageClient` (default), `MemoryStorageClient`, `SqlStorageClient`, `RedisStorageClient` - **Instance caching**: `StorageInstanceManager` is a global singleton that caches storage instances by ID/name ### Service Locator `src/crawlee/_service_locator.py` is a global singleton managing `Configuration`, `EventManager`, `StorageClient`, and `StorageInstanceManager`. Prevents double-initialization with `ServiceConflictError`. 
### HTTP Clients Pluggable via `HttpClient` interface in `src/crawlee/http_clients/`: - `ImpitHttpClient` (default), `HttpxHttpClient`, `CurlImpersonateHttpClient` - Each provides `crawl()` (for crawler pipeline) and `send_request()` (for in-handler use) ### Request Model `Request` (`src/crawlee/_request.py`) uses `unique_key` for deduplication. Lifecycle states: `UNPROCESSED → DONE`. Crawlee-specific metadata stored in `user_data['__crawlee']`. ### Router ```python @crawler.router.default_handler async def handler(context: BeautifulSoupCrawlingContext): ... @crawler.router.handler(label='detail') async def detail(context: BeautifulSoupCrawlingContext): ... ``` Requests are routed by their `label` field; unmatched requests go to the default handler. ### Key Directories - `src/crawlee/crawlers/` - All crawler implementations - `src/crawlee/storages/` - Dataset, KVS, RequestQueue - `src/crawlee/storage_clients/` - Backend implementations - `src/crawlee/http_clients/` - HTTP client implementations - `src/crawlee/browsers/` - Playwright browser pool and plugins - `src/crawlee/sessions/` - Session management with cookie persistence - `src/crawlee/events/` - Event system (persist state, progress, aborting) - `src/crawlee/_autoscaling/` - Autoscaled pool for concurrency control - `src/crawlee/fingerprint_suite/` - Anti-bot fingerprint generation - `src/crawlee/project_template/` - CLI scaffolding template (excluded from linting) - `tests/unit/` - Unit tests - `tests/e2e/` - End-to-end tests (require `apify-cli` + API token) ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented in this file. 
## [1.6.0](https://github.com/apify/crawlee-python/releases/tag/v1.6.0) (2026-03-20) ### 🚀 Features - Allow non-href links extract & enqueue ([#1781](https://github.com/apify/crawlee-python/pull/1781)) ([6db365d](https://github.com/apify/crawlee-python/commit/6db365d1625206d8d691256c9cd4b44a821238bb)) by [@kozlice](https://github.com/kozlice) - Add `post_navigation_hooks` to crawlers ([#1795](https://github.com/apify/crawlee-python/pull/1795)) ([38ceda6](https://github.com/apify/crawlee-python/commit/38ceda635a18cb2f14efc7c8e8b67f3adb7e53fd)) by [@Mantisus](https://github.com/Mantisus) - Add page lifecycle hooks to `BrowserPool` ([#1791](https://github.com/apify/crawlee-python/pull/1791)) ([6f2ac13](https://github.com/apify/crawlee-python/commit/6f2ac13fea4cfa8a65e6e41430d3e8d28cc3a787)) by [@Mantisus](https://github.com/Mantisus) - Expose `BrowserType` and `CrawleePage` ([#1798](https://github.com/apify/crawlee-python/pull/1798)) ([b50b9f2](https://github.com/apify/crawlee-python/commit/b50b9f2a8396dcee2bd7eaf76c94d24912c2bc5f)) by [@Mantisus](https://github.com/Mantisus) - Expose `use_state` in `BasicCrawler` ([#1799](https://github.com/apify/crawlee-python/pull/1799)) ([d121873](https://github.com/apify/crawlee-python/commit/d121873a7f5902b911dd04b4aa9eaf75a8449323)) by [@Mantisus](https://github.com/Mantisus) ### 🐛 Bug Fixes - **redis:** Do not remove handled request data from request queue ([#1787](https://github.com/apify/crawlee-python/pull/1787)) ([3008c61](https://github.com/apify/crawlee-python/commit/3008c61dcbe07ccdf3c43f198b37582cc1356c9a)) by [@kozlice](https://github.com/kozlice) - **redis:** Update actual `Request` state in request queue Redis storage client ([#1789](https://github.com/apify/crawlee-python/pull/1789)) ([787231c](https://github.com/apify/crawlee-python/commit/787231cebeb863ee2b4395964a79a37053dbec01)) by [@Mantisus](https://github.com/Mantisus) ## [1.5.0](https://github.com/apify/crawlee-python/releases/tag/v1.5.0) (2026-03-06) ### 🚀 
Features - Use specialized Playwright docker images in templates ([#1757](https://github.com/apify/crawlee-python/pull/1757)) ([747c0cf](https://github.com/apify/crawlee-python/commit/747c0cf4a82296a2e3ea5cac5ef4c9578ea62a0c)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1756](https://github.com/apify/crawlee-python/issues/1756) - Add `discover_valid_sitemaps` utility ([#1777](https://github.com/apify/crawlee-python/pull/1777)) ([872447b](https://github.com/apify/crawlee-python/commit/872447b60bbdb3926068064a971492807b1bdfbb)) by [@Mantisus](https://github.com/Mantisus), closes [#1740](https://github.com/apify/crawlee-python/issues/1740) ### 🐛 Bug Fixes - Prevent list modification during iteration in BrowserPool ([#1703](https://github.com/apify/crawlee-python/pull/1703)) ([70309d9](https://github.com/apify/crawlee-python/commit/70309d9bf568d268a26b3ba6392be2b6ff284c65)) by [@vdusek](https://github.com/vdusek) - Fix `max_requests_per_crawl` excluding failed requests ([#1766](https://github.com/apify/crawlee-python/pull/1766)) ([d6bb0b4](https://github.com/apify/crawlee-python/commit/d6bb0b4a9dc5dd6668d076fbfa1b5e748deaee0d)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1765](https://github.com/apify/crawlee-python/issues/1765) - **playwright:** Dispose of `APIResponse` body for `send_request` ([#1771](https://github.com/apify/crawlee-python/pull/1771)) ([29d301b](https://github.com/apify/crawlee-python/commit/29d301bf9d7795f2fbaddb99235a7157b880f60c)) by [@kozlice](https://github.com/kozlice) - Return `None` from `add_request` when storage client fails to enqueue request ([#1775](https://github.com/apify/crawlee-python/pull/1775)) ([944753a](https://github.com/apify/crawlee-python/commit/944753a71956c30f3ce0896ffa24be7de5348933)) by [@Mantisus](https://github.com/Mantisus) - Re-use pre-existing browser context in `PlaywrightBrowserController` ([#1778](https://github.com/apify/crawlee-python/pull/1778))
([4487543](https://github.com/apify/crawlee-python/commit/44875433df83d433aa69ada458b91df3ad569f5e)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1776](https://github.com/apify/crawlee-python/issues/1776) ## [1.4.0](https://github.com/apify/crawlee-python/releases/tag/v1.4.0) (2026-02-17) ### 🚀 Features - Dynamic memory snapshots ([#1715](https://github.com/apify/crawlee-python/pull/1715)) ([568a7b1](https://github.com/apify/crawlee-python/commit/568a7b186dedda19ad814ee8af3cd8e256cc4ad9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1704](https://github.com/apify/crawlee-python/issues/1704) - Add `MySQL` and `MariaDB` support for `SqlStorageClient` ([#1749](https://github.com/apify/crawlee-python/pull/1749)) ([202b500](https://github.com/apify/crawlee-python/commit/202b5009ea5d35ea779eb5b8db1fc575f90ca7bb)) by [@Mantisus](https://github.com/Mantisus) ### 🐛 Bug Fixes - Make log levels consistent in ServiceLocator ([#1746](https://github.com/apify/crawlee-python/pull/1746)) ([4163413](https://github.com/apify/crawlee-python/commit/4163413049485b035c38efd6a4a7d41502a44cfc)) by [@janbuchar](https://github.com/janbuchar) - Fix `PlaywrightCrawler` unintentionally setting the global configuration ([#1747](https://github.com/apify/crawlee-python/pull/1747)) ([fa58438](https://github.com/apify/crawlee-python/commit/fa58438026eb72a6002c8d494725bf4e48b4407e)) by [@Pijukatel](https://github.com/Pijukatel) - Fix `Snapshotter` handling of out of order samples ([#1735](https://github.com/apify/crawlee-python/pull/1735)) ([387c712](https://github.com/apify/crawlee-python/commit/387c712306055d901b1c0df4a9666967f039aefd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1734](https://github.com/apify/crawlee-python/issues/1734) ### ⚡ Performance - Optimize metadata records processing in `SqlStorageClient` ([#1551](https://github.com/apify/crawlee-python/pull/1551)) 
([df1347a](https://github.com/apify/crawlee-python/commit/df1347aacf05c05980000d15b36b65996119ea86)) by [@Mantisus](https://github.com/Mantisus), closes [#1533](https://github.com/apify/crawlee-python/issues/1533) ## [1.3.2](https://github.com/apify/crawlee-python/releases/tag/v1.3.2) (2026-02-09) ### 🐛 Bug Fixes - Use `max()` instead of `min()` for `request_max_duration` statistic ([#1701](https://github.com/apify/crawlee-python/pull/1701)) ([85c4335](https://github.com/apify/crawlee-python/commit/85c43351a05ada1369b720061f6f1a7e158340b6)) by [@vdusek](https://github.com/vdusek) - Prevent mutation of default URL patterns list in `block_requests` ([#1702](https://github.com/apify/crawlee-python/pull/1702)) ([fcf9adb](https://github.com/apify/crawlee-python/commit/fcf9adb6a0cfeaa87ca482372d4e066584eb28d6)) by [@vdusek](https://github.com/vdusek) - Keep None values for `user_data` in `Request` ([#1707](https://github.com/apify/crawlee-python/pull/1707)) ([3c575bc](https://github.com/apify/crawlee-python/commit/3c575bc2b0f1c89c99d134ad3a3fa7455ccc6910)) by [@Mantisus](https://github.com/Mantisus), closes [#1706](https://github.com/apify/crawlee-python/issues/1706) - Respect `max_open_pages_per_browser` limit for `PlaywrightBrowserController` on concurrent `new_page` calls ([#1712](https://github.com/apify/crawlee-python/pull/1712)) ([2e5534b](https://github.com/apify/crawlee-python/commit/2e5534b98913d5cbd6b721b2423d063772024417)) by [@Mantisus](https://github.com/Mantisus) ## [1.3.1](https://github.com/apify/crawlee-python/releases/tag/v1.3.1) (2026-01-30) ### 🐛 Bug Fixes - Reset all counters in metadata with `purge` for `RequestQueue` ([#1686](https://github.com/apify/crawlee-python/pull/1686)) ([ee09260](https://github.com/apify/crawlee-python/commit/ee0926084589f1b6e15840b6185ec5433be3b72f)) by [@Mantisus](https://github.com/Mantisus), closes [#1682](https://github.com/apify/crawlee-python/issues/1682) - Set default `http3=False` for `ImpitHttpClient`
([#1685](https://github.com/apify/crawlee-python/pull/1685)) ([3f390f6](https://github.com/apify/crawlee-python/commit/3f390f677540a3905038d7db6a6d1efad32fd045)) by [@Mantisus](https://github.com/Mantisus), closes [#1683](https://github.com/apify/crawlee-python/issues/1683) - Prevent get_request from permanently blocking requests ([#1684](https://github.com/apify/crawlee-python/pull/1684)) ([da416f9](https://github.com/apify/crawlee-python/commit/da416f98fb453904d62e7d29d8f24611ffb3ba8d)) by [@Mirza-Samad-Ahmed-Baig](https://github.com/Mirza-Samad-Ahmed-Baig) - Do not share state between different crawlers unless requested ([#1669](https://github.com/apify/crawlee-python/pull/1669)) ([64c246b](https://github.com/apify/crawlee-python/commit/64c246bedea14f86e607d23adc5bec644c578364)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1627](https://github.com/apify/crawlee-python/issues/1627) ## [1.3.0](https://github.com/apify/crawlee-python/releases/tag/v1.3.0) (2026-01-20) ### 🚀 Features - Expose `AdaptivePlaywrightCrawlerStatisticState` for `AdaptivePlaywrightCrawler` ([#1635](https://github.com/apify/crawlee-python/pull/1635)) ([1bb4bcb](https://github.com/apify/crawlee-python/commit/1bb4bcb4ccbec347ad9c14f70e9e946d48e3c38e)) by [@Mantisus](https://github.com/Mantisus) ### 🐛 Bug Fixes - Prevent race condition in concurrent storage creation ([#1626](https://github.com/apify/crawlee-python/pull/1626)) ([7f17a43](https://github.com/apify/crawlee-python/commit/7f17a4347d5884962767e757a92ec173688fed7b)) by [@Mantisus](https://github.com/Mantisus), closes [#1621](https://github.com/apify/crawlee-python/issues/1621) - Create correct statistics for `AdaptivePlaywrightCrawler` on initialization with a custom parser ([#1637](https://github.com/apify/crawlee-python/pull/1637)) ([bff7260](https://github.com/apify/crawlee-python/commit/bff726055dd0d7e07a2c546b15cbee22abd85960)) by [@Mantisus](https://github.com/Mantisus), closes 
[#1630](https://github.com/apify/crawlee-python/issues/1630) - Fix adding extra link for `EnqueueLinksFunction` with `limit` ([#1674](https://github.com/apify/crawlee-python/pull/1674)) ([71d7867](https://github.com/apify/crawlee-python/commit/71d7867b14f7f07cac06899f5da006091af4a954)) by [@Mantisus](https://github.com/Mantisus), closes [#1673](https://github.com/apify/crawlee-python/issues/1673) ## [1.2.1](https://github.com/apify/crawlee-python/releases/tag/v1.2.1) (2025-12-16) ### 🐛 Bug Fixes - Fix short error summary ([#1605](https://github.com/apify/crawlee-python/pull/1605)) ([b751208](https://github.com/apify/crawlee-python/commit/b751208d9a56e9d923e4559baeba35e2eede0450)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1602](https://github.com/apify/crawlee-python/issues/1602) - Freeze core `Request` fields ([#1603](https://github.com/apify/crawlee-python/pull/1603)) ([ae6d86b](https://github.com/apify/crawlee-python/commit/ae6d86b8c82900116032596201d94cd7875aaadc)) by [@Mantisus](https://github.com/Mantisus) - Respect `enqueue_strategy` after redirects in `enqueue_links` ([#1607](https://github.com/apify/crawlee-python/pull/1607)) ([700df91](https://github.com/apify/crawlee-python/commit/700df91bc9be1299388030a3e48e4dbc6f5b85a0)) by [@Mantisus](https://github.com/Mantisus), closes [#1606](https://github.com/apify/crawlee-python/issues/1606) - Protect `Request` from partial mutations on request handler failure ([#1585](https://github.com/apify/crawlee-python/pull/1585)) ([a69caf8](https://github.com/apify/crawlee-python/commit/a69caf87edecc755287c53c8cc0ca4725af5d411)) by [@Mantisus](https://github.com/Mantisus), closes [#1514](https://github.com/apify/crawlee-python/issues/1514) ## [1.2.0](https://github.com/apify/crawlee-python/releases/tag/v1.2.0) (2025-12-08) ### 🚀 Features - Add additional kwargs to Crawler's export_data ([#1597](https://github.com/apify/crawlee-python/pull/1597)) 
([5977f37](https://github.com/apify/crawlee-python/commit/5977f376b93a7c0d4dd53f0d331a4b04fedba2c6)) by [@vdusek](https://github.com/vdusek), closes [#526](https://github.com/apify/crawlee-python/issues/526) - Add `goto_options` for `PlaywrightCrawler` ([#1599](https://github.com/apify/crawlee-python/pull/1599)) ([0b82f3b](https://github.com/apify/crawlee-python/commit/0b82f3b6fb175223ea2aa5b348afcd5fdb767972)) by [@Mantisus](https://github.com/Mantisus), closes [#1576](https://github.com/apify/crawlee-python/issues/1576) ### 🐛 Bug Fixes - Only apply requestHandlerTimeout to request handler ([#1474](https://github.com/apify/crawlee-python/pull/1474)) ([0dfb6c2](https://github.com/apify/crawlee-python/commit/0dfb6c2a13b6650736245fa39b3fbff397644df7)) by [@janbuchar](https://github.com/janbuchar) - Handle the case when `error_handler` returns `Request` ([#1595](https://github.com/apify/crawlee-python/pull/1595)) ([8a961a2](https://github.com/apify/crawlee-python/commit/8a961a2b07d0d33a7302dbb13c17f3d90999d390)) by [@Mantisus](https://github.com/Mantisus) - Align `Request.state` transitions with `Request` lifecycle ([#1601](https://github.com/apify/crawlee-python/pull/1601)) ([383225f](https://github.com/apify/crawlee-python/commit/383225f9f055d95ffb1302b8cf96f42ec264f1fc)) by [@Mantisus](https://github.com/Mantisus) ## [1.1.1](https://github.com/apify/crawlee-python/releases/tag/v1.1.1) (2025-12-02) ### 🐛 Bug Fixes - Unify separators in `unique_key` construction ([#1569](https://github.com/apify/crawlee-python/pull/1569)) ([af46a37](https://github.com/apify/crawlee-python/commit/af46a3733b059a8052489296e172f005def953f7)) by [@vdusek](https://github.com/vdusek), closes [#1512](https://github.com/apify/crawlee-python/issues/1512) - Fix `same-domain` strategy ignoring public suffix ([#1572](https://github.com/apify/crawlee-python/pull/1572)) ([3d018b2](https://github.com/apify/crawlee-python/commit/3d018b21a28a4bee493829783057188d6106a69b)) by 
[@Pijukatel](https://github.com/Pijukatel), closes [#1571](https://github.com/apify/crawlee-python/issues/1571) - Make context helpers work in `FailedRequestHandler` and `ErrorHandler` ([#1570](https://github.com/apify/crawlee-python/pull/1570)) ([b830019](https://github.com/apify/crawlee-python/commit/b830019350830ac33075316061659e2854f7f4a5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1532](https://github.com/apify/crawlee-python/issues/1532) - Fix non-ASCII character corruption in `FileSystemStorageClient` on systems without UTF-8 default encoding ([#1580](https://github.com/apify/crawlee-python/pull/1580)) ([f179f86](https://github.com/apify/crawlee-python/commit/f179f8671b0b6af9264450e4fef7e49d1cecd2bd)) by [@Mantisus](https://github.com/Mantisus), closes [#1579](https://github.com/apify/crawlee-python/issues/1579) - Respect `<base>` when enqueuing ([#1590](https://github.com/apify/crawlee-python/pull/1590)) ([de517a1](https://github.com/apify/crawlee-python/commit/de517a1629cc29b20568143eb64018f216d4ba33)) by [@Mantisus](https://github.com/Mantisus), closes [#1589](https://github.com/apify/crawlee-python/issues/1589) ## [1.1.0](https://github.com/apify/crawlee-python/releases/tag/v1.1.0) (2025-11-18) ### 🚀 Features - Add `chrome` `BrowserType` for `PlaywrightCrawler` to use the Chrome browser ([#1487](https://github.com/apify/crawlee-python/pull/1487)) ([b06937b](https://github.com/apify/crawlee-python/commit/b06937bbc3afe3c936b554bfc503365c1b2c526b)) by [@Mantisus](https://github.com/Mantisus), closes [#1071](https://github.com/apify/crawlee-python/issues/1071) - Add `RedisStorageClient` based on Redis v8.0+ ([#1406](https://github.com/apify/crawlee-python/pull/1406)) ([d08d13d](https://github.com/apify/crawlee-python/commit/d08d13d39203c24ab61fe254b0956d6744db3b5f)) by [@Mantisus](https://github.com/Mantisus) - Add support for Python 3.14 ([#1553](https://github.com/apify/crawlee-python/pull/1553)) 
([89e9130](https://github.com/apify/crawlee-python/commit/89e9130cabee0fbc974b29c26483b7fa0edf627c)) by [@Mantisus](https://github.com/Mantisus) - Add `transform_request_function` parameter for `SitemapRequestLoader` ([#1525](https://github.com/apify/crawlee-python/pull/1525)) ([dc90127](https://github.com/apify/crawlee-python/commit/dc901271849b239ba2a947e8ebff8e1815e8c4fb)) by [@Mantisus](https://github.com/Mantisus) ### 🐛 Bug Fixes - Improve indexing of the `request_queue_records` table for `SqlRequestQueueClient` ([#1527](https://github.com/apify/crawlee-python/pull/1527)) ([6509534](https://github.com/apify/crawlee-python/commit/65095346a9d8b703b10c91e0510154c3c48a4176)) by [@Mantisus](https://github.com/Mantisus), closes [#1526](https://github.com/apify/crawlee-python/issues/1526) - Improve error handling for `RobotsTxtFile.load` ([#1524](https://github.com/apify/crawlee-python/pull/1524)) ([596a311](https://github.com/apify/crawlee-python/commit/596a31184914a254b3e7a81fd2f48ea8eda7db49)) by [@Mantisus](https://github.com/Mantisus) - Fix `crawler_runtime` not being updated during the run, only at the end ([#1540](https://github.com/apify/crawlee-python/pull/1540)) ([0d6c3f6](https://github.com/apify/crawlee-python/commit/0d6c3f6d3337ddb6cab4873747c28cf95605d550)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1541](https://github.com/apify/crawlee-python/issues/1541) - Ensure persist state event emission when exiting `EventManager` context ([#1562](https://github.com/apify/crawlee-python/pull/1562)) ([6a44f17](https://github.com/apify/crawlee-python/commit/6a44f172600cbcacebab899082d6efc9105c4e03)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1560](https://github.com/apify/crawlee-python/issues/1560) ## [1.0.4](https://github.com/apify/crawlee-python/releases/tag/v1.0.4) (2025-10-24) ### 🐛 Bug Fixes - Respect `enqueue_strategy` in `enqueue_links` ([#1505](https://github.com/apify/crawlee-python/pull/1505))
([6ee04bc](https://github.com/apify/crawlee-python/commit/6ee04bc08c50a70f2e956a79d4ce5072a726c3a8)) by [@Mantisus](https://github.com/Mantisus), closes [#1504](https://github.com/apify/crawlee-python/issues/1504) - Exclude incorrect links before checking `robots.txt` ([#1502](https://github.com/apify/crawlee-python/pull/1502)) ([3273da5](https://github.com/apify/crawlee-python/commit/3273da5fee62ec9254666b376f382474c3532a56)) by [@Mantisus](https://github.com/Mantisus), closes [#1499](https://github.com/apify/crawlee-python/issues/1499) - Resolve compatibility issue between `SqlStorageClient` and `AdaptivePlaywrightCrawler` ([#1496](https://github.com/apify/crawlee-python/pull/1496)) ([ce172c4](https://github.com/apify/crawlee-python/commit/ce172c425a8643a1d4c919db4f5e5a6e47e91deb)) by [@Mantisus](https://github.com/Mantisus), closes [#1495](https://github.com/apify/crawlee-python/issues/1495) - Fix `BasicCrawler` statistics persistence ([#1490](https://github.com/apify/crawlee-python/pull/1490)) ([1eb1c19](https://github.com/apify/crawlee-python/commit/1eb1c19aa6f9dda4a0e3f7eda23f77a554f95076)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1501](https://github.com/apify/crawlee-python/issues/1501) - Save context state in result for `AdaptivePlaywrightCrawler` after isolated processing in `SubCrawler` ([#1488](https://github.com/apify/crawlee-python/pull/1488)) ([62b7c70](https://github.com/apify/crawlee-python/commit/62b7c70b54085fc65a660062028014f4502beba9)) by [@Mantisus](https://github.com/Mantisus), closes [#1483](https://github.com/apify/crawlee-python/issues/1483) ## [1.0.3](https://github.com/apify/crawlee-python/releases/tag/v1.0.3) (2025-10-17) ### 🐛 Bug Fixes - Add support for Pydantic v2.12 ([#1471](https://github.com/apify/crawlee-python/pull/1471)) ([35c1108](https://github.com/apify/crawlee-python/commit/35c110878c2f445a2866be2522ea8703e9b371dd)) by [@Mantisus](https://github.com/Mantisus), closes 
[#1464](https://github.com/apify/crawlee-python/issues/1464) - Fix database version warning message ([#1485](https://github.com/apify/crawlee-python/pull/1485)) ([18a545e](https://github.com/apify/crawlee-python/commit/18a545ee8add92e844acd0068f9cb8580a82e1c9)) by [@Mantisus](https://github.com/Mantisus) - Fix `reclaim_request` in `SqlRequestQueueClient` to correctly update the request state ([#1486](https://github.com/apify/crawlee-python/pull/1486)) ([1502469](https://github.com/apify/crawlee-python/commit/150246957f8f7f1ceb77bb77e3a02a903c50cae1)) by [@Mantisus](https://github.com/Mantisus), closes [#1484](https://github.com/apify/crawlee-python/issues/1484) - Fix `KeyValueStore.auto_saved_value` failing in some scenarios ([#1438](https://github.com/apify/crawlee-python/pull/1438)) ([b35dee7](https://github.com/apify/crawlee-python/commit/b35dee78180e57161b826641d45a61b8d8f6ef51)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1354](https://github.com/apify/crawlee-python/issues/1354) ## [1.0.2](https://github.com/apify/crawlee-python/releases/tag/v1.0.2) (2025-10-08) ### 🐛 Bug Fixes - Use Self type in the open() method of storage clients ([#1462](https://github.com/apify/crawlee-python/pull/1462)) ([4ec6f6c](https://github.com/apify/crawlee-python/commit/4ec6f6c08f81632197f602ff99151338b3eba6e7)) by [@janbuchar](https://github.com/janbuchar) - Add storages name validation ([#1457](https://github.com/apify/crawlee-python/pull/1457)) ([84de11a](https://github.com/apify/crawlee-python/commit/84de11a3a603503076f5b7df487c9abab68a9015)) by [@Mantisus](https://github.com/Mantisus), closes [#1434](https://github.com/apify/crawlee-python/issues/1434) - Pin pydantic version to <2.12.0 to avoid compatibility issues ([#1467](https://github.com/apify/crawlee-python/pull/1467)) ([f11b86f](https://github.com/apify/crawlee-python/commit/f11b86f7ed57f98e83dc1b52f15f2017a919bf59)) by [@vdusek](https://github.com/vdusek) ## 
[1.0.1](https://github.com/apify/crawlee-python/releases/tag/v1.0.1) (2025-10-06) ### 🐛 Bug Fixes - Fix memory leak in `PlaywrightCrawler` on browser context creation ([#1446](https://github.com/apify/crawlee-python/pull/1446)) ([bb181e5](https://github.com/apify/crawlee-python/commit/bb181e58d8070fba38e62d6e57fe981a00e5f035)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1443](https://github.com/apify/crawlee-python/issues/1443) - Update templates to handle optional httpx client ([#1440](https://github.com/apify/crawlee-python/pull/1440)) ([c087efd](https://github.com/apify/crawlee-python/commit/c087efd39baedf46ca3e5cae1ddc1acd6396e6c1)) by [@Pijukatel](https://github.com/Pijukatel) ## [1.0.0](https://github.com/apify/crawlee-python/releases/tag/v1.0.0) (2025-09-29) - Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v1) for more details. - Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v1) to ensure a smooth update. 
### 🚀 Features - Add utility for loading and parsing sitemaps and `SitemapRequestLoader` ([#1169](https://github.com/apify/crawlee-python/pull/1169)) ([66599f8](https://github.com/apify/crawlee-python/commit/66599f8d085f3a8622e130019b6fdce2325737de)) by [@Mantisus](https://github.com/Mantisus), closes [#1161](https://github.com/apify/crawlee-python/issues/1161) - Add periodic status logging and `status_message_callback` parameter for customization ([#1265](https://github.com/apify/crawlee-python/pull/1265)) ([b992fb2](https://github.com/apify/crawlee-python/commit/b992fb2a457dedd20fc3014d7a4a8afe14602342)) by [@Mantisus](https://github.com/Mantisus), closes [#96](https://github.com/apify/crawlee-python/issues/96) - Add crawlee-cli option to skip project installation ([#1294](https://github.com/apify/crawlee-python/pull/1294)) ([4d5aef0](https://github.com/apify/crawlee-python/commit/4d5aef05613d10c1442fe449d1cf0f63392c98e3)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1122](https://github.com/apify/crawlee-python/issues/1122) - Improve `Crawlee` CLI help text ([#1297](https://github.com/apify/crawlee-python/pull/1297)) ([afbe10f](https://github.com/apify/crawlee-python/commit/afbe10f15d93353f5bc551bf9f193414179d0dd7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1295](https://github.com/apify/crawlee-python/issues/1295) - Add basic `OpenTelemetry` instrumentation ([#1255](https://github.com/apify/crawlee-python/pull/1255)) ([a92d8b3](https://github.com/apify/crawlee-python/commit/a92d8b3f843ee795bba7e14710bb1faa1fdbf292)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1254](https://github.com/apify/crawlee-python/issues/1254) - Add `ImpitHttpClient` HTTP client using the `impit` library ([#1151](https://github.com/apify/crawlee-python/pull/1151)) ([0d0d268](https://github.com/apify/crawlee-python/commit/0d0d2681a4379c0e7ba54c49c86dabfef641610f)) by [@Mantisus](https://github.com/Mantisus) - Prevent overloading system memory when
running locally ([#1270](https://github.com/apify/crawlee-python/pull/1270)) ([30de3bd](https://github.com/apify/crawlee-python/commit/30de3bd7722cbc34db9fc582b4bda7dc2dfa90ff)) by [@janbuchar](https://github.com/janbuchar), closes [#1232](https://github.com/apify/crawlee-python/issues/1232) - Expose `PlaywrightPersistentBrowser` class ([#1314](https://github.com/apify/crawlee-python/pull/1314)) ([b5fa955](https://github.com/apify/crawlee-python/commit/b5fa95508d7c099ff3a342577f338439283a975f)) by [@Mantisus](https://github.com/Mantisus) - Add `impit` option for Crawlee CLI ([#1312](https://github.com/apify/crawlee-python/pull/1312)) ([508d7ce](https://github.com/apify/crawlee-python/commit/508d7ce4d998f37ab2adcf9c057c3c635a69f863)) by [@Mantisus](https://github.com/Mantisus) - Persist RequestList state ([#1274](https://github.com/apify/crawlee-python/pull/1274)) ([cc68014](https://github.com/apify/crawlee-python/commit/cc680147ba3cc8b35b9da70274e53e6f5dd92434)) by [@janbuchar](https://github.com/janbuchar), closes [#99](https://github.com/apify/crawlee-python/issues/99) - Persist `DefaultRenderingTypePredictor` state ([#1340](https://github.com/apify/crawlee-python/pull/1340)) ([fad4c25](https://github.com/apify/crawlee-python/commit/fad4c25fc712915c4a45b24e3290b6f5dbd8a683)) by [@Mantisus](https://github.com/Mantisus), closes [#1272](https://github.com/apify/crawlee-python/issues/1272) - Persist the `SitemapRequestLoader` state ([#1347](https://github.com/apify/crawlee-python/pull/1347)) ([27ef9ad](https://github.com/apify/crawlee-python/commit/27ef9ad194552ea9f1321d91a7a52054be9a8a51)) by [@Mantisus](https://github.com/Mantisus), closes [#1269](https://github.com/apify/crawlee-python/issues/1269) - Add support for NDU storages ([#1401](https://github.com/apify/crawlee-python/pull/1401)) ([5dbd212](https://github.com/apify/crawlee-python/commit/5dbd212663e7abc37535713f4c6e3a5bbf30a12e)) by [@vdusek](https://github.com/vdusek), closes 
[#1175](https://github.com/apify/crawlee-python/issues/1175) - Add RQ id, name, alias args to `add_requests` and `enqueue_links` methods ([#1413](https://github.com/apify/crawlee-python/pull/1413)) ([1cae2bc](https://github.com/apify/crawlee-python/commit/1cae2bca0b1508fcb3cb419dc239caf33e20a7ef)) by [@Mantisus](https://github.com/Mantisus), closes [#1402](https://github.com/apify/crawlee-python/issues/1402) - Add `SqlStorageClient` based on `sqlalchemy` v2+ ([#1339](https://github.com/apify/crawlee-python/pull/1339)) ([07c75a0](https://github.com/apify/crawlee-python/commit/07c75a078b443b58bfaaeb72eb2aa1439458dc47)) by [@Mantisus](https://github.com/Mantisus), closes [#307](https://github.com/apify/crawlee-python/issues/307) ### 🐛 Bug Fixes - Fix memory estimation not working on MacOS ([#1330](https://github.com/apify/crawlee-python/pull/1330)) ([ab020eb](https://github.com/apify/crawlee-python/commit/ab020eb821a75723225b652d64babd84c368183f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1329](https://github.com/apify/crawlee-python/issues/1329) - Fix retry count to not count the original request ([#1328](https://github.com/apify/crawlee-python/pull/1328)) ([74fa1d9](https://github.com/apify/crawlee-python/commit/74fa1d936cb3c29cf62d87862a96b4266694af2f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1326](https://github.com/apify/crawlee-python/issues/1326) - [**breaking**] Remove unused "stats" field from RequestQueueMetadata ([#1331](https://github.com/apify/crawlee-python/pull/1331)) ([0a63bef](https://github.com/apify/crawlee-python/commit/0a63bef514b0bdcd3d6f208b386f706d0fe561e6)) by [@vdusek](https://github.com/vdusek) - Ignore unknown parameters passed in cookies ([#1336](https://github.com/apify/crawlee-python/pull/1336)) ([50d3ef7](https://github.com/apify/crawlee-python/commit/50d3ef7540551383d26d40f3404b435bde35b47d)) by [@Mantisus](https://github.com/Mantisus), closes [#1333](https://github.com/apify/crawlee-python/issues/1333) - 
Fix `timeout` for `stream` method in `ImpitHttpClient` ([#1352](https://github.com/apify/crawlee-python/pull/1352)) ([54b693b](https://github.com/apify/crawlee-python/commit/54b693b838f135a596e1e9493b565bc558b19a3a)) by [@Mantisus](https://github.com/Mantisus) - Include reason in the session rotation warning logs ([#1363](https://github.com/apify/crawlee-python/pull/1363)) ([d6d7a45](https://github.com/apify/crawlee-python/commit/d6d7a45dd64a906419d9552c45062d726cbb1a0f)) by [@vdusek](https://github.com/vdusek), closes [#1318](https://github.com/apify/crawlee-python/issues/1318) - Improve crawler statistics logging ([#1364](https://github.com/apify/crawlee-python/pull/1364)) ([1eb6da5](https://github.com/apify/crawlee-python/commit/1eb6da5dd85870124593dcad877284ccaed9c0ce)) by [@vdusek](https://github.com/vdusek), closes [#1317](https://github.com/apify/crawlee-python/issues/1317) - Do not add a request that is already in progress to `MemoryRequestQueueClient` ([#1384](https://github.com/apify/crawlee-python/pull/1384)) ([3af326c](https://github.com/apify/crawlee-python/commit/3af326c9dfa8fffd56a42ca42981374613739e39)) by [@Mantisus](https://github.com/Mantisus), closes [#1383](https://github.com/apify/crawlee-python/issues/1383) - Save `RequestQueueState` for `FileSystemRequestQueueClient` in default KVS ([#1411](https://github.com/apify/crawlee-python/pull/1411)) ([6ee60a0](https://github.com/apify/crawlee-python/commit/6ee60a08ac1f9414e1b792f4935cc3799cb5089a)) by [@Mantisus](https://github.com/Mantisus), closes [#1410](https://github.com/apify/crawlee-python/issues/1410) - Set default desired concurrency for non-browser crawlers to 10 ([#1419](https://github.com/apify/crawlee-python/pull/1419)) ([1cc9401](https://github.com/apify/crawlee-python/commit/1cc940197600d2539bda967880d7f9d241eb8c3e)) by [@vdusek](https://github.com/vdusek) ### 🚜 Refactor - [**breaking**] Introduce new storage client system ([#1194](https://github.com/apify/crawlee-python/pull/1194)) 
([de1c03f](https://github.com/apify/crawlee-python/commit/de1c03f70dbd4ae1773fd49c632b3cfcfab82c26)) by [@vdusek](https://github.com/vdusek), closes [#92](https://github.com/apify/crawlee-python/issues/92), [#147](https://github.com/apify/crawlee-python/issues/147), [#783](https://github.com/apify/crawlee-python/issues/783), [#1247](https://github.com/apify/crawlee-python/issues/1247) - [**breaking**] Split `BrowserType` literal into two different literals based on context ([#1070](https://github.com/apify/crawlee-python/pull/1070)) ([72b5698](https://github.com/apify/crawlee-python/commit/72b5698fa0647ea02b08da5651736cc37c4c0f6a)) by [@Pijukatel](https://github.com/Pijukatel) - [**breaking**] Change method `HttpResponse.read` from sync to async ([#1296](https://github.com/apify/crawlee-python/pull/1296)) ([83fa8a4](https://github.com/apify/crawlee-python/commit/83fa8a416b6d2d4e27c678b9bf99bd1b8799f57b)) by [@Mantisus](https://github.com/Mantisus) - [**breaking**] Replace `HttpxHttpClient` with `ImpitHttpClient` as default HTTP client ([#1307](https://github.com/apify/crawlee-python/pull/1307)) ([c803a97](https://github.com/apify/crawlee-python/commit/c803a976776a76846866d533e3a3ee8144e248c4)) by [@Mantisus](https://github.com/Mantisus), closes [#1079](https://github.com/apify/crawlee-python/issues/1079) - [**breaking**] Change Dataset unwind parameter to accept list of strings ([#1357](https://github.com/apify/crawlee-python/pull/1357)) ([862a203](https://github.com/apify/crawlee-python/commit/862a20398f00fe91802fe7a1ccd58b05aee053a1)) by [@vdusek](https://github.com/vdusek) - [**breaking**] Remove `Request.id` field ([#1366](https://github.com/apify/crawlee-python/pull/1366)) ([32f3580](https://github.com/apify/crawlee-python/commit/32f3580e9775a871924ab1233085d0c549c4cd52)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1358](https://github.com/apify/crawlee-python/issues/1358) - [**breaking**] Refactor storage creation and caching, configuration and 
services ([#1386](https://github.com/apify/crawlee-python/pull/1386)) ([04649bd](https://github.com/apify/crawlee-python/commit/04649bde60d46b2bc18ae4f6e3fd9667d02a9cef)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1379](https://github.com/apify/crawlee-python/issues/1379) ## [0.6.12](https://github.com/apify/crawlee-python/releases/tag/v0.6.12) (2025-07-30) ### 🚀 Features - Add `retire_browser_after_page_count` parameter for `BrowserPool` ([#1266](https://github.com/apify/crawlee-python/pull/1266)) ([603aa2b](https://github.com/apify/crawlee-python/commit/603aa2b192ef4bc42d88244bd009fffdb0614c06)) by [@Mantisus](https://github.com/Mantisus) ### 🐛 Bug Fixes - Use `perf_counter_ns` for request duration tracking ([#1260](https://github.com/apify/crawlee-python/pull/1260)) ([9e92f6b](https://github.com/apify/crawlee-python/commit/9e92f6b54400ce5004fbab770e2e4ac42f73148f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1256](https://github.com/apify/crawlee-python/issues/1256) - Fix memory estimation not working on MacOS ([#1330](https://github.com/apify/crawlee-python/pull/1330)) ([8558954](https://github.com/apify/crawlee-python/commit/8558954feeb7d5e91378186974a29851fedae9c8)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1329](https://github.com/apify/crawlee-python/issues/1329) - Fix retry count to not count the original request ([#1328](https://github.com/apify/crawlee-python/pull/1328)) ([1aff3aa](https://github.com/apify/crawlee-python/commit/1aff3aaf0cdbe452a3731192449a445e5b2d7a63)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1326](https://github.com/apify/crawlee-python/issues/1326) - Ignore unknown parameters passed in cookies ([#1336](https://github.com/apify/crawlee-python/pull/1336)) ([0f2610c](https://github.com/apify/crawlee-python/commit/0f2610c0ee1154dc004de60fc57fe7c9f478166a)) by [@Mantisus](https://github.com/Mantisus), closes [#1333](https://github.com/apify/crawlee-python/issues/1333) ## [0.6.11](https://github.com/apify/crawlee-python/releases/tag/v0.6.11) (2025-06-23) ### 🚀 Features - Add `stream` method for `HttpClient`
([#1241](https://github.com/apify/crawlee-python/pull/1241)) ([95c68b0](https://github.com/apify/crawlee-python/commit/95c68b0b2d0bf9e093c1d0ee1002625172f7a868)) by [@Mantisus](https://github.com/Mantisus) ### 🐛 Bug Fixes - Fix `ClientSnapshot` overload calculation ([#1228](https://github.com/apify/crawlee-python/pull/1228)) ([a4fc1b6](https://github.com/apify/crawlee-python/commit/a4fc1b6e83143650666108c289c084ea0463b80c)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1207](https://github.com/apify/crawlee-python/issues/1207) - Use `PSS` instead of `RSS` to estimate children process memory usage on Linux ([#1210](https://github.com/apify/crawlee-python/pull/1210)) ([436032f](https://github.com/apify/crawlee-python/commit/436032f2de5f7d7fa1016033f1bb224159a8e6bf)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1206](https://github.com/apify/crawlee-python/issues/1206) - Do not raise an error when checking 'same-domain' if there is no hostname in the URL ([#1251](https://github.com/apify/crawlee-python/pull/1251)) ([a6c3aab](https://github.com/apify/crawlee-python/commit/a6c3aabf5f8341f215275077b6768a56118bc656)) by [@Mantisus](https://github.com/Mantisus) ## [0.6.10](https://github.com/apify/crawlee-python/releases/tag/v0.6.10) (2025-06-02) ### 🐛 Bug Fixes - Allow config change on `PlaywrightCrawler` ([#1186](https://github.com/apify/crawlee-python/pull/1186)) ([f17bf31](https://github.com/apify/crawlee-python/commit/f17bf31456b702631aa7e0c26d4f07fd5eb7d1bd)) by [@mylank](https://github.com/mylank), closes [#1185](https://github.com/apify/crawlee-python/issues/1185) - Add `payload` to `SendRequestFunction` to support `POST` request ([#1202](https://github.com/apify/crawlee-python/pull/1202)) ([e7449f2](https://github.com/apify/crawlee-python/commit/e7449f206c580cb8383a66e4c9ff5f67c5ce8409)) by [@Mantisus](https://github.com/Mantisus) - Fix match check for specified enqueue strategy for requests with redirect
([#1199](https://github.com/apify/crawlee-python/pull/1199)) ([d84c30c](https://github.com/apify/crawlee-python/commit/d84c30cbd7c94d6525d3b6e8e86b379050454c0e)) by [@Mantisus](https://github.com/Mantisus), closes [#1198](https://github.com/apify/crawlee-python/issues/1198) - Set `WindowsSelectorEventLoopPolicy` only for curl-impersonate template without `playwright` ([#1209](https://github.com/apify/crawlee-python/pull/1209)) ([f3b839f](https://github.com/apify/crawlee-python/commit/f3b839ffc2ccc1b889b6d5928f35f57b725e27f1)) by [@Mantisus](https://github.com/Mantisus), closes [#1204](https://github.com/apify/crawlee-python/issues/1204) - Add support for non-GET requests in `PlaywrightCrawler` ([#1208](https://github.com/apify/crawlee-python/pull/1208)) ([dbb9f44](https://github.com/apify/crawlee-python/commit/dbb9f44c71af15e1f86766fa0ba68281dd85fd9e)) by [@Mantisus](https://github.com/Mantisus), closes [#1201](https://github.com/apify/crawlee-python/issues/1201) - Respect `EnqueueLinksKwargs` for `extract_links` function ([#1213](https://github.com/apify/crawlee-python/pull/1213)) ([c9907d6](https://github.com/apify/crawlee-python/commit/c9907d6ff4c3a4a719b279cea77694c00a5a963d)) by [@Mantisus](https://github.com/Mantisus), closes [#1212](https://github.com/apify/crawlee-python/issues/1212) ## [0.6.9](https://github.com/apify/crawlee-python/releases/tag/v0.6.9) (2025-05-02) ### 🚀 Features - Add an internal `HttpClient` to be used in `send_request` for `PlaywrightCrawler` using `APIRequestContext` bound to the browser context ([#1134](https://github.com/apify/crawlee-python/pull/1134)) ([e794f49](https://github.com/apify/crawlee-python/commit/e794f4985d3a018ee76d634fe2b2c735fb450272)) by [@Mantisus](https://github.com/Mantisus), closes [#928](https://github.com/apify/crawlee-python/issues/928) - Make timeout error log cleaner ([#1170](https://github.com/apify/crawlee-python/pull/1170))
([78ea9d2](https://github.com/apify/crawlee-python/commit/78ea9d23e0b2d73286043b68393e462f636625c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1158](https://github.com/apify/crawlee-python/issues/1158) - Add `on_skipped_request` decorator, to process links skipped according to `robots.txt` rules ([#1166](https://github.com/apify/crawlee-python/pull/1166)) ([bd16f14](https://github.com/apify/crawlee-python/commit/bd16f14a834eebf485aea6b6a83f2b18bf16b504)) by [@Mantisus](https://github.com/Mantisus), closes [#1160](https://github.com/apify/crawlee-python/issues/1160) ### 🐛 Bug Fixes - Fix handling of errors without `args` in `_get_error_message` for `ErrorTracker` ([#1181](https://github.com/apify/crawlee-python/pull/1181)) ([21944d9](https://github.com/apify/crawlee-python/commit/21944d908b8404d2ad6c182104e7a8c27be12a6e)) by [@Mantisus](https://github.com/Mantisus), closes [#1179](https://github.com/apify/crawlee-python/issues/1179) - Temporarily add `certifi<=2025.1.31` dependency ([#1183](https://github.com/apify/crawlee-python/pull/1183)) ([25ff961](https://github.com/apify/crawlee-python/commit/25ff961990f9abc9d0673ba6573dfcf46dd6e53f)) by [@Pijukatel](https://github.com/Pijukatel) ## [0.6.8](https://github.com/apify/crawlee-python/releases/tag/v0.6.8) (2025-04-25) ### 🚀 Features - Handle unprocessed requests in `add_requests_batched` ([#1159](https://github.com/apify/crawlee-python/pull/1159)) ([7851175](https://github.com/apify/crawlee-python/commit/7851175304d63e455223b25853021cfbe15d68bd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#456](https://github.com/apify/crawlee-python/issues/456) - Add `respect_robots_txt_file` option ([#1162](https://github.com/apify/crawlee-python/pull/1162)) ([c23f365](https://github.com/apify/crawlee-python/commit/c23f365bfd263b4357edf82c14a7c6ff8dee45e4)) by [@Mantisus](https://github.com/Mantisus) ### 🐛 Bug Fixes - Update `UnprocessedRequest` to match actual data
([#1155](https://github.com/apify/crawlee-python/pull/1155)) ([a15a1f3](https://github.com/apify/crawlee-python/commit/a15a1f3528c7cbcf78d3bda5a236bcee1d492764)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1150](https://github.com/apify/crawlee-python/issues/1150) - Fix the order in which cookies are saved to the `SessionCookies` and the handler is executed for `PlaywrightCrawler` ([#1163](https://github.com/apify/crawlee-python/pull/1163)) ([82ff69a](https://github.com/apify/crawlee-python/commit/82ff69acd8e409f56be56dd061aae0f854ec25b4)) by [@Mantisus](https://github.com/Mantisus) - Call `failed_request_handler` for `SessionError` when session rotation count exceeds maximum ([#1147](https://github.com/apify/crawlee-python/pull/1147)) ([b3637b6](https://github.com/apify/crawlee-python/commit/b3637b68ec7eae9de7f1b923fa2f68885da64b90)) by [@Mantisus](https://github.com/Mantisus) ## [0.6.7](https://github.com/apify/crawlee-python/releases/tag/v0.6.7) (2025-04-17) ### 🚀 Features - Add `ErrorSnapshotter` to `ErrorTracker` ([#1125](https://github.com/apify/crawlee-python/pull/1125)) ([9666092](https://github.com/apify/crawlee-python/commit/9666092c6a59ac4d34409038d5476e5b6fb58a26)) by [@Pijukatel](https://github.com/Pijukatel), closes [#151](https://github.com/apify/crawlee-python/issues/151) ### 🐛 Bug Fixes - Improve validation errors in Crawlee CLI ([#1140](https://github.com/apify/crawlee-python/pull/1140)) ([f2d33df](https://github.com/apify/crawlee-python/commit/f2d33dff178a3d3079eb3807feb9645a25cc7a93)) by [@vdusek](https://github.com/vdusek), closes [#1138](https://github.com/apify/crawlee-python/issues/1138) - Disable logger propagation to prevent duplicate logs ([#1156](https://github.com/apify/crawlee-python/pull/1156)) ([0b3648d](https://github.com/apify/crawlee-python/commit/0b3648d5d399f0af23520f7fb8ee635d38b512c4)) by [@vdusek](https://github.com/vdusek) ## [0.6.6](https://github.com/apify/crawlee-python/releases/tag/v0.6.6) (2025-04-03) ### 🚀 
Features - Add `statistics_log_format` parameter to `BasicCrawler` ([#1061](https://github.com/apify/crawlee-python/pull/1061)) ([635ae4a](https://github.com/apify/crawlee-python/commit/635ae4a56c65e434783ca721f4164203f465abf0)) by [@Mantisus](https://github.com/Mantisus), closes [#700](https://github.com/apify/crawlee-python/issues/700) - Add Session binding capability via `session_id` in `Request` ([#1086](https://github.com/apify/crawlee-python/pull/1086)) ([cda7b31](https://github.com/apify/crawlee-python/commit/cda7b314ffda3104e4fd28a5e85c9e238d8866a4)) by [@Mantisus](https://github.com/Mantisus), closes [#1076](https://github.com/apify/crawlee-python/issues/1076) - Add `requests` argument to `EnqueueLinksFunction` ([#1024](https://github.com/apify/crawlee-python/pull/1024)) ([fc8444c](https://github.com/apify/crawlee-python/commit/fc8444c245c7607d3e378a4835d7d3355c4059be)) by [@Pijukatel](https://github.com/Pijukatel) ### 🐛 Bug Fixes - Add port for `same-origin` strategy check ([#1096](https://github.com/apify/crawlee-python/pull/1096)) ([9e24598](https://github.com/apify/crawlee-python/commit/9e245987d0aab0ba9c763689f12958b5a332db46)) by [@Mantisus](https://github.com/Mantisus) - Fix handling of loading empty `metadata` file for queue ([#1042](https://github.com/apify/crawlee-python/pull/1042)) ([b00876e](https://github.com/apify/crawlee-python/commit/b00876e8dcb30a12d3737bd31237da9daada46bb)) by [@Mantisus](https://github.com/Mantisus), closes [#1029](https://github.com/apify/crawlee-python/issues/1029) - Update favicon ([#1114](https://github.com/apify/crawlee-python/pull/1114)) ([eba900f](https://github.com/apify/crawlee-python/commit/eba900fc1e8d918c6fc464657c53004a3e0fe668)) by [@baldasseva](https://github.com/baldasseva) - **website:** Use correct image source ([#1115](https://github.com/apify/crawlee-python/pull/1115)) ([ee7806f](https://github.com/apify/crawlee-python/commit/ee7806fc2f9b7b590d9668cc9f86009a898a3da6)) by 
[@baldasseva](https://github.com/baldasseva) ## [0.6.5](https://github.com/apify/crawlee-python/releases/tag/v0.6.5) (2025-03-13) ### 🐛 Bug Fixes - Update to `browserforge` workaround ([#1075](https://github.com/apify/crawlee-python/pull/1075)) ([2378cf8](https://github.com/apify/crawlee-python/commit/2378cf84ab1ed06473049a9ddfca2ba6f166306d)) by [@Pijukatel](https://github.com/Pijukatel) ## [0.6.4](https://github.com/apify/crawlee-python/releases/tag/v0.6.4) (2025-03-12) ### 🐛 Bug Fixes - Add a thread check before setting `add_signal_handler` ([#1068](https://github.com/apify/crawlee-python/pull/1068)) ([6983bda](https://github.com/apify/crawlee-python/commit/6983bda2dbc202b3ecbf7db62b11deee007b4b5f)) by [@Mantisus](https://github.com/Mantisus) - Temporary workaround for `browserforge` import time code execution ([#1073](https://github.com/apify/crawlee-python/pull/1073)) ([17d914f](https://github.com/apify/crawlee-python/commit/17d914f78242078f88c07d686a567d1091255eb1)) by [@Pijukatel](https://github.com/Pijukatel) ## [0.6.3](https://github.com/apify/crawlee-python/releases/tag/v0.6.3) (2025-03-07) ### 🚀 Features - Add project template with `uv` package manager ([#1057](https://github.com/apify/crawlee-python/pull/1057)) ([9ec06e5](https://github.com/apify/crawlee-python/commit/9ec06e58032aa11af46ac9cd1ea7bb002a18eb13)) by [@Mantisus](https://github.com/Mantisus), closes [#1053](https://github.com/apify/crawlee-python/issues/1053) - Use fingerprint generator in `PlaywrightCrawler` by default ([#1060](https://github.com/apify/crawlee-python/pull/1060)) ([09cec53](https://github.com/apify/crawlee-python/commit/09cec532911043623eeb475aa8552c70bd94f8b7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1054](https://github.com/apify/crawlee-python/issues/1054) ### 🐛 Bug Fixes - Update project templates for Poetry v2.x compatibility ([#1049](https://github.com/apify/crawlee-python/pull/1049))
([96dc2f9](https://github.com/apify/crawlee-python/commit/96dc2f9b53b0a2d0f1d0c73d10e5244114e849ff)) by [@Mantisus](https://github.com/Mantisus), closes [#954](https://github.com/apify/crawlee-python/issues/954) - Remove tmp folder for PlaywrightCrawler in non-headless mode ([#1046](https://github.com/apify/crawlee-python/pull/1046)) ([3a7f444](https://github.com/apify/crawlee-python/commit/3a7f444fb7ee9a0ab1867c8c9586b15aab1e7df2)) by [@Mantisus](https://github.com/Mantisus) ## [0.6.2](https://github.com/apify/crawlee-python/releases/tag/v0.6.2) (2025-03-05) ### 🚀 Features - Extend ErrorTracker with error grouping ([#1014](https://github.com/apify/crawlee-python/pull/1014)) ([561de5c](https://github.com/apify/crawlee-python/commit/561de5c6b76af386cad5ac804a22fb7af227e460)) by [@Pijukatel](https://github.com/Pijukatel) ## [0.6.1](https://github.com/apify/crawlee-python/releases/tag/v0.6.1) (2025-03-03) ### 🐛 Bug Fixes - Add `browserforge` to mandatory dependencies ([#1044](https://github.com/apify/crawlee-python/pull/1044)) ([ddfbde8](https://github.com/apify/crawlee-python/commit/ddfbde89dd3e3cbef0f3954936f4a41c3d6df909)) by [@Pijukatel](https://github.com/Pijukatel) ## [0.6.0](https://github.com/apify/crawlee-python/releases/tag/v0.6.0) (2025-03-03) - Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v06) for more details. - Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v06) to ensure a smooth update. 
### 🚀 Features - Integrate browserforge fingerprints ([#829](https://github.com/apify/crawlee-python/pull/829)) ([2b156b4](https://github.com/apify/crawlee-python/commit/2b156b4ba688f9111195422e6058dff30eb1f782)) by [@Pijukatel](https://github.com/Pijukatel), closes [#549](https://github.com/apify/crawlee-python/issues/549) - Add AdaptivePlaywrightCrawler ([#872](https://github.com/apify/crawlee-python/pull/872)) ([5ba70b6](https://github.com/apify/crawlee-python/commit/5ba70b6e846a908a55db461ab0c85e3946f2bc7c)) by [@Pijukatel](https://github.com/Pijukatel) - Implement `_snapshot_client` for `Snapshotter` ([#957](https://github.com/apify/crawlee-python/pull/957)) ([ba4d384](https://github.com/apify/crawlee-python/commit/ba4d384228d030c20c580ed01fae0e78af3a9543)) by [@Mantisus](https://github.com/Mantisus), closes [#60](https://github.com/apify/crawlee-python/issues/60) - Add adaptive context helpers ([#964](https://github.com/apify/crawlee-python/pull/964)) ([e248f17](https://github.com/apify/crawlee-python/commit/e248f17fad7b6d1fc5e23a0a1e961db66068a411)) by [@Pijukatel](https://github.com/Pijukatel), closes [#249](https://github.com/apify/crawlee-python/issues/249) - [**breaking**] Enable additional status codes arguments to PlaywrightCrawler ([#959](https://github.com/apify/crawlee-python/pull/959)) ([87cf446](https://github.com/apify/crawlee-python/commit/87cf446a7cbaa900e28abd93d4c8a2e0d1747059)) by [@Pijukatel](https://github.com/Pijukatel), closes [#953](https://github.com/apify/crawlee-python/issues/953) - Replace `HeaderGenerator` implementation by `browserforge` implementation ([#960](https://github.com/apify/crawlee-python/pull/960)) ([c2f8c93](https://github.com/apify/crawlee-python/commit/c2f8c93a4ad57c4ede354545bf925bf3707899c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#937](https://github.com/apify/crawlee-python/issues/937) ### 🐛 Bug Fixes - Fix playwright template and dockerfile 
([#972](https://github.com/apify/crawlee-python/pull/972)) ([c33b34d](https://github.com/apify/crawlee-python/commit/c33b34dd6e253b1261c700857bb5c4bbec6d5c14)) by [@janbuchar](https://github.com/janbuchar), closes [#969](https://github.com/apify/crawlee-python/issues/969) - Fix installing dependencies via pip in project template ([#977](https://github.com/apify/crawlee-python/pull/977)) ([1e3b8eb](https://github.com/apify/crawlee-python/commit/1e3b8eb1cdb57bf2f7256e8ae5f0706b0afc3ba9)) by [@janbuchar](https://github.com/janbuchar), closes [#975](https://github.com/apify/crawlee-python/issues/975) - Fix default migration storage ([#1018](https://github.com/apify/crawlee-python/pull/1018)) ([6a0c4d9](https://github.com/apify/crawlee-python/commit/6a0c4d94593f7e94f24eee8a97fc7bc83c4d02e1)) by [@Pijukatel](https://github.com/Pijukatel), closes [#991](https://github.com/apify/crawlee-python/issues/991) - Fix logger name for http based loggers ([#1023](https://github.com/apify/crawlee-python/pull/1023)) ([bfb3944](https://github.com/apify/crawlee-python/commit/bfb394446351c8f3b9879a9905607f7c929f2542)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1021](https://github.com/apify/crawlee-python/issues/1021) - Remove allow_redirects override in CurlImpersonateHttpClient ([#1017](https://github.com/apify/crawlee-python/pull/1017)) ([01d855a](https://github.com/apify/crawlee-python/commit/01d855a43389a6b4b16ec74767624fa7eb13151f)) by [@2tunnels](https://github.com/2tunnels), closes [#1016](https://github.com/apify/crawlee-python/issues/1016) - Remove follow_redirects override in HttpxHttpClient ([#1015](https://github.com/apify/crawlee-python/pull/1015)) ([88afda3](https://github.com/apify/crawlee-python/commit/88afda33e77be84bc91ad1239740b8e661bef2a2)) by [@2tunnels](https://github.com/2tunnels), closes [#1013](https://github.com/apify/crawlee-python/issues/1013) - Fix flaky test_common_headers_and_user_agent 
([#1030](https://github.com/apify/crawlee-python/pull/1030)) ([58aa70e](https://github.com/apify/crawlee-python/commit/58aa70e9600d313b823a1376ab9b36fb416c1c4a)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1027](https://github.com/apify/crawlee-python/issues/1027) ### 🚜 Refactor - [**breaking**] Remove unused config properties ([#978](https://github.com/apify/crawlee-python/pull/978)) ([4b7fe29](https://github.com/apify/crawlee-python/commit/4b7fe2930540a5fbd753135e3ce29dc80f80c543)) by [@vdusek](https://github.com/vdusek) - [**breaking**] Remove Base prefix from abstract class names ([#980](https://github.com/apify/crawlee-python/pull/980)) ([8ccb5d4](https://github.com/apify/crawlee-python/commit/8ccb5d41a1dae9b02088b433266ac89bd089561a)) by [@vdusek](https://github.com/vdusek) - [**breaking**] Change default `incognito context` to `persistent context` for `Playwright` ([#985](https://github.com/apify/crawlee-python/pull/985)) ([f01520d](https://github.com/apify/crawlee-python/commit/f01520d22b31af9f0f13ca162cc47e6aa9744c6d)) by [@Mantisus](https://github.com/Mantisus), closes [#721](https://github.com/apify/crawlee-python/issues/721), [#963](https://github.com/apify/crawlee-python/issues/963) - [**breaking**] Change `Session` cookies from `dict` to `SessionCookies` with `CookieJar` ([#984](https://github.com/apify/crawlee-python/pull/984)) ([6523b3a](https://github.com/apify/crawlee-python/commit/6523b3ade0eed53b0363ddce250c557024339b5e)) by [@Mantisus](https://github.com/Mantisus), closes [#710](https://github.com/apify/crawlee-python/issues/710), [#933](https://github.com/apify/crawlee-python/issues/933) - [**breaking**] Replace enum with literal for `EnqueueStrategy` ([#1019](https://github.com/apify/crawlee-python/pull/1019)) ([d2481ef](https://github.com/apify/crawlee-python/commit/d2481ef71d3539979c5b1129387e72b4126fe366)) by [@vdusek](https://github.com/vdusek) - [**breaking**] Update status code handling
([#1028](https://github.com/apify/crawlee-python/pull/1028)) ([6b59471](https://github.com/apify/crawlee-python/commit/6b5947125e63abdfff481b0669398fc9a7293e55)) by [@Mantisus](https://github.com/Mantisus), closes [#830](https://github.com/apify/crawlee-python/issues/830), [#998](https://github.com/apify/crawlee-python/issues/998) - [**breaking**] Move `cli` dependencies to optional dependencies ([#1011](https://github.com/apify/crawlee-python/pull/1011)) ([4382959](https://github.com/apify/crawlee-python/commit/43829590c6b4efd1dc9b833373f82a842a0a1a8e)) by [@Mantisus](https://github.com/Mantisus), closes [#703](https://github.com/apify/crawlee-python/issues/703), [#1010](https://github.com/apify/crawlee-python/issues/1010) ## [0.5.4](https://github.com/apify/crawlee-python/releases/tag/v0.5.4) (2025-02-05) ### 🚀 Features - Add support for `use_incognito_pages` in `browser_launch_options` of `PlaywrightCrawler` ([#941](https://github.com/apify/crawlee-python/pull/941)) ([eae3a33](https://github.com/apify/crawlee-python/commit/eae3a33a1842ebbdac5f9c51866a4be4bcf1ae2c)) by [@Mantisus](https://github.com/Mantisus) ### 🐛 Bug Fixes - Fix session management with retire ([#947](https://github.com/apify/crawlee-python/pull/947)) ([caee03f](https://github.com/apify/crawlee-python/commit/caee03fe3a43cc1d7a8d3f9e19b42df1bdb1c0aa)) by [@Mantisus](https://github.com/Mantisus) - Fix templates - poetry-plugin-export version and camoufox template name ([#952](https://github.com/apify/crawlee-python/pull/952)) ([7addea6](https://github.com/apify/crawlee-python/commit/7addea6605359cceba208e16ec9131724bdb3e9b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#951](https://github.com/apify/crawlee-python/issues/951) - Fix conversion of relative links to absolute in `enqueue_links` for response with redirect ([#956](https://github.com/apify/crawlee-python/pull/956)) ([694102e](https://github.com/apify/crawlee-python/commit/694102e163bb9021a4830d2545d153f6f8f3de90)) by
[@Mantisus](https://github.com/Mantisus), closes [#955](https://github.com/apify/crawlee-python/issues/955) - Fix `CurlImpersonateHttpClient` cookies handler ([#946](https://github.com/apify/crawlee-python/pull/946)) ([ed415c4](https://github.com/apify/crawlee-python/commit/ed415c433da2a40b0ee62534f0730d0737e991b8)) by [@Mantisus](https://github.com/Mantisus) ## [0.5.3](https://github.com/apify/crawlee-python/releases/tag/v0.5.3) (2025-01-31) ### 🚀 Features - Add keep_alive flag to `crawler.__init__` ([#921](https://github.com/apify/crawlee-python/pull/921)) ([7a82d0c](https://github.com/apify/crawlee-python/commit/7a82d0cbdbe6c8739d4bf6a9b014e31f07e5a520)) by [@Pijukatel](https://github.com/Pijukatel), closes [#891](https://github.com/apify/crawlee-python/issues/891) - Add `block_requests` helper for `PlaywrightCrawler` ([#919](https://github.com/apify/crawlee-python/pull/919)) ([1030459](https://github.com/apify/crawlee-python/commit/103045994908f80cffee5ccfff91a040e0042f48)) by [@Mantisus](https://github.com/Mantisus), closes [#848](https://github.com/apify/crawlee-python/issues/848) - Return request handlers from decorator methods to allow further decoration ([#934](https://github.com/apify/crawlee-python/pull/934)) ([9ec0aae](https://github.com/apify/crawlee-python/commit/9ec0aae54e2a340d29c893567ae80bf8bd4510a9)) by [@mylank](https://github.com/mylank) - Add `transform_request_function` for `enqueue_links` ([#923](https://github.com/apify/crawlee-python/pull/923)) ([6b15957](https://github.com/apify/crawlee-python/commit/6b159578f612251e6d2253a72b6521430f4f9b09)) by [@Mantisus](https://github.com/Mantisus), closes [#894](https://github.com/apify/crawlee-python/issues/894) - Add `time_remaining_secs` property to `MIGRATING` event data ([#940](https://github.com/apify/crawlee-python/pull/940)) ([b44501b](https://github.com/apify/crawlee-python/commit/b44501bcadbd12673a8f47aa92f12da8e404f60b)) by [@fnesveda](https://github.com/fnesveda) - Add 
LogisticalRegressionPredictor - rendering type predictor for adaptive crawling ([#930](https://github.com/apify/crawlee-python/pull/930)) ([8440499](https://github.com/apify/crawlee-python/commit/8440499468db115a4c478e9bcdb692554d1655c5)) by [@Pijukatel](https://github.com/Pijukatel) ### 🐛 Bug Fixes - Fix crawler not retrying user handler if there was timeout in the handler ([#909](https://github.com/apify/crawlee-python/pull/909)) ([f4090ef](https://github.com/apify/crawlee-python/commit/f4090ef0ea0281d53dab16a77ceea2ef6ac43d76)) by [@Pijukatel](https://github.com/Pijukatel), closes [#907](https://github.com/apify/crawlee-python/issues/907) - Optimize memory consumption for `HttpxHttpClient`, fix proxy handling ([#905](https://github.com/apify/crawlee-python/pull/905)) ([d7ad480](https://github.com/apify/crawlee-python/commit/d7ad480834263ae0480049cb0a8db4dfc3946d8d)) by [@Mantisus](https://github.com/Mantisus), closes [#895](https://github.com/apify/crawlee-python/issues/895) - Fix `BrowserPool` and `PlaywrightBrowserPlugin` closure ([#932](https://github.com/apify/crawlee-python/pull/932)) ([997543d](https://github.com/apify/crawlee-python/commit/997543d2fa5afba49929f4407ee95d7a4933a50d)) by [@Mantisus](https://github.com/Mantisus) ## [0.5.2](https://github.com/apify/crawlee-python/releases/tag/v0.5.2) (2025-01-17) ### 🐛 Bug Fixes - Avoid `use_state` race conditions. 
Remove key argument to `use_state` ([#868](https://github.com/apify/crawlee-python/pull/868)) ([000b976](https://github.com/apify/crawlee-python/commit/000b9761211502d86a893a31e3ca21998a6e3b99)) by [@Pijukatel](https://github.com/Pijukatel), closes [#856](https://github.com/apify/crawlee-python/issues/856) - Restore proxy functionality for PlaywrightCrawler broken in v0.5 ([#889](https://github.com/apify/crawlee-python/pull/889)) ([908c944](https://github.com/apify/crawlee-python/commit/908c944ff9b1fc8ed7eb35f0078a1de71e34d5c5)) by [@Mantisus](https://github.com/Mantisus), closes [#887](https://github.com/apify/crawlee-python/issues/887) - Fix the usage of Configuration ([#899](https://github.com/apify/crawlee-python/pull/899)) ([0f1cf6f](https://github.com/apify/crawlee-python/commit/0f1cf6f0b52c92ca4e465a2a01f8111cd9ab42ec)) by [@vdusek](https://github.com/vdusek), closes [#670](https://github.com/apify/crawlee-python/issues/670) ## [0.5.1](https://github.com/apify/crawlee-python/releases/tag/v0.5.1) (2025-01-07) ### 🐛 Bug Fixes - Make result of RequestList.is_empty independent of fetch_next_request calls ([#876](https://github.com/apify/crawlee-python/pull/876)) ([d50249e](https://github.com/apify/crawlee-python/commit/d50249ecbfe2a04f508fcdc3261e050349bd0da2)) by [@janbuchar](https://github.com/janbuchar) ## [0.5.0](https://github.com/apify/crawlee-python/releases/tag/v0.5.0) (2025-01-02) - Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v05) for more details. - Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v05) to ensure a smooth update. 
### 🚀 Features - Add possibility to use None as no proxy in tiered proxies ([#760](https://github.com/apify/crawlee-python/pull/760)) ([0fbd017](https://github.com/apify/crawlee-python/commit/0fbd01723b9fe2e3410e0f358cab2f22848b08d0)) by [@Pijukatel](https://github.com/Pijukatel), closes [#687](https://github.com/apify/crawlee-python/issues/687) - Add `use_state` context method ([#682](https://github.com/apify/crawlee-python/pull/682)) ([868b41e](https://github.com/apify/crawlee-python/commit/868b41ebd4c8003fa60ab07887577d0fb85b6ecc)) by [@Mantisus](https://github.com/Mantisus), closes [#191](https://github.com/apify/crawlee-python/issues/191) - Add pre-navigation hooks router to AbstractHttpCrawler ([#791](https://github.com/apify/crawlee-python/pull/791)) ([0f23205](https://github.com/apify/crawlee-python/commit/0f23205923065074c522b3de9d47218a204dfa78)) by [@Pijukatel](https://github.com/Pijukatel), closes [#635](https://github.com/apify/crawlee-python/issues/635) - Add example of how to integrate Camoufox into PlaywrightCrawler ([#789](https://github.com/apify/crawlee-python/pull/789)) ([246cfc4](https://github.com/apify/crawlee-python/commit/246cfc4ebc8bce1d15e1dddd62d652bd65869328)) by [@Pijukatel](https://github.com/Pijukatel), closes [#684](https://github.com/apify/crawlee-python/issues/684) - Expose event types, improve on/emit signature, allow parameterless listeners ([#800](https://github.com/apify/crawlee-python/pull/800)) ([c102c4c](https://github.com/apify/crawlee-python/commit/c102c4c894a00b09adfd5f4911563c81cf3e98b4)) by [@janbuchar](https://github.com/janbuchar), closes [#561](https://github.com/apify/crawlee-python/issues/561) - Add stop method to BasicCrawler ([#807](https://github.com/apify/crawlee-python/pull/807)) ([6d01af4](https://github.com/apify/crawlee-python/commit/6d01af4231d02b4349a8719f5ed18d812843fde5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#651](https://github.com/apify/crawlee-python/issues/651) - Add 
`html_to_text` helper function ([#792](https://github.com/apify/crawlee-python/pull/792)) ([2b9d970](https://github.com/apify/crawlee-python/commit/2b9d97009dd653870681bb3cadbb46b214ff1a73)) by [@Pijukatel](https://github.com/Pijukatel), closes [#659](https://github.com/apify/crawlee-python/issues/659) - [**breaking**] Implement `RequestManagerTandem`, remove `add_request` from `RequestList`, accept any iterable in `RequestList` constructor ([#777](https://github.com/apify/crawlee-python/pull/777)) ([4172652](https://github.com/apify/crawlee-python/commit/4172652079e5e91190c1cc5e2138fd41a7c84a6b)) by [@janbuchar](https://github.com/janbuchar) ### 🐛 Bug Fixes - Fix circular import in `KeyValueStore` ([#805](https://github.com/apify/crawlee-python/pull/805)) ([8bdf49d](https://github.com/apify/crawlee-python/commit/8bdf49d1cb2a94b66f69fd1b77063a4113517fae)) by [@Mantisus](https://github.com/Mantisus), closes [#804](https://github.com/apify/crawlee-python/issues/804) - [**breaking**] Refactor service usage to rely on `service_locator` ([#691](https://github.com/apify/crawlee-python/pull/691)) ([1d31c6c](https://github.com/apify/crawlee-python/commit/1d31c6c7e7a9ec7cee5b2de900568d9f77db65ba)) by [@vdusek](https://github.com/vdusek), closes [#369](https://github.com/apify/crawlee-python/issues/369), [#539](https://github.com/apify/crawlee-python/issues/539), [#699](https://github.com/apify/crawlee-python/issues/699) - Pass `verify` in httpx client ([#802](https://github.com/apify/crawlee-python/pull/802)) ([074d083](https://github.com/apify/crawlee-python/commit/074d0836b55e52f13726e7cd1c21602623fda4fc)) by [@Mantisus](https://github.com/Mantisus), closes [#798](https://github.com/apify/crawlee-python/issues/798) - Fix `page_options` for `PlaywrightBrowserPlugin` ([#796](https://github.com/apify/crawlee-python/pull/796)) ([bd3bdd4](https://github.com/apify/crawlee-python/commit/bd3bdd4046c2ddea62feb77322033cad50f382dd)) by [@Mantisus](https://github.com/Mantisus), 
closes [#755](https://github.com/apify/crawlee-python/issues/755) - Fix event migrating handler in `RequestQueue` ([#825](https://github.com/apify/crawlee-python/pull/825)) ([fd6663f](https://github.com/apify/crawlee-python/commit/fd6663f903bc7eecd1000da89e06197b43dfb962)) by [@Mantisus](https://github.com/Mantisus), closes [#815](https://github.com/apify/crawlee-python/issues/815) - Respect user configuration for work with status codes ([#812](https://github.com/apify/crawlee-python/pull/812)) ([8daf4bd](https://github.com/apify/crawlee-python/commit/8daf4bd49c1b09a0924f827daedebf7600ac609b)) by [@Mantisus](https://github.com/Mantisus), closes [#708](https://github.com/apify/crawlee-python/issues/708), [#756](https://github.com/apify/crawlee-python/issues/756) - `abort-on-error` for successive runs ([#834](https://github.com/apify/crawlee-python/pull/834)) ([0cea673](https://github.com/apify/crawlee-python/commit/0cea67387bf366800b447de784af580159b199ee)) by [@Mantisus](https://github.com/Mantisus) - Relax ServiceLocator restrictions ([#837](https://github.com/apify/crawlee-python/pull/837)) ([aa3667f](https://github.com/apify/crawlee-python/commit/aa3667f344d78945df3eca77431e1409f43f8bb5)) by [@janbuchar](https://github.com/janbuchar), closes [#806](https://github.com/apify/crawlee-python/issues/806) - Fix typo in exports ([#841](https://github.com/apify/crawlee-python/pull/841)) ([8fa6ac9](https://github.com/apify/crawlee-python/commit/8fa6ac994fe4f3f6430cb796a0c6a732c93c672b)) by [@janbuchar](https://github.com/janbuchar) ### 🚜 Refactor - [**breaking**] Refactor HttpCrawler, BeautifulSoupCrawler, ParselCrawler inheritance ([#746](https://github.com/apify/crawlee-python/pull/746)) ([9d3c269](https://github.com/apify/crawlee-python/commit/9d3c2697c91ce93028ca86a91d85d465d36c1ad7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#350](https://github.com/apify/crawlee-python/issues/350) - [**breaking**] Remove `json_` and `order_no` from `Request` 
([#788](https://github.com/apify/crawlee-python/pull/788)) ([5381d13](https://github.com/apify/crawlee-python/commit/5381d13aa51a757fc1906f400788555df090a1af)) by [@Mantisus](https://github.com/Mantisus), closes [#94](https://github.com/apify/crawlee-python/issues/94) - [**breaking**] Rename PwPreNavContext to PwPreNavCrawlingContext ([#827](https://github.com/apify/crawlee-python/pull/827)) ([84b61a3](https://github.com/apify/crawlee-python/commit/84b61a3d25bee42faed4e81cd156663f251b3d3d)) by [@vdusek](https://github.com/vdusek) - [**breaking**] Rename PlaywrightCrawler kwargs: browser_options, page_options ([#831](https://github.com/apify/crawlee-python/pull/831)) ([ffc6048](https://github.com/apify/crawlee-python/commit/ffc6048e9dc5c5e862271fa50c48bb0fb6f0a18f)) by [@Pijukatel](https://github.com/Pijukatel) - [**breaking**] Update the crawlers & storage clients structure ([#828](https://github.com/apify/crawlee-python/pull/828)) ([0ba04d1](https://github.com/apify/crawlee-python/commit/0ba04d1633881043928a408678932c46fb90e21f)) by [@vdusek](https://github.com/vdusek), closes [#764](https://github.com/apify/crawlee-python/issues/764) ## [0.4.5](https://github.com/apify/crawlee-python/releases/tag/v0.4.5) (2024-12-06) ### 🚀 Features - Improve project bootstrapping ([#538](https://github.com/apify/crawlee-python/pull/538)) ([367899c](https://github.com/apify/crawlee-python/commit/367899cbad5021674f6e41c4dd7eb2266fe043aa)) by [@janbuchar](https://github.com/janbuchar), closes [#317](https://github.com/apify/crawlee-python/issues/317), [#414](https://github.com/apify/crawlee-python/issues/414), [#495](https://github.com/apify/crawlee-python/issues/495), [#511](https://github.com/apify/crawlee-python/issues/511) ### 🐛 Bug Fixes - Add upper bound of HTTPX version ([#775](https://github.com/apify/crawlee-python/pull/775)) ([b59e34d](https://github.com/apify/crawlee-python/commit/b59e34d6301e26825d88608152ffb337ef602a9f)) by [@vdusek](https://github.com/vdusek) - Fix 
incorrect use of desired concurrency ratio ([#780](https://github.com/apify/crawlee-python/pull/780)) ([d1f8bfb](https://github.com/apify/crawlee-python/commit/d1f8bfb68ce2ef13b550ce415a3689858112a4c7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#759](https://github.com/apify/crawlee-python/issues/759) - Remove pydantic constraint `<2.10.0` and update timedelta validator, serializer type hints ([#757](https://github.com/apify/crawlee-python/pull/757)) ([c0050c0](https://github.com/apify/crawlee-python/commit/c0050c0ee76e5deb28f174ecf276b0e6abf68b9d)) by [@Pijukatel](https://github.com/Pijukatel) ## [0.4.4](https://github.com/apify/crawlee-python/releases/tag/v0.4.4) (2024-11-29) ### 🚀 Features - Expose browser_options and page_options to PlaywrightCrawler ([#730](https://github.com/apify/crawlee-python/pull/730)) ([dbe85b9](https://github.com/apify/crawlee-python/commit/dbe85b90e59def281cfc6617a0eb869a4adf2fc0)) by [@vdusek](https://github.com/vdusek), closes [#719](https://github.com/apify/crawlee-python/issues/719) - Add `abort_on_error` property ([#731](https://github.com/apify/crawlee-python/pull/731)) ([6dae03a](https://github.com/apify/crawlee-python/commit/6dae03a68a2d23c68c78d8d44611d43e40eb9404)) by [@Mantisus](https://github.com/Mantisus), closes [#704](https://github.com/apify/crawlee-python/issues/704) ### 🐛 Bug Fixes - Fix init of context managers and context handling in `BasicCrawler` ([#714](https://github.com/apify/crawlee-python/pull/714)) ([486fe6d](https://github.com/apify/crawlee-python/commit/486fe6d6cd56cb560ab51a32ec0286d9e32267cb)) by [@vdusek](https://github.com/vdusek) ## [0.4.3](https://github.com/apify/crawlee-python/releases/tag/v0.4.3) (2024-11-21) ### 🐛 Bug Fixes - Pydantic 2.10.0 issues ([#716](https://github.com/apify/crawlee-python/pull/716)) ([8d8b3fc](https://github.com/apify/crawlee-python/commit/8d8b3fcff8be10edf5351f5324c7ba112c1d2ba0)) by [@Pijukatel](https://github.com/Pijukatel) ##
[0.4.2](https://github.com/apify/crawlee-python/releases/tag/v0.4.2) (2024-11-20) ### 🐛 Bug Fixes - Respect custom HTTP headers in `PlaywrightCrawler` ([#685](https://github.com/apify/crawlee-python/pull/685)) ([a84125f](https://github.com/apify/crawlee-python/commit/a84125f031347426de44b8f015c87882c8f96f72)) by [@Mantisus](https://github.com/Mantisus) - Fix serialization payload in Request. Fix Docs for Post Request ([#683](https://github.com/apify/crawlee-python/pull/683)) ([e8b4d2d](https://github.com/apify/crawlee-python/commit/e8b4d2d4989fd9967403b828c914cb7ae2ef9b8b)) by [@Mantisus](https://github.com/Mantisus), closes [#668](https://github.com/apify/crawlee-python/issues/668) - Accept string payload in the Request constructor ([#697](https://github.com/apify/crawlee-python/pull/697)) ([19f5add](https://github.com/apify/crawlee-python/commit/19f5addc0223d68389eea47864830c709335ab6e)) by [@vdusek](https://github.com/vdusek) - Fix snapshots handling ([#692](https://github.com/apify/crawlee-python/pull/692)) ([4016c0d](https://github.com/apify/crawlee-python/commit/4016c0d8121a8950ab1df22188eac838a011c39f)) by [@Pijukatel](https://github.com/Pijukatel) ## [0.4.1](https://github.com/apify/crawlee-python/releases/tag/v0.4.1) (2024-11-11) ### 🚀 Features - Add `max_crawl_depth` option to `BasicCrawler` ([#637](https://github.com/apify/crawlee-python/pull/637)) ([77deaa9](https://github.com/apify/crawlee-python/commit/77deaa964e2c1e74af1c5117a13d8d8257f0e27e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#460](https://github.com/apify/crawlee-python/issues/460) - Add BeautifulSoupParser type alias ([#674](https://github.com/apify/crawlee-python/pull/674)) ([b2cf88f](https://github.com/apify/crawlee-python/commit/b2cf88ffea8d75808c9210850a03fcc70b0b9e3d)) by [@Pijukatel](https://github.com/Pijukatel) ### 🐛 Bug Fixes - Fix total_size usage in memory size monitoring ([#661](https://github.com/apify/crawlee-python/pull/661)) 
([c2a3239](https://github.com/apify/crawlee-python/commit/c2a32397eecd5cc7f412c2af7269b004a8b2eaf2)) by [@janbuchar](https://github.com/janbuchar) - Add HttpHeaders to module exports ([#664](https://github.com/apify/crawlee-python/pull/664)) ([f0c5ca7](https://github.com/apify/crawlee-python/commit/f0c5ca717d9f9e304d375da2c23552c26ca870da)) by [@vdusek](https://github.com/vdusek), closes [#663](https://github.com/apify/crawlee-python/issues/663) - Fix unhandled ValueError in request handler result processing ([#666](https://github.com/apify/crawlee-python/pull/666)) ([0a99d7f](https://github.com/apify/crawlee-python/commit/0a99d7f693245eb9a065016fb6f2d268f6956805)) by [@janbuchar](https://github.com/janbuchar) - Fix BaseDatasetClient.iter_items type hints ([#680](https://github.com/apify/crawlee-python/pull/680)) ([a968b1b](https://github.com/apify/crawlee-python/commit/a968b1be6fceb56676b0198a044c8fceac7c92a6)) by [@Pijukatel](https://github.com/Pijukatel) ## [0.4.0](https://github.com/apify/crawlee-python/releases/tag/v0.4.0) (2024-11-01) - Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v04) to ensure a smooth update. 
### 🚀 Features - [**breaking**] Add headers in unique key computation ([#609](https://github.com/apify/crawlee-python/pull/609)) ([6c4746f](https://github.com/apify/crawlee-python/commit/6c4746fa8ff86952a812b32a1d70dc910e76b43e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#548](https://github.com/apify/crawlee-python/issues/548) - Add `pre_navigation_hooks` to `PlaywrightCrawler` ([#631](https://github.com/apify/crawlee-python/pull/631)) ([5dd5b60](https://github.com/apify/crawlee-python/commit/5dd5b60e2a44d5bd3748b613790e1bee3232d6f3)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#427](https://github.com/apify/crawlee-python/issues/427) - Add `always_enqueue` option to bypass URL deduplication ([#621](https://github.com/apify/crawlee-python/pull/621)) ([4e59fa4](https://github.com/apify/crawlee-python/commit/4e59fa46daaec05e52262cf62c26f28ddcd772af)) by [@Rutam21](https://github.com/Rutam21), closes [#547](https://github.com/apify/crawlee-python/issues/547) - Split and add extra configuration to export_data method ([#580](https://github.com/apify/crawlee-python/pull/580)) ([6751635](https://github.com/apify/crawlee-python/commit/6751635e1785a4a27f60092c82f5dd0c40193d52)) by [@deshansh](https://github.com/deshansh), closes [#526](https://github.com/apify/crawlee-python/issues/526) ### 🐛 Bug Fixes - Use strip in headers normalization ([#614](https://github.com/apify/crawlee-python/pull/614)) ([a15b21e](https://github.com/apify/crawlee-python/commit/a15b21e51deaf2b67738f95bc2b15c1c16d1775f)) by [@vdusek](https://github.com/vdusek) - [**breaking**] Merge payload and data fields of Request ([#542](https://github.com/apify/crawlee-python/pull/542)) ([d06fcef](https://github.com/apify/crawlee-python/commit/d06fcef3fee44616ded5f587b9c7313b82a57cc7)) by [@vdusek](https://github.com/vdusek), closes [#560](https://github.com/apify/crawlee-python/issues/560) - Default ProxyInfo port if httpx.URL port is None 
([#619](https://github.com/apify/crawlee-python/pull/619)) ([8107a6f](https://github.com/apify/crawlee-python/commit/8107a6f97e8f16a330e7d02d3fc6ea34c5f78d77)) by [@steffansafey](https://github.com/steffansafey), closes [#618](https://github.com/apify/crawlee-python/issues/618) ### ⚙️ Miscellaneous Tasks - [**breaking**] Remove Request.query_params field ([#639](https://github.com/apify/crawlee-python/pull/639)) ([6ec0ec4](https://github.com/apify/crawlee-python/commit/6ec0ec4fa0cef9b8bf893e70d99f068675c9c54c)) by [@vdusek](https://github.com/vdusek), closes [#615](https://github.com/apify/crawlee-python/issues/615) ## [0.3.9](https://github.com/apify/crawlee-python/releases/tag/v0.3.9) (2024-10-23) ### 🚀 Features - Key-value store context helpers ([#584](https://github.com/apify/crawlee-python/pull/584)) ([fc15622](https://github.com/apify/crawlee-python/commit/fc156222c3747fc4cc7bd7666a21769845c7d0d5)) by [@janbuchar](https://github.com/janbuchar) - Added get_public_url method to KeyValueStore ([#572](https://github.com/apify/crawlee-python/pull/572)) ([3a4ba8f](https://github.com/apify/crawlee-python/commit/3a4ba8f459903b6288aec40de2c3ca862e36abec)) by [@akshay11298](https://github.com/akshay11298), closes [#514](https://github.com/apify/crawlee-python/issues/514) ### 🐛 Bug Fixes - Workaround for JSON value typing problems ([#581](https://github.com/apify/crawlee-python/pull/581)) ([403496a](https://github.com/apify/crawlee-python/commit/403496a53c12810351139a6e073238143ecc5930)) by [@janbuchar](https://github.com/janbuchar), closes [#563](https://github.com/apify/crawlee-python/issues/563) ## [0.3.8](https://github.com/apify/crawlee-python/releases/tag/v0.3.8) (2024-10-02) ### 🚀 Features - Mask Playwright's "headless" headers ([#545](https://github.com/apify/crawlee-python/pull/545)) ([d1445e4](https://github.com/apify/crawlee-python/commit/d1445e4858fd804bb4a2e35efa1d2f5254d8df6b)) by [@vdusek](https://github.com/vdusek), closes 
[#401](https://github.com/apify/crawlee-python/issues/401) - Add new model for `HttpHeaders` ([#544](https://github.com/apify/crawlee-python/pull/544)) ([854f2c1](https://github.com/apify/crawlee-python/commit/854f2c1e2e09cf398e04b1e153534282add1247e)) by [@vdusek](https://github.com/vdusek) ### 🐛 Bug Fixes - Call `error_handler` for `SessionError` ([#557](https://github.com/apify/crawlee-python/pull/557)) ([e75ac4b](https://github.com/apify/crawlee-python/commit/e75ac4b70cd48a4ca9f8245cea3c5f3c188b8824)) by [@vdusek](https://github.com/vdusek), closes [#546](https://github.com/apify/crawlee-python/issues/546) - Extend from `StrEnum` in `RequestState` to fix serialization ([#556](https://github.com/apify/crawlee-python/pull/556)) ([6bf35ba](https://github.com/apify/crawlee-python/commit/6bf35ba4a6913819706ebd1d2c1156a4c62f944e)) by [@vdusek](https://github.com/vdusek), closes [#551](https://github.com/apify/crawlee-python/issues/551) - Add equality check to UserData model ([#562](https://github.com/apify/crawlee-python/pull/562)) ([899a25c](https://github.com/apify/crawlee-python/commit/899a25ca63f570b3c4d8d56c85a838b371fd3924)) by [@janbuchar](https://github.com/janbuchar) ## [0.3.7](https://github.com/apify/crawlee-python/releases/tag/v0.3.7) (2024-09-25) ### 🐛 Bug Fixes - Improve `Request.user_data` serialization ([#540](https://github.com/apify/crawlee-python/pull/540)) ([de29c0e](https://github.com/apify/crawlee-python/commit/de29c0e6b737a9d2544c5382472618dde76eb2a5)) by [@janbuchar](https://github.com/janbuchar), closes [#524](https://github.com/apify/crawlee-python/issues/524) - Adopt new version of curl-cffi ([#543](https://github.com/apify/crawlee-python/pull/543)) ([f6fcf48](https://github.com/apify/crawlee-python/commit/f6fcf48d99bfcb4b8e75c5c9c38dc8c265164a10)) by [@vdusek](https://github.com/vdusek) ## [0.3.6](https://github.com/apify/crawlee-python/releases/tag/v0.3.6) (2024-09-19) ### 🚀 Features - Add HTTP/2 support for HTTPX client 
([#513](https://github.com/apify/crawlee-python/pull/513)) ([0eb0a33](https://github.com/apify/crawlee-python/commit/0eb0a33411096011198e52c393f35730f1a0b6ac)) by [@vdusek](https://github.com/vdusek), closes [#512](https://github.com/apify/crawlee-python/issues/512) - Expose extended unique key when creating a new Request ([#515](https://github.com/apify/crawlee-python/pull/515)) ([1807f41](https://github.com/apify/crawlee-python/commit/1807f419e47a815dd706d09acb0f3b3af8cfc691)) by [@vdusek](https://github.com/vdusek) - Add header generator and integrate it into HTTPX client ([#530](https://github.com/apify/crawlee-python/pull/530)) ([b63f9f9](https://github.com/apify/crawlee-python/commit/b63f9f98c6613e095546ef544eab271d433e3379)) by [@vdusek](https://github.com/vdusek), closes [#402](https://github.com/apify/crawlee-python/issues/402) ### 🐛 Bug Fixes - Use explicitly UTF-8 encoding in local storage ([#533](https://github.com/apify/crawlee-python/pull/533)) ([a3a0ab2](https://github.com/apify/crawlee-python/commit/a3a0ab2f6809b7a06319a77dfbf289df78638dea)) by [@vdusek](https://github.com/vdusek), closes [#532](https://github.com/apify/crawlee-python/issues/532) ## [0.3.5](https://github.com/apify/crawlee-python/releases/tag/v0.3.5) (2024-09-10) ### 🚀 Features - Memory usage limit configuration via environment variables ([#502](https://github.com/apify/crawlee-python/pull/502)) ([c62e554](https://github.com/apify/crawlee-python/commit/c62e5545de6a1836f0514ebd3dd695e4fd856844)) by [@janbuchar](https://github.com/janbuchar) ### 🐛 Bug Fixes - Http clients detect 4xx as errors by default ([#498](https://github.com/apify/crawlee-python/pull/498)) ([1895dca](https://github.com/apify/crawlee-python/commit/1895dca538f415feca37b4a030525c7c0d32f114)) by [@vdusek](https://github.com/vdusek), closes [#496](https://github.com/apify/crawlee-python/issues/496) - Correctly handle log level configuration ([#508](https://github.com/apify/crawlee-python/pull/508)) 
([7ea8fe6](https://github.com/apify/crawlee-python/commit/7ea8fe69f4a6146a1e417bebff60c08a85e2ca27)) by [@janbuchar](https://github.com/janbuchar) ## [0.3.4](https://github.com/apify/crawlee-python/releases/tag/v0.3.4) (2024-09-05) ### 🐛 Bug Fixes - Expose basic crawling context ([#501](https://github.com/apify/crawlee-python/pull/501)) ([b484535](https://github.com/apify/crawlee-python/commit/b484535dbacc5d206a026f55a1d3e58edd375e91)) by [@vdusek](https://github.com/vdusek) ## [0.3.3](https://github.com/apify/crawlee-python/releases/tag/v0.3.3) (2024-09-05) ### 🐛 Bug Fixes - Deduplicate requests by unique key before submitting them to the queue ([#499](https://github.com/apify/crawlee-python/pull/499)) ([6a3e0e7](https://github.com/apify/crawlee-python/commit/6a3e0e78490851c43cefb0497ce34ca52a31a25c)) by [@janbuchar](https://github.com/janbuchar) ## [0.3.2](https://github.com/apify/crawlee-python/releases/tag/v0.3.2) (2024-09-02) ### 🐛 Bug Fixes - Double incrementation of `item_count` ([#443](https://github.com/apify/crawlee-python/pull/443)) ([cd9adf1](https://github.com/apify/crawlee-python/commit/cd9adf15731e8c4a39cb142b6d1a62909cafdc51)) by [@cadlagtrader](https://github.com/cadlagtrader), closes [#442](https://github.com/apify/crawlee-python/issues/442) - Field alias in `BatchRequestsOperationResponse` ([#485](https://github.com/apify/crawlee-python/pull/485)) ([126a862](https://github.com/apify/crawlee-python/commit/126a8629cb5b989a0f9fe22156fb09731a34acd2)) by [@janbuchar](https://github.com/janbuchar) - JSON handling with Parsel ([#490](https://github.com/apify/crawlee-python/pull/490)) ([ebf5755](https://github.com/apify/crawlee-python/commit/ebf575539ffb631ae131a1b801cec8f21dd0cf4c)) by [@janbuchar](https://github.com/janbuchar), closes [#488](https://github.com/apify/crawlee-python/issues/488) ## [0.3.1](https://github.com/apify/crawlee-python/releases/tag/v0.3.1) (2024-08-30) ### 🚀 Features - Curl http client selects chrome impersonation by default 
([#473](https://github.com/apify/crawlee-python/pull/473)) ([82dc939](https://github.com/apify/crawlee-python/commit/82dc93957b1a380ea975564dea5c6ba4639be548)) by [@vdusek](https://github.com/vdusek) ## [0.3.0](https://github.com/apify/crawlee-python/releases/tag/v0.3.0) (2024-08-27) - Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v03) to ensure a smooth update. ### 🚀 Features - Implement ParselCrawler that adds support for Parsel ([#348](https://github.com/apify/crawlee-python/pull/348)) ([a3832e5](https://github.com/apify/crawlee-python/commit/a3832e527f022f32cce4a80055da3b7967b74522)) by [@asymness](https://github.com/asymness), closes [#335](https://github.com/apify/crawlee-python/issues/335) - Add support for filling a web form ([#453](https://github.com/apify/crawlee-python/pull/453)) ([5a125b4](https://github.com/apify/crawlee-python/commit/5a125b464b2619000b92dacad4c3a7faa1869f29)) by [@vdusek](https://github.com/vdusek), closes [#305](https://github.com/apify/crawlee-python/issues/305) ### 🐛 Bug Fixes - Remove indentation from statistics logging and print the data in tables ([#322](https://github.com/apify/crawlee-python/pull/322)) ([359b515](https://github.com/apify/crawlee-python/commit/359b515d647f064886f91441c2c01d3099e21035)) by [@TymeeK](https://github.com/TymeeK), closes [#306](https://github.com/apify/crawlee-python/issues/306) - Remove redundant log, fix format ([#408](https://github.com/apify/crawlee-python/pull/408)) ([8d27e39](https://github.com/apify/crawlee-python/commit/8d27e3928c605d6eceb51a948453a15024fa2aa2)) by [@janbuchar](https://github.com/janbuchar) - Dequeue items from RequestQueue in the correct order ([#411](https://github.com/apify/crawlee-python/pull/411)) ([96fc33e](https://github.com/apify/crawlee-python/commit/96fc33e2cc4631cae3c50dad9eace6407103a2a9)) by [@janbuchar](https://github.com/janbuchar) - Relative URLS supports & If not a URL, pass #417 
([#431](https://github.com/apify/crawlee-python/pull/431)) ([ccd8145](https://github.com/apify/crawlee-python/commit/ccd81454166ece68391cdffedb8efe9e663361d9)) by [@black7375](https://github.com/black7375), closes [#417](https://github.com/apify/crawlee-python/issues/417) - Typo in ProlongRequestLockResponse ([#458](https://github.com/apify/crawlee-python/pull/458)) ([30ccc3a](https://github.com/apify/crawlee-python/commit/30ccc3a4763bc3706a3bbeaedc95f9648f5ba09a)) by [@janbuchar](https://github.com/janbuchar) - Add missing `__all__` to top-level `__init__.py` file ([#463](https://github.com/apify/crawlee-python/pull/463)) ([353a1ce](https://github.com/apify/crawlee-python/commit/353a1ce28cd38c97ffb36dc1e6b0e86d3aef1a48)) by [@janbuchar](https://github.com/janbuchar) ### 🚜 Refactor - [**breaking**] RequestQueue and service management rehaul ([#429](https://github.com/apify/crawlee-python/pull/429)) ([b155a9f](https://github.com/apify/crawlee-python/commit/b155a9f602a163e891777bef5608072fb5d0156f)) by [@janbuchar](https://github.com/janbuchar), closes [#83](https://github.com/apify/crawlee-python/issues/83), [#174](https://github.com/apify/crawlee-python/issues/174), [#203](https://github.com/apify/crawlee-python/issues/203), [#423](https://github.com/apify/crawlee-python/issues/423) - [**breaking**] Declare private and public interface ([#456](https://github.com/apify/crawlee-python/pull/456)) ([d6738df](https://github.com/apify/crawlee-python/commit/d6738df30586934e8d1aba50b9cd437a0ea40400)) by [@vdusek](https://github.com/vdusek) ## [0.2.1](https://github.com/apify/crawlee-python/releases/tag/v0.2.1) (2024-08-05) ### 🐛 Bug Fixes - Do not import curl impersonate in http clients init ([#396](https://github.com/apify/crawlee-python/pull/396)) ([3bb8009](https://github.com/apify/crawlee-python/commit/3bb80093e61c1615f869ecd5ab80b061e0e5db36)) by [@vdusek](https://github.com/vdusek) ## [0.2.0](https://github.com/apify/crawlee-python/releases/tag/v0.2.0) (2024-08-05) ### 🚀
Features - Add new curl impersonate HTTP client ([#387](https://github.com/apify/crawlee-python/pull/387)) ([9c06260](https://github.com/apify/crawlee-python/commit/9c06260c0ee958522caa9322001a3186e9e43af4)) by [@vdusek](https://github.com/vdusek), closes [#292](https://github.com/apify/crawlee-python/issues/292) - **playwright:** `infinite_scroll` helper ([#393](https://github.com/apify/crawlee-python/pull/393)) ([34f74bd](https://github.com/apify/crawlee-python/commit/34f74bdcffb42a6c876a856e1c89923d9b3e60bd)) by [@janbuchar](https://github.com/janbuchar) ## [0.1.2](https://github.com/apify/crawlee-python/releases/tag/v0.1.2) (2024-07-30) ### 🚀 Features - Add URL validation ([#343](https://github.com/apify/crawlee-python/pull/343)) ([1514538](https://github.com/apify/crawlee-python/commit/15145388009c85ab54dc72ea8f2d07efd78f80fd)) by [@vdusek](https://github.com/vdusek), closes [#300](https://github.com/apify/crawlee-python/issues/300) ### 🐛 Bug Fixes - Minor log fix ([#341](https://github.com/apify/crawlee-python/pull/341)) ([0688bf1](https://github.com/apify/crawlee-python/commit/0688bf1860534ab6b2a85dc850bf3d56507ab154)) by [@souravjain540](https://github.com/souravjain540) - Also use error_handler for context pipeline errors ([#331](https://github.com/apify/crawlee-python/pull/331)) ([7a66445](https://github.com/apify/crawlee-python/commit/7a664456b45c7e429b4c90aaf1c09d5796b93e3d)) by [@janbuchar](https://github.com/janbuchar), closes [#296](https://github.com/apify/crawlee-python/issues/296) - Strip whitespace from href in enqueue_links ([#346](https://github.com/apify/crawlee-python/pull/346)) ([8a3174a](https://github.com/apify/crawlee-python/commit/8a3174aed24f9eb4f9ac415a79a58685a081cde2)) by [@janbuchar](https://github.com/janbuchar), closes [#337](https://github.com/apify/crawlee-python/issues/337) - Warn instead of crashing when an empty dataset is being exported ([#342](https://github.com/apify/crawlee-python/pull/342)) 
([22b95d1](https://github.com/apify/crawlee-python/commit/22b95d1948d4acd23a010898fa6af2f491e7f514)) by [@janbuchar](https://github.com/janbuchar), closes [#334](https://github.com/apify/crawlee-python/issues/334) - Avoid Github rate limiting in project bootstrapping test ([#364](https://github.com/apify/crawlee-python/pull/364)) ([992f07f](https://github.com/apify/crawlee-python/commit/992f07f266f7b8433d99e9a179f277995f81eb17)) by [@janbuchar](https://github.com/janbuchar) - Pass crawler configuration to storages ([#375](https://github.com/apify/crawlee-python/pull/375)) ([b2d3a52](https://github.com/apify/crawlee-python/commit/b2d3a52712abe21f4a4a5db4e20c80afe72c27de)) by [@janbuchar](https://github.com/janbuchar) - Purge request queue on repeated crawler runs ([#377](https://github.com/apify/crawlee-python/pull/377)) ([7ad3d69](https://github.com/apify/crawlee-python/commit/7ad3d6908e153c590bff72478af7ee3239a249bc)) by [@janbuchar](https://github.com/janbuchar), closes [#152](https://github.com/apify/crawlee-python/issues/152) ## [0.1.1](https://github.com/apify/crawlee-python/releases/tag/v0.1.1) (2024-07-19) ### 🚀 Features - Expose crawler log ([#316](https://github.com/apify/crawlee-python/pull/316)) ([ae475fa](https://github.com/apify/crawlee-python/commit/ae475fa450c4fe053620d7b7eb475f3d58804674)) by [@vdusek](https://github.com/vdusek), closes [#303](https://github.com/apify/crawlee-python/issues/303) - Integrate proxies into `PlaywrightCrawler` ([#325](https://github.com/apify/crawlee-python/pull/325)) ([2e072b6](https://github.com/apify/crawlee-python/commit/2e072b6ad7d5d82d96a7b489cafb87e7bfaf6e83)) by [@vdusek](https://github.com/vdusek) - Blocking detection for playwright crawler ([#328](https://github.com/apify/crawlee-python/pull/328)) ([49ff6e2](https://github.com/apify/crawlee-python/commit/49ff6e25c12a97550eee718d64bb4130f9990189)) by [@vdusek](https://github.com/vdusek), closes [#239](https://github.com/apify/crawlee-python/issues/239) ### 🐛 Bug 
Fixes - Pylance reportPrivateImportUsage errors ([#313](https://github.com/apify/crawlee-python/pull/313)) ([09d7203](https://github.com/apify/crawlee-python/commit/09d72034d5db8c47f461111ec093761935a3e2ef)) by [@vdusek](https://github.com/vdusek), closes [#283](https://github.com/apify/crawlee-python/issues/283) - Set httpx logging to warning ([#314](https://github.com/apify/crawlee-python/pull/314)) ([1585def](https://github.com/apify/crawlee-python/commit/1585defffb2c0c844fab39bbc0e0b793d6169cbf)) by [@vdusek](https://github.com/vdusek), closes [#302](https://github.com/apify/crawlee-python/issues/302) - Byte size serialization in MemoryInfo ([#245](https://github.com/apify/crawlee-python/pull/245)) ([a030174](https://github.com/apify/crawlee-python/commit/a0301746c2df076d281708344fb906e1c42e0790)) by [@janbuchar](https://github.com/janbuchar) - Project bootstrapping in existing folder ([#318](https://github.com/apify/crawlee-python/pull/318)) ([c630818](https://github.com/apify/crawlee-python/commit/c630818538e0c37217ab73f6c6da05505ed8b364)) by [@janbuchar](https://github.com/janbuchar), closes [#301](https://github.com/apify/crawlee-python/issues/301) ## [0.1.0](https://github.com/apify/crawlee-python/releases/tag/v0.1.0) (2024-07-08) ### 🚀 Features - Project templates ([#237](https://github.com/apify/crawlee-python/pull/237)) ([c23c12c](https://github.com/apify/crawlee-python/commit/c23c12c66688f825f74deb39702f07cc6c6bbc46)) by [@janbuchar](https://github.com/janbuchar), closes [#215](https://github.com/apify/crawlee-python/issues/215) ### 🐛 Bug Fixes - CLI UX improvements ([#271](https://github.com/apify/crawlee-python/pull/271)) ([123d515](https://github.com/apify/crawlee-python/commit/123d515b224c663577bfe0fab387d0aa11e5e4d4)) by [@janbuchar](https://github.com/janbuchar), closes [#267](https://github.com/apify/crawlee-python/issues/267) - Error handling in CLI and templates documentation ([#273](https://github.com/apify/crawlee-python/pull/273)) 
([61083c3](https://github.com/apify/crawlee-python/commit/61083c33434d431a118538f15bfa9a68c312ab03)) by [@vdusek](https://github.com/vdusek), closes [#268](https://github.com/apify/crawlee-python/issues/268) ## [0.0.7](https://github.com/apify/crawlee-python/releases/tag/v0.0.7) (2024-06-27) ### 🐛 Bug Fixes - Do not wait for consistency in request queue ([#235](https://github.com/apify/crawlee-python/pull/235)) ([03ff138](https://github.com/apify/crawlee-python/commit/03ff138aadaf8e915abc7fafb854fe12947b9696)) by [@vdusek](https://github.com/vdusek) - Selector handling in BeautifulSoupCrawler enqueue_links ([#231](https://github.com/apify/crawlee-python/pull/231)) ([896501e](https://github.com/apify/crawlee-python/commit/896501edb44f801409fec95cb3e5f2bcfcb4188d)) by [@janbuchar](https://github.com/janbuchar), closes [#230](https://github.com/apify/crawlee-python/issues/230) - Handle blocked request ([#234](https://github.com/apify/crawlee-python/pull/234)) ([f8ef79f](https://github.com/apify/crawlee-python/commit/f8ef79ffcb7410713182af716d37dbbaad66fdbc)) by [@Mantisus](https://github.com/Mantisus) - Improve AutoscaledPool state management ([#241](https://github.com/apify/crawlee-python/pull/241)) ([fdea3d1](https://github.com/apify/crawlee-python/commit/fdea3d16b13afe70039d864de861486c760aa0ba)) by [@janbuchar](https://github.com/janbuchar), closes [#236](https://github.com/apify/crawlee-python/issues/236) ## [0.0.6](https://github.com/apify/crawlee-python/releases/tag/v0.0.6) (2024-06-25) ### 🚀 Features - Maintain a global configuration instance ([#207](https://github.com/apify/crawlee-python/pull/207)) ([e003aa6](https://github.com/apify/crawlee-python/commit/e003aa63d859bec8199d0c890b5c9604f163ccd3)) by [@janbuchar](https://github.com/janbuchar) - Add max requests per crawl to `BasicCrawler` ([#198](https://github.com/apify/crawlee-python/pull/198)) ([b5b3053](https://github.com/apify/crawlee-python/commit/b5b3053f43381601274e4034d07b4bf41720c7c2)) by 
[@vdusek](https://github.com/vdusek) - Add support decompress *br* response content ([#226](https://github.com/apify/crawlee-python/pull/226)) ([a3547b9](https://github.com/apify/crawlee-python/commit/a3547b9c882dc5333a4fcd1223687ef85e79138d)) by [@Mantisus](https://github.com/Mantisus) - BasicCrawler.export_data helper ([#222](https://github.com/apify/crawlee-python/pull/222)) ([237ec78](https://github.com/apify/crawlee-python/commit/237ec789b7dccc17cc57ef47ec56bcf73c6ca006)) by [@janbuchar](https://github.com/janbuchar), closes [#211](https://github.com/apify/crawlee-python/issues/211) - Automatic logging setup ([#229](https://github.com/apify/crawlee-python/pull/229)) ([a67b72f](https://github.com/apify/crawlee-python/commit/a67b72faacd75674071bae496d59e1c60636350c)) by [@janbuchar](https://github.com/janbuchar), closes [#214](https://github.com/apify/crawlee-python/issues/214) ### 🐛 Bug Fixes - Handling of relative URLs in add_requests ([#213](https://github.com/apify/crawlee-python/pull/213)) ([8aa8c57](https://github.com/apify/crawlee-python/commit/8aa8c57f44149caa0e01950a5d773726f261699a)) by [@janbuchar](https://github.com/janbuchar), closes [#202](https://github.com/apify/crawlee-python/issues/202), [#204](https://github.com/apify/crawlee-python/issues/204) - Graceful exit in BasicCrawler.run ([#224](https://github.com/apify/crawlee-python/pull/224)) ([337286e](https://github.com/apify/crawlee-python/commit/337286e1b721cf61f57bc0ff3ead08df1f4f5448)) by [@janbuchar](https://github.com/janbuchar), closes [#212](https://github.com/apify/crawlee-python/issues/212) ## [0.0.5](https://github.com/apify/crawlee-python/releases/tag/v0.0.5) (2024-06-21) ### 🚀 Features - Browser rotation and better browser abstraction ([#177](https://github.com/apify/crawlee-python/pull/177)) ([a42ae6f](https://github.com/apify/crawlee-python/commit/a42ae6f53c5e24678f04011c3684290b68684016)) by [@vdusek](https://github.com/vdusek), closes 
[#131](https://github.com/apify/crawlee-python/issues/131) - Add emit persist state event to event manager ([#181](https://github.com/apify/crawlee-python/pull/181)) ([97f6c68](https://github.com/apify/crawlee-python/commit/97f6c68275b65f76c62b6d16d94354fc7f00d336)) by [@vdusek](https://github.com/vdusek) - Batched request addition in RequestQueue ([#186](https://github.com/apify/crawlee-python/pull/186)) ([f48c806](https://github.com/apify/crawlee-python/commit/f48c8068fe16ce3dd4c46fc248733346c0621411)) by [@vdusek](https://github.com/vdusek) - Add storage helpers to crawler & context ([#192](https://github.com/apify/crawlee-python/pull/192)) ([f8f4066](https://github.com/apify/crawlee-python/commit/f8f4066d8b32d6e7dc0d999a5aa8db75f99b43b8)) by [@vdusek](https://github.com/vdusek), closes [#98](https://github.com/apify/crawlee-python/issues/98), [#100](https://github.com/apify/crawlee-python/issues/100), [#172](https://github.com/apify/crawlee-python/issues/172) - Handle all supported configuration options ([#199](https://github.com/apify/crawlee-python/pull/199)) ([23c901c](https://github.com/apify/crawlee-python/commit/23c901cd68cf14b4041ee03568622ee32822e94b)) by [@janbuchar](https://github.com/janbuchar), closes [#84](https://github.com/apify/crawlee-python/issues/84) - Add Playwright's enqueue links helper ([#196](https://github.com/apify/crawlee-python/pull/196)) ([849d73c](https://github.com/apify/crawlee-python/commit/849d73cc7d137171b98f9f2ab85374e8beec0dad)) by [@vdusek](https://github.com/vdusek) ### 🐛 Bug Fixes - Tmp path in tests is working ([#164](https://github.com/apify/crawlee-python/pull/164)) ([382b6f4](https://github.com/apify/crawlee-python/commit/382b6f48174bdac3931cc379eaf770ab06f826dc)) by [@vdusek](https://github.com/vdusek), closes [#159](https://github.com/apify/crawlee-python/issues/159) - Add explicit err msgs for missing pckg extras during import ([#165](https://github.com/apify/crawlee-python/pull/165)) 
([200ebfa](https://github.com/apify/crawlee-python/commit/200ebfa63d6e20e17c8ca29544ef7229ed0df308)) by [@vdusek](https://github.com/vdusek), closes [#155](https://github.com/apify/crawlee-python/issues/155) - Make timedelta_ms accept string-encoded numbers ([#190](https://github.com/apify/crawlee-python/pull/190)) ([d8426ff](https://github.com/apify/crawlee-python/commit/d8426ff41e36f701af459ad17552fee39637674d)) by [@janbuchar](https://github.com/janbuchar) - **deps:** Update dependency psutil to v6 ([#193](https://github.com/apify/crawlee-python/pull/193)) ([eb91f51](https://github.com/apify/crawlee-python/commit/eb91f51e19da406e3f9293e5336c1f85fc7885a4)) by [@renovate[bot]](https://github.com/renovate[bot]) - Improve compatibility between ProxyConfiguration and its SDK counterpart ([#201](https://github.com/apify/crawlee-python/pull/201)) ([1a76124](https://github.com/apify/crawlee-python/commit/1a76124080d561e0153a4dda0bdb0d9863c3aab6)) by [@janbuchar](https://github.com/janbuchar) - Correct return type of storage get_info methods ([#200](https://github.com/apify/crawlee-python/pull/200)) ([332673c](https://github.com/apify/crawlee-python/commit/332673c4fb519b80846df7fb8cd8bb521538a8a4)) by [@janbuchar](https://github.com/janbuchar) - Type error in statistics persist state ([#206](https://github.com/apify/crawlee-python/pull/206)) ([96ceef6](https://github.com/apify/crawlee-python/commit/96ceef697769cd57bd1a50b6615cf1e70549bd2d)) by [@vdusek](https://github.com/vdusek), closes [#194](https://github.com/apify/crawlee-python/issues/194) ## [0.0.4](https://github.com/apify/crawlee-python/releases/tag/v0.0.4) (2024-05-30) ### 🚀 Features - Capture statistics about the crawler run ([#142](https://github.com/apify/crawlee-python/pull/142)) ([eeebe9b](https://github.com/apify/crawlee-python/commit/eeebe9b1e24338d68a0a55228bbfc717f4d9d295)) by [@janbuchar](https://github.com/janbuchar), closes [#97](https://github.com/apify/crawlee-python/issues/97) - Proxy 
configuration ([#156](https://github.com/apify/crawlee-python/pull/156)) ([5c3753a](https://github.com/apify/crawlee-python/commit/5c3753a5527b1d01f7260b9e4c566e43f956a5e8)) by [@janbuchar](https://github.com/janbuchar), closes [#136](https://github.com/apify/crawlee-python/issues/136) - Add first version of browser pool and playwright crawler ([#161](https://github.com/apify/crawlee-python/pull/161)) ([2d2a050](https://github.com/apify/crawlee-python/commit/2d2a0505b1c2b1529a8835163ca97d1ec2a6e44a)) by [@vdusek](https://github.com/vdusek) ## [0.0.3](https://github.com/apify/crawlee-python/releases/tag/v0.0.3) (2024-05-13) ### 🚀 Features - AutoscaledPool implementation ([#55](https://github.com/apify/crawlee-python/pull/55)) ([621ada2](https://github.com/apify/crawlee-python/commit/621ada2bd1ba4e2346fb948dc02686e2b37e3856)) by [@janbuchar](https://github.com/janbuchar), closes [#19](https://github.com/apify/crawlee-python/issues/19) - Add Snapshotter ([#20](https://github.com/apify/crawlee-python/pull/20)) ([492ee38](https://github.com/apify/crawlee-python/commit/492ee38c893b8f54e9583dd492576c5106e29881)) by [@vdusek](https://github.com/vdusek) - Implement BasicCrawler ([#56](https://github.com/apify/crawlee-python/pull/56)) ([6da971f](https://github.com/apify/crawlee-python/commit/6da971fcddbf8b6795346c88e295dada28e7b1d3)) by [@janbuchar](https://github.com/janbuchar), closes [#30](https://github.com/apify/crawlee-python/issues/30) - BeautifulSoupCrawler ([#107](https://github.com/apify/crawlee-python/pull/107)) ([4974dfa](https://github.com/apify/crawlee-python/commit/4974dfa20c7911ee073438fd388e60ba4b2c07db)) by [@janbuchar](https://github.com/janbuchar), closes [#31](https://github.com/apify/crawlee-python/issues/31) - Add_requests and enqueue_links context helpers ([#120](https://github.com/apify/crawlee-python/pull/120)) ([dc850a5](https://github.com/apify/crawlee-python/commit/dc850a5778b105ff09e19eaecbb0a12d94798a62)) by 
[@janbuchar](https://github.com/janbuchar), closes [#5](https://github.com/apify/crawlee-python/issues/5) - Use SessionPool in BasicCrawler ([#128](https://github.com/apify/crawlee-python/pull/128)) ([9fc4648](https://github.com/apify/crawlee-python/commit/9fc464837e596b3b5a7cd818b6d617550e249352)) by [@janbuchar](https://github.com/janbuchar), closes [#110](https://github.com/apify/crawlee-python/issues/110) - Add base storage client and resource subclients ([#138](https://github.com/apify/crawlee-python/pull/138)) ([44d6597](https://github.com/apify/crawlee-python/commit/44d65974e4837576918069d7e63f8b804964971a)) by [@vdusek](https://github.com/vdusek) ### 🐛 Bug Fixes - **deps:** Update dependency docutils to ^0.21.0 ([#101](https://github.com/apify/crawlee-python/pull/101)) ([534b613](https://github.com/apify/crawlee-python/commit/534b613f7cdfe7adf38b548ee48537db3167d1ec)) by [@renovate[bot]](https://github.com/renovate[bot]) - **deps:** Update dependency eval-type-backport to ^0.2.0 ([#124](https://github.com/apify/crawlee-python/pull/124)) ([c9e69a8](https://github.com/apify/crawlee-python/commit/c9e69a8534f4d82d9a6314947d76a86bcb744607)) by [@renovate[bot]](https://github.com/renovate[bot]) - Fire local SystemInfo events every second ([#144](https://github.com/apify/crawlee-python/pull/144)) ([f1359fa](https://github.com/apify/crawlee-python/commit/f1359fa7eea23f8153ad711287c073e45d498401)) by [@vdusek](https://github.com/vdusek) - Storage manager & purging the defaults ([#150](https://github.com/apify/crawlee-python/pull/150)) ([851042f](https://github.com/apify/crawlee-python/commit/851042f25ad07e25651768e476f098ef0ed21914)) by [@vdusek](https://github.com/vdusek) ================================================ FILE: CONTRIBUTING.md ================================================ # Development Here you'll find a contributing guide to get started with development. 
## Environment For local development, it is required to have Python 3.10 (or a later version) installed. We use [uv](https://docs.astral.sh/uv/) for project management. Install it and set up your IDE accordingly. We use [Poe the Poet](https://poethepoet.natn.io/) as a task runner, similar to npm scripts in `package.json`. All tasks are defined in `pyproject.toml` under `[tool.poe.tasks]` and can be run with `uv run poe <task>`. ### Available tasks | Task | Description | | ---- | ----------- | | `install-dev` | Install development dependencies | | `check-code` | Run lint, type-check, and unit-tests | | `lint` | Run linter | | `format` | Fix lint issues and format code | | `type-check` | Run type checker | | `unit-tests` | Run unit tests | | `unit-tests-cov` | Run unit tests with coverage | | `e2e-templates-tests` | Run end-to-end template tests | | `build-docs` | Build documentation website | | `run-docs` | Run documentation website locally | | `build` | Build package | | `clean` | Remove build artifacts and clean caches | ## Dependencies To install this package and its development dependencies, run: ```sh uv run poe install-dev ``` ## Code checking To execute all code checking tools together, run: ```sh uv run poe check-code ``` ### Linting We utilize [ruff](https://docs.astral.sh/ruff/) for linting, which analyzes code for potential issues and enforces consistent style. Refer to `pyproject.toml` for configuration details. To run linting: ```sh uv run poe lint ``` ### Formatting Our automated code formatting also leverages [ruff](https://docs.astral.sh/ruff/), ensuring uniform style and addressing fixable linting issues. Configuration specifics are outlined in `pyproject.toml`. To run formatting: ```sh uv run poe format ``` ### Type checking Type checking is handled by [ty](https://docs.astral.sh/ty/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`.
To run type checking: ```sh uv run poe type-check ``` ### Unit tests We use [pytest](https://docs.pytest.org/) as a testing framework with many plugins. Check `pyproject.toml` for configuration details and installed plugins. To run unit tests: ```sh uv run poe unit-tests ``` To run unit tests with coverage report: ```sh uv run poe unit-tests-cov ``` ## End-to-end tests Prerequisites: - [apify-cli](https://docs.apify.com/cli/docs/installation) installed and available in `PATH` - Set `APIFY_TEST_USER_API_TOKEN` to your [Apify API token](https://docs.apify.com/platform/integrations/api#api-token) To run end-to-end tests: ```sh uv run poe e2e-templates-tests ``` ## Documentation We follow the [Google docstring format](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) for code documentation. All user-facing classes and functions must be documented. Documentation standards are enforced using [Ruff](https://docs.astral.sh/ruff/). Our API documentation is generated from these docstrings using [pydoc-markdown](https://pypi.org/project/pydoc-markdown/) with custom post-processing. Additional content is provided through markdown files in the `docs/` directory. The final documentation is rendered using [Docusaurus](https://docusaurus.io/) and published to GitHub Pages. To run the documentation locally, ensure you have `Node.js` 20+ installed, then run: ```sh uv run poe run-docs ``` ## Commits We use [Conventional Commits](https://www.conventionalcommits.org/) format for commit messages. This convention is used to automatically determine version bumps during the release process. 
### Available commit types | Type | Description | | ---- | ----------- | | `feat` | A new feature | | `fix` | A bug fix | | `docs` | Documentation only changes | | `style` | Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc) | | `refactor` | A code change that neither fixes a bug nor adds a feature | | `perf` | A code change that improves performance | | `test` | Adding missing tests or correcting existing tests | | `build` | Changes that affect the build system or external dependencies (example scopes: gulp, broccoli, npm) | | `ci` | Changes to our CI configuration files and scripts (example scopes: Travis, Circle, BrowserStack, SauceLabs) | | `chore` | Other changes that don't modify src or test files | | `revert` | Reverts a previous commit | ## Release process Publishing new versions to [PyPI](https://pypi.org/project/crawlee) is automated through GitHub Actions. - **Beta releases**: On each commit to the master branch, a new beta release is automatically published. The version number is determined based on the latest release and conventional commits. The beta version suffix is incremented by 1 from the last beta release on PyPI. - **Stable releases**: A stable version release may be created by triggering the `release` GitHub Actions workflow. The version number is determined based on the latest release and conventional commits (`auto` release type), or it may be overridden using the `custom` release type. ### Publishing to PyPI manually 1. **Do not do this unless absolutely necessary.** In all conceivable scenarios, you should use the `release` workflow instead. 2. **Make sure you know what you're doing.** 3. Update the version number: - Modify the `version` field under `project` in `pyproject.toml`. ```toml [project] name = "crawlee" version = "x.z.y" ``` 4. Build the package: ```sh uv run poe build ``` 5. 
Upload to PyPI: ```sh uv publish --token YOUR_API_TOKEN ``` ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2023 Apify Technologies s.r.o. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================

Crawlee
A web scraping and browser automation library

apify%2Fcrawlee-python | Trendshift

PyPI package version PyPI package downloads Codecov report PyPI Python version Chat on Discord

Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.** Your crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it. > 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev/python/)** 👈 We also have a TypeScript implementation of Crawlee, which you can explore and utilize for your projects. For more information, visit our GitHub repository: [Crawlee for JS/TS on GitHub](https://github.com/apify/crawlee). ## Installation We recommend visiting the [Introduction tutorial](https://crawlee.dev/python/docs/introduction) in the Crawlee documentation for more information. Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal. To install Crawlee with all features, run the following command: ```sh python -m pip install 'crawlee[all]' ``` Then, install the [Playwright](https://playwright.dev/) dependencies: ```sh playwright install ``` Verify that Crawlee is successfully installed: ```sh python -c 'import crawlee; print(crawlee.__version__)' ``` For detailed installation instructions see the [Setting up](https://crawlee.dev/python/docs/introduction/setting-up) documentation page. ### With Crawlee CLI The quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared templates.
First, ensure you have [uv](https://pypi.org/project/uv/) installed: ```sh uv --help ``` If [uv](https://pypi.org/project/uv/) is not installed, follow the official [installation guide](https://docs.astral.sh/uv/getting-started/installation/). Then, run the CLI and choose from the available templates: ```sh uvx 'crawlee[cli]' create my-crawler ``` If you already have `crawlee` installed, you can spin it up by running: ```sh crawlee create my-crawler ``` ## Examples Here are some practical examples to help you get started with different types of crawlers in Crawlee. Each example demonstrates how to set up and run a crawler for specific use cases, whether you need to handle simple HTML pages or interact with JavaScript-heavy sites. A crawler run will create a `storage/` directory in your current working directory. ### BeautifulSoupCrawler The [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) downloads web pages using an HTTP library and provides HTML-parsed content to the user. By default it uses [`HttpxHttpClient`](https://crawlee.dev/python/api/class/HttpxHttpClient) for HTTP communication and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) for parsing HTML. It is ideal for projects that require efficient extraction of data from HTML content. This crawler has very good performance since it does not use a browser. However, if you need to execute client-side JavaScript to get your content, this is not going to be enough and you will need to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler). Also if you want to use this crawler, make sure you install `crawlee` with the `beautifulsoup` extra. ```python import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. 
max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, } # Push the extracted data to the default dataset. await context.push_data(data) # Enqueue all links found on the page. await context.enqueue_links() # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ``` ### PlaywrightCrawler The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) uses a headless browser to download web pages and provides an API for data extraction. It is built on [Playwright](https://playwright.dev/), an automation library designed for managing headless browsers. It excels at retrieving web pages that rely on client-side JavaScript for content generation, or tasks requiring interaction with JavaScript-driven content. For scenarios where JavaScript execution is unnecessary or higher performance is required, consider using the [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler). Also if you want to use this crawler, make sure you install `crawlee` with `playwright` extra. ```python import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. 
data = { 'url': context.request.url, 'title': await context.page.title(), } # Push the extracted data to the default dataset. await context.push_data(data) # Enqueue all links found on the page. await context.enqueue_links() # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ``` ### More examples Explore our [Examples](https://crawlee.dev/python/docs/examples) page in the Crawlee documentation for a wide range of additional use cases and demonstrations. ## Features Why is Crawlee the preferred choice for web scraping and crawling? ### Why use Crawlee instead of just a random HTTP library with an HTML parser? - Unified interface for **HTTP & headless browser** crawling. - Automatic **parallel crawling** based on available system resources. - Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking). - Automatic **retries** on errors or when you’re getting blocked. - Integrated **proxy rotation** and session management. - Configurable **request routing** - direct URLs to the appropriate handlers. - Persistent **queue for URLs** to crawl. - Pluggable **storage** of both tabular data and files. - Robust **error handling**. ### Why to use Crawlee rather than Scrapy? - **Asyncio-based** – Leveraging the standard [Asyncio](https://docs.python.org/3/library/asyncio.html) library, Crawlee delivers better performance and seamless compatibility with other modern asynchronous libraries. - **Type hints** – Newer project built with modern Python, and complete type hint coverage for a better developer experience. - **Simple integration** – Crawlee crawlers are regular Python scripts, requiring no additional launcher executor. This flexibility allows you to integrate a crawler directly into other applications. 
- **State persistence** – Supports state persistence during interruptions, saving time and costs by avoiding the need to restart scraping pipelines from scratch after an issue. - **Organized data storages** – Allows saving of multiple types of results in a single scraping run. Offers several storing options (see [datasets](https://crawlee.dev/python/api/class/Dataset) & [key-value stores](https://crawlee.dev/python/api/class/KeyValueStore)). ## Running on the Apify platform Crawlee is open-source and runs anywhere, but since it's developed by [Apify](https://apify.com), it's easy to set up on the Apify platform and run in the cloud. Visit the [Apify SDK website](https://docs.apify.com/sdk/python/) to learn more about deploying Crawlee to the Apify platform. ## Support If you find any bug or issue with Crawlee, please [submit an issue on GitHub](https://github.com/apify/crawlee-python/issues). For questions, you can ask on [Stack Overflow](https://stackoverflow.com/questions/tagged/apify), in GitHub Discussions or you can join our [Discord server](https://discord.com/invite/jyEM2PRvMU). ## Contributing Your code contributions are welcome, and you'll be praised for eternity! If you have any ideas for improvements, either submit an issue or create a pull request. For contribution guidelines and the code of conduct, see [CONTRIBUTING.md](https://github.com/apify/crawlee-python/blob/master/CONTRIBUTING.md). ## License This project is licensed under the Apache License 2.0 - see the [LICENSE](https://github.com/apify/crawlee-python/blob/master/LICENSE) file for details. 
================================================ FILE: codecov.yaml ================================================ coverage: status: project: default: target: auto threshold: 0.10% # tolerate up to 0.10% decrease informational: true # CI check reports status but never fails patch: default: target: 50% # error only if patch coverage drops below 50% informational: true # CI check reports status but never fails ================================================ FILE: docs/deployment/apify_platform.mdx ================================================ --- id: apify-platform title: Apify platform description: Apify platform - large-scale and high-performance web scraping --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import LogWithConfigExample from '!!raw-loader!./code_examples/apify/log_with_config_example.py'; import CrawlerAsActorExample from '!!raw-loader!./code_examples/apify/crawler_as_actor_example.py'; import ProxyExample from '!!raw-loader!./code_examples/apify/proxy_example.py'; import ProxyAdvancedExample from '!!raw-loader!./code_examples/apify/proxy_advanced_example.py'; Apify is a [platform](https://apify.com) built to serve large-scale and high-performance web scraping and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), convenient request and result storages, [proxies](../guides/proxy-management), scheduling, webhooks and [more](https://docs.apify.com/), accessible through a [web interface](https://console.apify.com) or an [API](https://docs.apify.com/api). While we think that the Apify platform is super cool, and it's definitely worth signing up for a [free account](https://console.apify.com/sign-up), **Crawlee is and will always be open source**, runnable locally or on any cloud infrastructure. :::note We do not test Crawlee in other cloud environments such as Lambda or on specific architectures such as Raspberry PI. 
We strive to make it work, but there are no guarantees. ::: ## Requirements To run your Crawlee code on Apify platform, you need an Apify account. If you don't have one yet, you can sign up [here](https://console.apify.com/sign-up). Additionally, you must have the [Apify CLI](https://docs.apify.com/cli/) installed on your computer. For installation instructions, refer to the [Installation guide](https://docs.apify.com/cli/docs/installation). Finally, ensure that the [Apify SDK](https://docs.apify.com/sdk/python/) is installed in your project. You can install it using `pip`: ```bash pip install apify ``` ## Logging into Apify platform from Crawlee To access your [Apify account](https://console.apify.com/sign-up) from Crawlee, you must provide credentials - your [API token](https://console.apify.com/account?tab=integrations). You can do that either by utilizing [Apify CLI](https://docs.apify.com/cli/) or with environment variables. Once you provide credentials to your Apify CLI installation, you will be able to use all the Apify platform features, such as calling Actors, saving to cloud storages, using Apify proxies, setting up webhooks and so on. ### Log in with CLI Apify CLI allows you to log in to your Apify account on your computer. If you then run your crawler using the CLI, your credentials will automatically be added. ```bash npm install -g apify-cli apify login -t YOUR_API_TOKEN ``` ### Log in with environment variables Alternatively, you can always provide credentials to your Actor by setting the [`APIFY_TOKEN`](#apify_token) environment variable to your API token. > There's also the [`APIFY_PROXY_PASSWORD`](#apify_proxy_password) > environment variable. Actor automatically infers that from your token, but it can be useful > when you need to access proxies from a different account than your token represents. 
### Log in with Configuration Another option is to use the [`Configuration`](https://docs.apify.com/sdk/python/reference/class/Configuration) instance and set your api token there. {LogWithConfigExample} ## What is an Actor When you deploy your script to the Apify platform, it becomes an [Actor](https://apify.com/actors). An Actor is a serverless microservice that accepts an input and produces an output. It can run for a few seconds, hours or even infinitely. An Actor can perform anything from a simple action such as filling out a web form or sending an email, to complex operations such as crawling an entire website and removing duplicates from a large dataset. Actors can be shared in the [Apify Store](https://apify.com/store) so that other people can use them. But don't worry, if you share your Actor in the store and somebody uses it, it runs under their account, not yours. **Related links** - [Store of existing Actors](https://apify.com/store) - [Documentation](https://docs.apify.com/actors) - [View Actors in Apify Console](https://console.apify.com/actors) - [API reference](https://apify.com/docs/api/v2#/reference/actors) ## Running an Actor locally First let's create a boilerplate of the new Actor. You could use Apify CLI and just run: ```bash apify create my-hello-world ``` The CLI will prompt you to select a project boilerplate template - let's pick "Crawlee + BeautifulSoup". The tool will create a directory called `my-hello-world` with Python project files. You can run the Actor as follows: ```bash cd my-hello-world apify run ``` ## Running Crawlee code as an Actor For running Crawlee code as an Actor on [Apify platform](https://apify.com/actors) you need to wrap the body of the main function of your crawler with `async with Actor`. :::info NOTE Adding `async with Actor` is the only important thing needed to run it on Apify platform as an Actor. It is needed to initialize your Actor (e.g. 
to set the correct storage implementation) and to correctly handle exiting the process. ::: Let's look at the `BeautifulSoupCrawler` example from the [Quick start](../quick-start) guide: {CrawlerAsActorExample} Note that you could also run your Actor (that is using Crawlee) locally with Apify CLI. You could start it via the following command in your project folder: ```bash apify run ``` ## Deploying an Actor to Apify platform Now (assuming you are already logged in to your Apify account) you can easily deploy your code to the Apify platform by running: ```bash apify push ``` Your script will be uploaded to and built on the Apify platform so that it can be run there. For more information, view the [Apify Actor](https://docs.apify.com/cli) documentation. ## Usage on Apify platform You can also develop your Actor in an online code editor directly on the platform (you'll need an Apify Account). Let's go to the [Actors](https://console.apify.com/actors) page in the app, click *Create new* and then go to the *Source* tab and start writing the code or paste one of the examples from the [Examples](../examples) section. ## Storages There are several things worth mentioning here. 
### Helper functions for default Key-Value Store and Dataset To simplify access to the _default_ storages, instead of using the helper functions of respective storage classes, you could use: - [`Actor.set_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#set_value), [`Actor.get_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_value), [`Actor.get_input()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_input) for [`Key-Value Store`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) - [`Actor.push_data()`](https://docs.apify.com/sdk/python/reference/class/Actor#push_data) for [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset) ### Using platform storage in a local Actor When you plan to use the platform storage while developing and running your Actor locally, you should use [`Actor.open_key_value_store()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_key_value_store), [`Actor.open_dataset()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_dataset) and [`Actor.open_request_queue()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_request_queue) to open the respective storage. Each of these methods allows you to pass the `force_cloud` keyword argument. If set to `True`, cloud storage will be used instead of the folder on the local disk. :::note If you don't plan to force usage of the platform storages when running the Actor locally, there is no need to use the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) class for it. The Crawlee variants `KeyValueStore.open()`, `Dataset.open()` and `RequestQueue.open()` will work the same. 
::: {/* ### Getting public url of an item in the platform storage If you need to share a link to some file stored in a [Key-Value](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) Store on Apify platform, you can use [`get_public_url()`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore#get_public_url) method. It accepts only one parameter: `key` - the key of the item you want to share. {GetPublicUrlSource} */} ### Exporting dataset data When the `Dataset` is stored on the [Apify platform](https://apify.com/actors), you can export its data to the following formats: HTML, JSON, CSV, Excel, XML and RSS. The datasets are displayed on the Actor run details page and in the [Storage](https://console.apify.com/storage) section in the Apify Console. The actual data is exported using the [Get dataset items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint. This way you can easily share the crawling results. **Related links** - [Apify platform storage documentation](https://docs.apify.com/storage) - [View storage in Apify Console](https://console.apify.com/storage) - [Key-value stores API reference](https://apify.com/docs/api/v2#/reference/key-value-stores) - [Datasets API reference](https://docs.apify.com/api/v2#/reference/datasets) - [Request queues API reference](https://docs.apify.com/api/v2#/reference/request-queues) ## Environment variables The following describes select environment variables set by the Apify platform. For a complete list, see the [Environment variables](https://docs.apify.com/platform/actors/development/programming-interface/environment-variables) section in the Apify platform documentation. :::note It's important to notice that `CRAWLEE_` environment variables don't need to be replaced with equivalent `APIFY_` ones. Likewise, Crawlee understands `APIFY_` environment variables. ::: ### `APIFY_TOKEN` The API token for your Apify account. 
It is used to access the Apify API, e.g. to access cloud storage or to run an Actor on the Apify platform. You can find your API token on the [Account Settings / Integrations](https://console.apify.com/account?tab=integrations) page. ### Combinations of `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR` By combining the env vars in various ways, you can greatly influence the Actor's behavior. | Env Vars | API | Storages | | --------------------------------------- | --- | ---------------- | | none OR `CRAWLEE_STORAGE_DIR` | no | local | | `APIFY_TOKEN` | yes | Apify platform | | `APIFY_TOKEN` AND `CRAWLEE_STORAGE_DIR` | yes | local + platform | When using both `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`, you can use all the Apify platform features and your data will be stored locally by default. If you want to access platform storages, you can use the `force_cloud=True` option in their respective functions. ### `APIFY_PROXY_PASSWORD` Optional password to [Apify Proxy](https://docs.apify.com/proxy) for IP address rotation. Assuming Apify Account was already created, you can find the password on the [Proxy page](https://console.apify.com/proxy) in the Apify Console. The password is automatically inferred using the `APIFY_TOKEN` env var, so in most cases, you don't need to touch it. You should use it when, for some reason, you need access to Apify Proxy, but not access to Apify API, or when you need access to proxy from a different account than your token represents. ## Proxy management In addition to your own proxy servers and proxy servers acquired from third-party providers used together with Crawlee, you can also rely on [Apify Proxy](https://apify.com/proxy) for your scraping needs. ### Apify proxy If you are already subscribed to Apify Proxy, you can start using them immediately in only a few lines of code (for local usage you should first be [logged in](#logging-into-apify-platform-from-crawlee) to your Apify account). 
{ProxyExample} Note that unlike using your own proxies in Crawlee, you shouldn't use the constructor to create `ProxyConfiguration` instances. For using the Apify Proxy you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function instead. ### Advanced Apify proxy configuration With Apify Proxy, you can select specific proxy groups to use, or countries to connect from. This allows you to get better proxy performance after some initial research. {ProxyAdvancedExample} Now your crawlers will use only Residential proxies from the US. Note that you must first get access to a proxy group before you are able to use it. You can check proxy groups available to you in the [proxy dashboard](https://console.apify.com/proxy). ### Apify proxy vs. own proxies The [`ProxyConfiguration`](https://docs.apify.com/sdk/python/reference/class/ProxyConfiguration) class covers both Apify Proxy and custom proxy URLs so that you can easily switch between proxy providers. However, some features of the class are available only to Apify Proxy users, mainly because Apify Proxy is what one would call a super-proxy. It's not a single proxy server, but an API endpoint that allows connection through millions of different IP addresses. So the class essentially has two modes: Apify Proxy or Own (third party) proxy. The difference is easy to remember. - If you're using your own proxies - you should create a `ProxyConfiguration` instance directly. - If you are planning to use Apify Proxy - you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function. The `new_url_function` parameter enables the use of your custom proxy URLs, whereas all the other options are there to configure Apify Proxy. 
**Related links** - [Apify Proxy docs](https://docs.apify.com/proxy) ================================================ FILE: docs/deployment/aws_lambda.mdx ================================================ --- id: aws-lambda title: Deploy on AWS Lambda description: Prepare your crawler to run on AWS Lambda. --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import BeautifulSoupCrawlerLambda from '!!raw-loader!./code_examples/aws/beautifulsoup_crawler_lambda.py'; import PlaywrightCrawlerLambda from '!!raw-loader!./code_examples/aws/playwright_crawler_lambda.py'; import PlaywrightCrawlerDockerfile from '!!raw-loader!./code_examples/aws/playwright_dockerfile'; [AWS Lambda](https://docs.aws.amazon.com/lambda/latest/dg/welcome.html) is a serverless compute service that lets you run code without provisioning or managing servers. This guide covers deploying `BeautifulSoupCrawler` and `PlaywrightCrawler`. The code examples are based on the [BeautifulSoupCrawler example](../examples/beautifulsoup-crawler). ## BeautifulSoupCrawler on AWS Lambda For simple crawlers that don't require browser rendering, you can deploy using a ZIP archive. ### Updating the code When instantiating a crawler, use `MemoryStorageClient`. By default, Crawlee uses file-based storage, but the Lambda filesystem is read-only (except for `/tmp`). Using `MemoryStorageClient` tells Crawlee to use in-memory storage instead. Wrap the crawler logic in a `lambda_handler` function. This is the entry point that AWS will execute. :::important Make sure to always instantiate a new crawler for every Lambda invocation. AWS keeps the environment running for some time after the first execution (to reduce cold-start times), so subsequent calls may access an already-used crawler instance. **TL;DR: Keep your Lambda stateless.** ::: Finally, return the scraped data from the Lambda when the crawler run ends. 
{BeautifulSoupCrawlerLambda} ### Preparing the environment Lambda requires all dependencies to be included in the deployment package. Create a virtual environment and install dependencies: ```bash python3.14 -m venv .venv source .venv/bin/activate pip install 'crawlee[beautifulsoup]' 'boto3' 'aws-lambda-powertools' ``` [`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) is the AWS SDK for Python. Including it in your dependencies is recommended to avoid version misalignment issues with the Lambda runtime. ### Creating the ZIP archive Create a ZIP archive from your project, including dependencies from the virtual environment: ```bash cd .venv/lib/python3.14/site-packages zip -r ../../../../package.zip . cd ../../../../ zip package.zip lambda_function.py ``` :::note Large dependencies? AWS has a limit of 50 MB for direct upload and 250 MB for unzipped deployment package size. A better way to manage dependencies is by using Lambda Layers. With Layers, you can share files between multiple Lambda functions and keep the actual code as slim as possible. To create a Lambda Layer: 1. Create a `python/` folder and copy dependencies from `site-packages` into it 2. Create a zip archive: `zip -r layer.zip python/` 3. Create a new Lambda Layer from the archive (you may need to upload it to S3 first) 4. Attach the Layer to your Lambda function ::: ### Creating the Lambda function Create the Lambda function in the AWS Lambda Console: 1. Navigate to `Lambda` in [AWS Management Console](https://aws.amazon.com/console/). 2. Click **Create function**. 3. Select **Author from scratch**. 4. Enter a **Function name**, for example `BeautifulSoupTest`. 5. Choose a **Python runtime** that matches the version used in your virtual environment (for example, Python 3.14). 6. Click **Create function** to finish. Once created, upload `package.zip` as the code source in the AWS Lambda Console using the "Upload from" button. In Lambda Runtime Settings, set the handler. 
Since the file is named `lambda_function.py` and the function is `lambda_handler`, you can use the default value `lambda_function.lambda_handler`. :::tip Configuration In the Configuration tab, you can adjust: - **Memory**: Memory size can greatly affect execution speed. A minimum of 256-512 MB is recommended. - **Timeout**: Set according to the size of the website you are scraping (1 minute for the example code). - **Ephemeral storage**: Size of the `/tmp` directory. See the [official documentation](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) to learn how performance and cost scale with memory. ::: After the Lambda deploys, you can test it by clicking the "Test" button. The event contents don't matter for a basic test, but you can parameterize your crawler by parsing the event object that AWS passes as the first argument to the handler. ## PlaywrightCrawler on AWS Lambda For crawlers that require browser rendering, you need to deploy using Docker container images because Playwright and browser binaries exceed Lambda's ZIP deployment size limits. ### Updating the code As with `BeautifulSoupCrawler`, use `MemoryStorageClient` and wrap the logic in a `lambda_handler` function. Additionally, configure `browser_launch_options` with flags optimized for serverless environments. These flags disable sandboxing and GPU features that aren't available in Lambda's containerized runtime. {PlaywrightCrawlerLambda} ### Installing and configuring AWS CLI Install AWS CLI following the [official documentation](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) according to your operating system. Authenticate by running: ```bash aws login ``` ### Preparing the project Initialize the project by running `uvx 'crawlee[cli]' create`. 
Or use a single command if you don't need interactive mode: ```bash uvx 'crawlee[cli]' create aws_playwright --crawler-type playwright --http-client impit --package-manager uv --no-apify --start-url 'https://crawlee.dev' --install ``` Add the following dependencies: ```bash uv add awslambdaric aws-lambda-powertools boto3 ``` [`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) is the AWS SDK for Python. Use it if your function integrates with any other AWS services. The project is created with a Dockerfile that needs to be modified for AWS Lambda by adding `ENTRYPOINT` and updating `CMD`: {PlaywrightCrawlerDockerfile} ### Building and pushing the Docker image Create a repository `lambda/aws-playwright` in [Amazon Elastic Container Registry](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html) in the same region where your Lambda functions will run. To learn more, refer to the [official documentation](https://docs.aws.amazon.com/AmazonECR/latest/userguide/getting-started-cli.html). Navigate to the created repository and click the "View push commands" button. This will open a window with console commands for uploading the Docker image to your repository. Execute them. Example: ```bash aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin {user-specific-data} docker build --platform linux/amd64 --provenance=false -t lambda/aws-playwright . docker tag lambda/aws-playwright:latest {user-specific-data}/lambda/aws-playwright:latest docker push {user-specific-data}/lambda/aws-playwright:latest ``` ### Creating the Lambda function 1. Navigate to `Lambda` in [AWS Management Console](https://aws.amazon.com/console/). 2. Click **Create function**. 3. Select **Container image**. 4. Browse and select your ECR image. 5. Click **Create function** to finish. :::tip Configuration In the Configuration tab, you can adjust resources. 
Playwright crawlers require more resources than BeautifulSoup crawlers: - **Memory**: Minimum 1024 MB recommended. Browser operations are memory-intensive, so 2048 MB or more may be needed for complex pages. - **Timeout**: Set according to crawl size. Browser startup adds overhead, so allow at least 5 minutes even for simple crawls. - **Ephemeral storage**: Default 512 MB is usually sufficient unless downloading large files. See the [official documentation](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) to learn how performance and cost scale with memory. ::: After the Lambda deploys, click the "Test" button to invoke it. The event contents don't matter for a basic test, but you can parameterize your crawler by parsing the event object that AWS passes as the first argument to the handler. ================================================ FILE: docs/deployment/code_examples/apify/crawler_as_actor_example.py ================================================ import asyncio from apify import Actor from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: # Wrap the crawler code in an Actor context manager. 
async with Actor: crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, } await context.push_data(data) await context.enqueue_links() await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/deployment/code_examples/apify/get_public_url.py ================================================ import asyncio from apify import Actor async def main() -> None: async with Actor: store = await Actor.open_key_value_store() await store.set_value('your-file', {'foo': 'bar'}) url = store.get_public_url('your-file') Actor.log.info(f'KVS public URL: {url}') # https://api.apify.com/v2/key-value-stores//records/your-file if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/deployment/code_examples/apify/log_with_config_example.py ================================================ import asyncio from apify import Actor, Configuration async def main() -> None: # Create a new configuration with your API key. You can find it at # https://console.apify.com/settings/integrations. It can be provided either # as a parameter "token" or as an environment variable "APIFY_TOKEN". 
config = Configuration( token='apify_api_YOUR_TOKEN', ) async with Actor(config): Actor.log.info('Hello from Apify platform!') if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/deployment/code_examples/apify/proxy_advanced_example.py ================================================ import asyncio from apify import Actor async def main() -> None: async with Actor: proxy_configuration = await Actor.create_proxy_configuration( password='apify_proxy_YOUR_PASSWORD', # Specify the proxy group to use. groups=['RESIDENTIAL'], # Set the country code for the proxy. country_code='US', ) # ... if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/deployment/code_examples/apify/proxy_example.py ================================================ import asyncio from apify import Actor async def main() -> None: async with Actor: # Create a new Apify Proxy configuration. The password can be found at # https://console.apify.com/proxy/http-settings and should be provided either # as a parameter "password" or as an environment variable "APIFY_PROXY_PASSWORD". 
proxy_configuration = await Actor.create_proxy_configuration( password='apify_proxy_YOUR_PASSWORD', ) if not proxy_configuration: Actor.log.warning('Failed to create proxy configuration.') return proxy_url = await proxy_configuration.new_url() Actor.log.info(f'Proxy URL: {proxy_url}') if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/deployment/code_examples/aws/beautifulsoup_crawler_lambda.py ================================================ import asyncio import json from datetime import timedelta from typing import Any from aws_lambda_powertools.utilities.typing import LambdaContext from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from crawlee.storage_clients import MemoryStorageClient from crawlee.storages import Dataset, RequestQueue async def main() -> str: # highlight-start # Disable writing storage data to the file system storage_client = MemoryStorageClient() # highlight-end # Initialize storages dataset = await Dataset.open(storage_client=storage_client) request_queue = await RequestQueue.open(storage_client=storage_client) crawler = BeautifulSoupCrawler( storage_client=storage_client, max_request_retries=1, request_handler_timeout=timedelta(seconds=30), max_requests_per_crawl=10, ) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, 'h1s': [h1.text for h1 in context.soup.find_all('h1')], 'h2s': [h2.text for h2 in context.soup.find_all('h2')], 'h3s': [h3.text for h3 in context.soup.find_all('h3')], } await context.push_data(data) await context.enqueue_links() await crawler.run(['https://crawlee.dev']) # Extract data saved in `Dataset` data = await crawler.get_data() # Clean up storages after the crawl await dataset.drop() await request_queue.drop() # 
Serialize the list of scraped items to JSON string return json.dumps(data.items) def lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> dict[str, Any]: result = asyncio.run(main()) # Return the response with results return {'statusCode': 200, 'body': result} ================================================ FILE: docs/deployment/code_examples/aws/playwright_crawler_lambda.py ================================================ import asyncio import json from datetime import timedelta from typing import Any from aws_lambda_powertools.utilities.typing import LambdaContext from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.storage_clients import MemoryStorageClient from crawlee.storages import Dataset, RequestQueue async def main() -> str: # highlight-start # Disable writing storage data to the file system storage_client = MemoryStorageClient() # highlight-end # Initialize storages dataset = await Dataset.open(storage_client=storage_client) request_queue = await RequestQueue.open(storage_client=storage_client) crawler = PlaywrightCrawler( storage_client=storage_client, max_request_retries=1, request_handler_timeout=timedelta(seconds=30), max_requests_per_crawl=10, # highlight-start # Configure Playwright to run in AWS Lambda environment browser_launch_options={ 'args': [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--single-process', ] }, # highlight-end ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') data = { 'url': context.request.url, 'title': await context.page.title(), 'h1s': await context.page.locator('h1').all_text_contents(), 'h2s': await context.page.locator('h2').all_text_contents(), 'h3s': await context.page.locator('h3').all_text_contents(), } await context.push_data(data) await context.enqueue_links() await crawler.run(['https://crawlee.dev']) # Extract 
data saved in `Dataset` data = await crawler.get_data() # Clean up storages after the crawl await dataset.drop() await request_queue.drop() # Serialize the list of scraped items to JSON string return json.dumps(data.items) def lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> dict[str, Any]: result = asyncio.run(main()) # Return the response with results return {'statusCode': 200, 'body': result} ================================================ FILE: docs/deployment/code_examples/aws/playwright_dockerfile ================================================ FROM apify/actor-python-playwright:3.14 RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/* RUN pip install -U pip setuptools \ && pip install 'uv<1' ENV UV_PROJECT_ENVIRONMENT="/usr/local" COPY pyproject.toml uv.lock ./ RUN echo "Python version:" \ && python --version \ && echo "Installing dependencies:" \ && PLAYWRIGHT_INSTALLED=$(pip freeze | grep -q playwright && echo "true" || echo "false") \ && if [ "$PLAYWRIGHT_INSTALLED" = "true" ]; then \ echo "Playwright already installed, excluding from uv sync" \ && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact --no-install-package playwright; \ else \ echo "Playwright not found, installing all dependencies" \ && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact; \ fi \ && echo "All installed Python packages:" \ && pip freeze COPY . ./ RUN python -m compileall -q . 
# highlight-start # AWS Lambda entrypoint ENTRYPOINT [ "/usr/local/bin/python3", "-m", "awslambdaric" ] # Lambda handler function CMD [ "aws_playwright.main.lambda_handler" ] # highlight-end ================================================ FILE: docs/deployment/code_examples/google/cloud_run_example.py ================================================ import json import os import uvicorn from litestar import Litestar, get from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.storage_clients import MemoryStorageClient @get('/') async def main() -> str: """The crawler entry point that will be called when the HTTP endpoint is accessed.""" # highlight-start # Disable writing storage data to the file system storage_client = MemoryStorageClient() # highlight-end crawler = PlaywrightCrawler( headless=True, max_requests_per_crawl=10, browser_type='firefox', storage_client=storage_client, ) @crawler.router.default_handler async def default_handler(context: PlaywrightCrawlingContext) -> None: """Default request handler that processes each page during crawling.""" context.log.info(f'Processing {context.request.url} ...') title = await context.page.query_selector('title') await context.push_data( { 'url': context.request.loaded_url, 'title': await title.inner_text() if title else None, } ) await context.enqueue_links() await crawler.run(['https://crawlee.dev']) data = await crawler.get_data() # Return the results as JSON to the client return json.dumps(data.items) # Initialize the Litestar app with our route handler app = Litestar(route_handlers=[main]) # Start the Uvicorn server using the `PORT` environment variable provided by GCP # This is crucial - Cloud Run expects your app to listen on this specific port uvicorn.run(app, host='0.0.0.0', port=int(os.environ.get('PORT', '8080'))) # noqa: S104 # Use all interfaces in a container, safely ================================================ FILE: 
docs/deployment/code_examples/google/google_example.py ================================================ import asyncio import json from datetime import timedelta import functions_framework from flask import Request, Response from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from crawlee.storage_clients import MemoryStorageClient async def main() -> str: # highlight-start # Disable writing storage data to the file system storage_client = MemoryStorageClient() # highlight-end crawler = BeautifulSoupCrawler( storage_client=storage_client, max_request_retries=1, request_handler_timeout=timedelta(seconds=30), max_requests_per_crawl=10, ) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, 'h1s': [h1.text for h1 in context.soup.find_all('h1')], 'h2s': [h2.text for h2 in context.soup.find_all('h2')], 'h3s': [h3.text for h3 in context.soup.find_all('h3')], } await context.push_data(data) await context.enqueue_links() await crawler.run(['https://crawlee.dev']) # highlight-start # Extract data saved in `Dataset` data = await crawler.get_data() # Serialize to json string and return return json.dumps(data.items) # highlight-end @functions_framework.http def crawlee_run(request: Request) -> Response: # You can pass data to your crawler using `request` function_id = request.headers['Function-Execution-Id'] response_str = asyncio.run(main()) # Return a response with the crawling results return Response(response=response_str, status=200) ================================================ FILE: docs/deployment/google_cloud.mdx ================================================ --- id: gcp-cloud-run-functions title: Cloud Run functions description: Prepare your crawler to run in Cloud Run functions on Google Cloud Platform. 
--- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import GoogleFunctions from '!!raw-loader!./code_examples/google/google_example.py'; [Google Cloud Run Functions](https://cloud.google.com/functions) is a serverless execution environment for running simple HTTP-based web scrapers. This service is best suited for lightweight crawlers that don't require browser rendering capabilities and can be executed via HTTP requests. ## Updating the project For the project foundation, use BeautifulSoupCrawler as described in this [example](../examples/beautifulsoup-crawler). Add [`functions-framework`](https://pypi.org/project/functions-framework/) to your dependencies file `requirements.txt`. If you're using a project manager like `poetry` or `uv`, export your dependencies to `requirements.txt`. Update the project code to make it compatible with Cloud Functions and return data in JSON format. Also add an entry point that Cloud Functions will use to run the project. {GoogleFunctions.replace(/^.*?\n/, '')} You can test your project locally. Start the server by running: ```bash functions-framework --target=crawlee_run ``` Then make a GET request to `http://127.0.0.1:8080/`, for example in your browser. ## Deploying to Google Cloud Platform In the Google Cloud dashboard, create a new function, allocate memory and CPUs to it, set region and function timeout. When deploying, select **"Use an inline editor to create a function"**. This allows you to configure the project using only the Google Cloud Console dashboard. Using the `inline editor`, update the function files according to your project. **Make sure** to update the `requirements.txt` file to match your project's dependencies. Also, make sure to set the **Function entry point** to the name of the function decorated with `@functions_framework.http`, which in our case is `crawlee_run`. After the Function deploys, you can test it by clicking the "Test" button. 
This button opens a popup with a `curl` script that calls your new Cloud Function. To avoid having to install the `gcloud` CLI application locally, you can also run this script in the Cloud Shell by clicking the link above the code block. ================================================ FILE: docs/deployment/google_cloud_run.mdx ================================================ --- id: gcp-cloud-run title: Cloud Run description: Prepare your crawler to run in Cloud Run on Google Cloud Platform. --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import GoogleCloudRun from '!!raw-loader!./code_examples/google/cloud_run_example.py'; [Google Cloud Run](https://cloud.google.com/run) is a container-based serverless platform that allows you to run web crawlers with headless browsers. This service is recommended when your Crawlee applications need browser rendering capabilities, require more granular control, or have complex dependencies that aren't supported by [Cloud Functions](./gcp-cloud-run-functions). GCP Cloud Run allows you to deploy using Docker containers, giving you full control over your environment and the flexibility to use any web server framework of your choice, unlike Cloud Functions which are limited to [Flask](https://flask.palletsprojects.com/en/stable/). ## Preparing the project We'll prepare our project using [Litestar](https://litestar.dev/) and the [Uvicorn](https://www.uvicorn.org/) web server. The HTTP server handler will wrap the crawler to communicate with clients. Because the Cloud Run platform sees only an opaque Docker container, we have to take care of this bit ourselves. :::info GCP passes you an environment variable called `PORT` - your HTTP server is expected to be listening on this port (GCP exposes this one to the outer world). 
::: {GoogleCloudRun.replace(/^.*?\n/, '')} :::tip Always make sure to keep all the logic in the request handler - as with other FaaS services, your request handlers have to be **stateless.** ::: ## Deploying to Google Cloud Platform Now, we’re ready to deploy! If you have initialized your project using `uvx crawlee create`, the initialization script has prepared a Dockerfile for you. All you have to do now is run `gcloud run deploy` in your project folder (the one with your Dockerfile in it). The gcloud CLI application will ask you a few questions, such as what region you want to deploy your application in, or whether you want to make your application public or private. After answering those questions, you should be able to see your application in the GCP dashboard and run it using the link you find there. :::tip In case your first execution of your newly created Cloud Run fails, try editing the Run configuration - mainly setting the available memory to 1GiB or more and updating the request timeout according to the size of the website you are scraping. ::: ================================================ FILE: docs/examples/add_data_to_dataset.mdx ================================================ --- id: add-data-to-dataset title: Add data to dataset --- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_bs.py'; import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_pw.py'; import DatasetExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_dataset.py'; This example demonstrates how to store extracted data into datasets using the `context.push_data` helper function. If the specified dataset does not already exist, it will be created automatically. 
Additionally, you can save data to custom datasets by providing `dataset_id` or `dataset_name` parameters to the `push_data` function. {BeautifulSoupExample} {PlaywrightExample} Each item in the dataset will be stored in its own file within the following directory: ```text {PROJECT_FOLDER}/storage/datasets/default/ ``` For more control, you can also open a dataset manually using the asynchronous constructor `Dataset.open` {DatasetExample} ================================================ FILE: docs/examples/beautifulsoup_crawler.mdx ================================================ --- id: beautifulsoup-crawler title: BeautifulSoup crawler --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler.py'; This example demonstrates how to use `BeautifulSoupCrawler` to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `

<h1>`, `<h2>` and `<h3>

` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code. It also shows how you can add an optional pre-navigation hook to the crawler. Pre-navigation hooks are user-defined functions that execute before sending the request. {BeautifulSoupExample} ================================================ FILE: docs/examples/capture_screenshot_using_playwright.mdx ================================================ --- id: capture-screenshots-using-playwright title: Capture screenshots using Playwright --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import CaptureScreenshotExample from '!!raw-loader!roa-loader!./code_examples/capture_screenshot_using_playwright.py'; This example demonstrates how to capture screenshots of web pages using `PlaywrightCrawler` and store them in the key-value store. The `PlaywrightCrawler` is configured to automate the browsing and interaction with web pages. It uses headless Chromium as the browser type to perform these tasks. Each web page specified in the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method. The captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page. {CaptureScreenshotExample} ================================================ FILE: docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx ================================================ --- id: capturing-page-snapshots-with-error-snapshotter title: Capturing page snapshots with ErrorSnapshotter description: How to capture page snapshots on errors. 
--- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ApiLink from '@site/src/components/ApiLink'; import ParselCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_with_error_snapshotter.py'; import PlaywrightCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_error_snapshotter.py'; This example demonstrates how to capture page snapshots on the first occurrence of each unique error. The capturing happens automatically if you set `save_error_snapshots=True` in the crawler's `Statistics`. The error snapshot can contain an `html` file and a `jpeg` file that are created from the page where the unhandled exception was raised. Captured error snapshot files are saved to the default key-value store. Both `PlaywrightCrawler` and [HTTP crawlers](../guides/http-crawlers) are capable of capturing the html file, but only `PlaywrightCrawler` is able to capture a page screenshot as well. { ParselCrawlerWithErrorSnapshotter } { PlaywrightCrawlerWithErrorSnapshotter } ================================================ FILE: docs/examples/code_examples/adaptive_playwright_crawler.py ================================================ import asyncio from datetime import timedelta from playwright.async_api import Route from crawlee.crawlers import ( AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) async def main() -> None: # Crawler created by following factory method will use `beautifulsoup` # for parsing static content. crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_requests_per_crawl=10, # Limit the max requests per crawl. 
playwright_crawler_specific_kwargs={'headless': False}, ) @crawler.router.default_handler async def request_handler_for_label( context: AdaptivePlaywrightCrawlingContext, ) -> None: # Do some processing using `parsed_content` context.log.info(context.parsed_content.title) # Locate element h2 within 5 seconds h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) # Do stuff with element found by the selector context.log.info(h2) # Find more links and enqueue them. await context.enqueue_links() # Save some data. await context.push_data({'Visited url': context.request.url}) @crawler.pre_navigation_hook async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: """Hook executed both in static sub crawler and playwright sub crawler. Trying to access `context.page` in this hook would raise `AdaptiveContextError` for pages crawled without playwright.""" context.log.info(f'pre navigation hook for: {context.request.url} ...') @crawler.pre_navigation_hook(playwright_only=True) async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: """Hook executed only in playwright sub crawler. It is safe to access `page` object. """ async def some_routing_function(route: Route) -> None: await route.continue_() await context.page.route('*/**', some_routing_function) context.log.info( f'Playwright only pre navigation hook for: {context.request.url} ...' ) # Run the crawler with the initial list of URLs. await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/add_data_to_dataset_bs.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler() # Define the default request handler, which will be called for every request. 
@crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, 'html': str(context.soup)[:1000], } # Push the extracted data to the default dataset. await context.push_data(data) # Run the crawler with the initial list of requests. await crawler.run( [ 'https://crawlee.dev', 'https://apify.com', 'https://example.com', ] ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/add_data_to_dataset_dataset.py ================================================ import asyncio from crawlee.storages import Dataset async def main() -> None: # Open dataset manually using asynchronous constructor open(). dataset = await Dataset.open() # Interact with dataset directly. await dataset.push_data({'key': 'value'}) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/add_data_to_dataset_pw.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'title': await context.page.title(), 'html': str(await context.page.content())[:1000], } # Push the extracted data to the default dataset. await context.push_data(data) # Run the crawler with the initial list of requests. 
await crawler.run( [ 'https://crawlee.dev', 'https://apify.com', 'https://example.com', ] ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/beautifulsoup_crawler.py ================================================ import asyncio from datetime import timedelta from crawlee.crawlers import ( BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext, ) async def main() -> None: # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically # loads the URLs and parses their HTML using the BeautifulSoup library. crawler = BeautifulSoupCrawler( # On error, retry each page at most once. max_request_retries=1, # Increase the timeout for processing each page to 30 seconds. request_handler_timeout=timedelta(seconds=30), # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. # The handler receives a context parameter, providing various properties and # helper methods. Here are a few key ones we use for demonstration: # - request: an instance of the Request class containing details such as the URL # being crawled and the HTTP method used. # - soup: the BeautifulSoup object containing the parsed HTML of the response. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, 'h1s': [h1.text for h1 in context.soup.find_all('h1')], 'h2s': [h2.text for h2 in context.soup.find_all('h2')], 'h3s': [h3.text for h3 in context.soup.find_all('h3')], } # Push the extracted data to the default dataset. In local configuration, # the data will be stored as JSON files in ./storage/datasets/default. 
await context.push_data(data) # Register pre navigation hook which will be called before each request. # This hook is optional and does not need to be defined at all. @crawler.pre_navigation_hook async def some_hook(context: BasicCrawlingContext) -> None: pass # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/beautifulsoup_crawler_keep_alive.py ================================================ import asyncio from crawlee._types import BasicCrawlingContext from crawlee.crawlers import BeautifulSoupCrawler async def main() -> None: crawler = BeautifulSoupCrawler( # Keep the crawler alive even when there are no requests to be processed now. keep_alive=True, ) def stop_crawler_if_url_visited(context: BasicCrawlingContext) -> None: """Stop crawler once specific url is visited. Example of guard condition to stop the crawler.""" if context.request.url == 'https://crawlee.dev/docs/examples': crawler.stop( 'Stop crawler that was in keep_alive state after specific url was visite' ) else: context.log.info('keep_alive=True, waiting for more requests to come.') async def add_request_later(url: str, after_s: int) -> None: """Add requests to the queue after some time. Can be done by external code.""" # Just an example of request being added to the crawler later, # when it is waiting due to `keep_alive=True`. await asyncio.sleep(after_s) await crawler.add_requests([url]) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BasicCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Stop crawler if some guard condition has been met. 
stop_crawler_if_url_visited(context) # Start some tasks that will add some requests later to simulate real situation, # where requests are added later by external code. add_request_later_task1 = asyncio.create_task( add_request_later(url='https://crawlee.dev', after_s=1) ) add_request_later_task2 = asyncio.create_task( add_request_later(url='https://crawlee.dev/docs/examples', after_s=5) ) # Run the crawler without the initial list of requests. # Wait for more requests to be added to the queue later due to `keep_alive=True`. await crawler.run() await asyncio.gather(add_request_later_task1, add_request_later_task2) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/beautifulsoup_crawler_stop.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically # loads the URLs and parses their HTML using the BeautifulSoup library. crawler = BeautifulSoupCrawler() # Define the default request handler, which will be called for every request. # The handler receives a context parameter, providing various properties and # helper methods. Here are a few key ones we use for demonstration: # - request: an instance of the Request class containing details such as the URL # being crawled and the HTTP method used. # - soup: the BeautifulSoup object containing the parsed HTML of the response. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Create custom condition to stop crawler once it finds what it is looking for. if 'crawlee' in context.request.url: crawler.stop( reason='Manual stop of crawler after finding `crawlee` in the url.' ) # Extract data from the page. 
data = { 'url': context.request.url, } # Push the extracted data to the default dataset. In local configuration, # the data will be stored as JSON files in ./storage/datasets/default. await context.push_data(data) # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/capture_screenshot_using_playwright.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.storages import KeyValueStore async def main() -> None: crawler = PlaywrightCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, # Headless mode, set to False to see the browser in action. headless=False, # Browser types supported by Playwright. browser_type='chromium', ) # Open the default key-value store. kvs = await KeyValueStore.open() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Capture the screenshot of the page using Playwright's API. screenshot = await context.page.screenshot() name = context.request.url.split('/')[-1] # Store the screenshot in the key-value store. await kvs.set_value( key=f'screenshot-{name}', value=screenshot, content_type='image/png', ) # Run the crawler with the initial list of URLs. 
await crawler.run( [ 'https://crawlee.dev', 'https://apify.com', 'https://example.com', ] ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/configure_json_logging.py ================================================ from __future__ import annotations import asyncio import inspect import logging import sys from typing import TYPE_CHECKING from loguru import logger from crawlee.crawlers import HttpCrawler, HttpCrawlingContext if TYPE_CHECKING: from loguru import Record # Configure loguru interceptor to capture standard logging output class InterceptHandler(logging.Handler): def emit(self, record: logging.LogRecord) -> None: # Get corresponding Loguru level if it exists try: level: str | int = logger.level(record.levelname).name except ValueError: level = record.levelno # Find caller from where originated the logged message frame, depth = inspect.currentframe(), 0 while frame: filename = frame.f_code.co_filename is_logging = filename == logging.__file__ is_frozen = 'importlib' in filename and '_bootstrap' in filename if depth > 0 and not (is_logging | is_frozen): break frame = frame.f_back depth += 1 dummy_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None) standard_attrs = set(dummy_record.__dict__.keys()) extra_dict = { key: value for key, value in record.__dict__.items() if key not in standard_attrs } ( logger.bind(**extra_dict) .opt(depth=depth, exception=record.exc_info) .patch(lambda loguru_record: loguru_record.update({'name': record.name})) .log(level, record.getMessage()) ) # Configure loguru formatter def formatter(record: Record) -> str: basic_format = '[{name}] | {level: ^8} | - {message}' if record['extra']: basic_format = basic_format + ' {extra}' return f'{basic_format}\n' # Remove default loguru logger logger.remove() # Set up loguru with JSONL serialization in file `crawler.log` logger.add('crawler.log', format=formatter, serialize=True, level='INFO') 
# Set up loguru logger for console logger.add(sys.stderr, format=formatter, colorize=True, level='INFO') # Configure standard logging to use our interceptor logging.basicConfig(handlers=[InterceptHandler()], level=logging.INFO, force=True) async def main() -> None: # Initialize crawler with disabled table logs crawler = HttpCrawler( configure_logging=False, # Disable default logging configuration statistics_log_format='inline', # Set inline formatting for statistics logs ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Run the crawler await crawler.run(['https://www.crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/crawl_all_links_on_website_bs.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Enqueue all links found on the page. await context.enqueue_links() # Run the crawler with the initial list of requests. 
await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/crawl_all_links_on_website_pw.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Enqueue all links found on the page. await context.enqueue_links() # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/crawl_multiple_urls_bs.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Run the crawler with the initial list of requests. 
await crawler.run( [ 'https://crawlee.dev', 'https://apify.com', 'https://example.com', ] ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/crawl_multiple_urls_pw.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Run the crawler with the initial list of requests. await crawler.run( [ 'https://crawlee.dev', 'https://apify.com', 'https://example.com', ] ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/crawl_specific_links_on_website_bs.py ================================================ import asyncio from crawlee import Glob from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Enqueue all the documentation links found on the page, except for the examples. await context.enqueue_links( include=[Glob('https://crawlee.dev/docs/**')], exclude=[Glob('https://crawlee.dev/docs/examples')], ) # Run the crawler with the initial list of requests. 
await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/crawl_specific_links_on_website_pw.py ================================================ import asyncio from crawlee import Glob from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Enqueue all the documentation links found on the page, except for the examples. await context.enqueue_links( include=[Glob('https://crawlee.dev/docs/**')], exclude=[Glob('https://crawlee.dev/docs/examples')], ) # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/crawl_website_with_relative_links_all_links.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Enqueue all links found on the page. Any URLs found will be matched by # this strategy, even if they go off the site you are currently crawling. 
await context.enqueue_links(strategy='all') # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/crawl_website_with_relative_links_same_domain.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Setting the strategy to same domain will enqueue all links found that are on # the same domain name (including any subdomains) as request.loaded_url or # request.url. await context.enqueue_links(strategy='same-domain') # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/crawl_website_with_relative_links_same_hostname.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request.
@crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Setting the strategy to same hostname will enqueue only the links found that are # on exactly the same hostname (other subdomains are not matched) as # request.loaded_url or request.url. await context.enqueue_links(strategy='same-hostname') # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/crawl_website_with_relative_links_same_origin.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Setting the strategy to same origin will enqueue all links found that are on # the same origin (scheme, hostname and port) as request.loaded_url or # request.url. await context.enqueue_links(strategy='same-origin') # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/export_entire_dataset_to_file_csv.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links.
max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, } # Enqueue all links found on the page. await context.enqueue_links() # Push the extracted data to the default dataset. await context.push_data(data) # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev']) # Export the entire dataset to a CSV file. # Use semicolon as delimiter and always quote strings. await crawler.export_data(path='results.csv', delimiter=';', quoting='all') if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/export_entire_dataset_to_file_json.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, } # Enqueue all links found on the page. await context.enqueue_links() # Push the extracted data to the default dataset. await context.push_data(data) # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev']) # Export the entire dataset to a JSON file. 
# Set ensure_ascii=False to allow Unicode characters in the output. await crawler.export_data(path='results.json', ensure_ascii=False) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/extract_and_add_specific_links_on_website_bs.py ================================================ import asyncio from crawlee import Glob from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract all the documentation links found on the page, except for the examples. extracted_links = await context.extract_links( include=[Glob('https://crawlee.dev/docs/**')], exclude=[Glob('https://crawlee.dev/docs/examples')], ) # Some very custom filtering which can't be achieved by `extract_links` arguments. max_link_length = 30 filtered_links = [ link for link in extracted_links if len(link.url) < max_link_length ] # Add filtered links to the request queue. await context.add_requests(filtered_links) # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/extract_and_add_specific_links_on_website_pw.py ================================================ import asyncio from crawlee import Glob from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. 
max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract all the documentation links found on the page, except for the examples. extracted_links = await context.extract_links( include=[Glob('https://crawlee.dev/docs/**')], exclude=[Glob('https://crawlee.dev/docs/examples')], ) # Some very custom filtering which can't be achieved by `extract_links` arguments. max_link_length = 30 filtered_links = [ link for link in extracted_links if len(link.url) < max_link_length ] # Add filtered links to the request queue. await context.add_requests(filtered_links) # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/fill_and_submit_web_form_crawler.py ================================================ import asyncio from urllib.parse import urlencode from crawlee import Request from crawlee.crawlers import HttpCrawler, HttpCrawlingContext async def main() -> None: crawler = HttpCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') response = (await context.http_response.read()).decode('utf-8') context.log.info(f'Response: {response}') # To see the response in the logs. # Prepare a POST request to the form endpoint. 
request = Request.from_url( url='https://httpbin.org/post', method='POST', headers={'content-type': 'application/x-www-form-urlencoded'}, payload=urlencode( { 'custname': 'John Doe', 'custtel': '1234567890', 'custemail': 'johndoe@example.com', 'size': 'large', 'topping': ['bacon', 'cheese', 'mushroom'], 'delivery': '13:00', 'comments': 'Please ring the doorbell upon arrival.', } ).encode(), ) # Run the crawler with the initial list of requests. await crawler.run([request]) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/fill_and_submit_web_form_request.py ================================================ import asyncio from urllib.parse import urlencode from crawlee import Request async def main() -> None: # Prepare a POST request to the form endpoint. request = Request.from_url( url='https://httpbin.org/post', method='POST', headers={'content-type': 'application/x-www-form-urlencoded'}, payload=urlencode( { 'custname': 'John Doe', 'custtel': '1234567890', 'custemail': 'johndoe@example.com', 'size': 'large', 'topping': ['bacon', 'cheese', 'mushroom'], 'delivery': '13:00', 'comments': 'Please ring the doorbell upon arrival.', } ).encode(), ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/parsel_crawler.py ================================================ import asyncio from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext # Regex for identifying email addresses on a webpage. EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' async def main() -> None: crawler = ParselCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. 
@crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'title': context.selector.xpath('//title/text()').get(), 'email_address_list': context.selector.re(EMAIL_REGEX), } # Push the extracted data to the default dataset. await context.push_data(data) # Enqueue all links found on the page. await context.enqueue_links() # Register pre navigation hook which will be called before each request. # This hook is optional and does not need to be defined at all. @crawler.pre_navigation_hook async def some_hook(context: BasicCrawlingContext) -> None: pass # Run the crawler with the initial list of URLs. await crawler.run(['https://github.com']) # Export the entire dataset to a JSON file. await crawler.export_data(path='results.json') if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py ================================================ import asyncio from random import choice from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.statistics import Statistics async def main() -> None: crawler = ParselCrawler( statistics=Statistics.with_default_state(save_error_snapshots=True) ) @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Simulate various errors to demonstrate `ErrorSnapshotter` # saving only the first occurrence of unique error. 
await context.enqueue_links() random_number = choice(range(10)) if random_number == 1: raise KeyError('Some KeyError') if random_number == 2: raise ValueError('Some ValueError') if random_number == 3: raise RuntimeError('Some RuntimeError') await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/playwright_block_requests.py ================================================ import asyncio from crawlee.crawlers import ( PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext, ) async def main() -> None: crawler = PlaywrightCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') await context.enqueue_links() # Define the hook, which will be called before every request. @crawler.pre_navigation_hook async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None: context.log.info(f'Navigating to {context.request.url} ...') # Block all requests to URLs that include `adsbygoogle.js` and also all defaults. await context.block_requests(extra_url_patterns=['adsbygoogle.js']) # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/playwright_crawler.py ================================================ import asyncio from crawlee.crawlers import ( PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext, ) async def main() -> None: crawler = PlaywrightCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. 
max_requests_per_crawl=10, # Headless mode, set to False to see the browser in action. headless=False, # Browser types supported by Playwright. browser_type='chromium', ) # Define the default request handler, which will be called for every request. # The handler receives a context parameter, providing various properties and # helper methods. Here are a few key ones we use for demonstration: # - request: an instance of the Request class containing details such as the URL # being crawled and the HTTP method used. # - page: Playwright's Page object, which allows interaction with the web page # (see https://playwright.dev/python/docs/api/class-page for more details). @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page using Playwright's API. posts = await context.page.query_selector_all('.athing') data = [] for post in posts: # Get the HTML elements for the title and rank within each post. title_element = await post.query_selector('.title a') rank_element = await post.query_selector('.rank') # Extract the data we want from the elements. title = await title_element.inner_text() if title_element else None rank = await rank_element.inner_text() if rank_element else None href = await title_element.get_attribute('href') if title_element else None data.append({'title': title, 'rank': rank, 'href': href}) # Push the extracted data to the default dataset. In local configuration, # the data will be stored as JSON files in ./storage/datasets/default. await context.push_data(data) # Find a link to the next page and enqueue it if it exists. await context.enqueue_links(selector='.morelink') # Define a hook that will be called each time before navigating to a new URL. # The hook receives a context parameter, providing access to the request and # browser page among other things. In this example, we log the URL being # navigated to. 
@crawler.pre_navigation_hook async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None: context.log.info(f'Navigating to {context.request.url} ...') # Run the crawler with the initial list of URLs. await crawler.run(['https://news.ycombinator.com/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/playwright_crawler_with_camoufox.py ================================================ import asyncio # Camoufox is external package and needs to be installed. It is not included in crawlee. from camoufox import AsyncNewBrowser from typing_extensions import override from crawlee.browsers import ( BrowserPool, PlaywrightBrowserController, PlaywrightBrowserPlugin, ) from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext class CamoufoxPlugin(PlaywrightBrowserPlugin): """Example browser plugin that uses Camoufox browser, but otherwise keeps the functionality of PlaywrightBrowserPlugin. """ @override async def new_browser(self) -> PlaywrightBrowserController: if not self._playwright: raise RuntimeError('Playwright browser plugin is not initialized.') return PlaywrightBrowserController( browser=await AsyncNewBrowser( self._playwright, **self._browser_launch_options ), # Increase, if camoufox can handle it in your use case. max_open_pages_per_browser=1, # This turns off the crawlee header_generation. Camoufox has its own. header_generator=None, ) async def main() -> None: crawler = PlaywrightCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, # Custom browser pool. Gives users full control over browsers used by the crawler. browser_pool=BrowserPool(plugins=[CamoufoxPlugin()]), ) # Define the default request handler, which will be called for every request. 
@crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract some data from the page using Playwright's API. posts = await context.page.query_selector_all('.athing') for post in posts: # Get the HTML elements for the title and rank within each post. title_element = await post.query_selector('.title a') # Extract the data we want from the elements. title = await title_element.inner_text() if title_element else None # Push the extracted data to the default dataset. await context.push_data({'title': title}) # Find a link to the next page and enqueue it if it exists. await context.enqueue_links(selector='.morelink') # Run the crawler with the initial list of URLs. await crawler.run(['https://news.ycombinator.com/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py ================================================ import asyncio from random import choice from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.statistics import Statistics async def main() -> None: crawler = PlaywrightCrawler( statistics=Statistics.with_default_state(save_error_snapshots=True) ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Simulate various errors to demonstrate `ErrorSnapshotter` # saving only the first occurrence of unique error. 
await context.enqueue_links() random_number = choice(range(10)) if random_number == 1: raise KeyError('Some KeyError') if random_number == 2: raise ValueError('Some ValueError') if random_number == 3: raise RuntimeError('Some RuntimeError') await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.fingerprint_suite import ( DefaultFingerprintGenerator, HeaderGeneratorOptions, ScreenOptions, ) async def main() -> None: # Use default fingerprint generator with desired fingerprint options. # Generator will generate real looking browser fingerprint based on the options. # Unspecified fingerprint options will be automatically selected by the generator. fingerprint_generator = DefaultFingerprintGenerator( header_options=HeaderGeneratorOptions(browsers=['chrome']), screen_options=ScreenOptions(min_width=400), ) crawler = PlaywrightCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, # Headless mode, set to False to see the browser in action. headless=False, # Browser types supported by Playwright. browser_type='chromium', # Fingerprint generator to be used. By default no fingerprint generation is done. fingerprint_generator=fingerprint_generator, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Find a link to the next page and enqueue it if it exists. await context.enqueue_links(selector='.morelink') # Run the crawler with the initial list of URLs. 
await crawler.run(['https://news.ycombinator.com/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/respect_robots_on_skipped_request.py ================================================ import asyncio from crawlee import SkippedReason from crawlee.crawlers import ( BeautifulSoupCrawler, BeautifulSoupCrawlingContext, ) async def main() -> None: # Initialize the crawler with robots.txt compliance enabled crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # highlight-start # This handler is called when a request is skipped @crawler.on_skipped_request async def skipped_request_handler(url: str, reason: SkippedReason) -> None: # Check if the request was skipped due to robots.txt rules if reason == 'robots_txt': crawler.log.info(f'Skipped {url} due to robots.txt rules.') # highlight-end # Start the crawler with the specified URLs # The login URL will be skipped and handled by the skipped_request_handler await crawler.run( ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/respect_robots_txt_file.py ================================================ import asyncio from crawlee.crawlers import ( BeautifulSoupCrawler, BeautifulSoupCrawlingContext, ) async def main() -> None: # Initialize the crawler with robots.txt compliance enabled crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Start the crawler with the specified URLs # The crawler will check the robots.txt file before making requests # In this 
example, 'https://news.ycombinator.com/login' will be skipped # because it's disallowed in the site's robots.txt file await crawler.run( ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/resuming_paused_crawl.py ================================================ import asyncio from crawlee import ConcurrencySettings, service_locator from crawlee.crawlers import ( BeautifulSoupCrawler, BeautifulSoupCrawlingContext, ) # Disable clearing the `RequestQueue`, `KeyValueStore` and `Dataset` on each run. # This makes the scraper continue from where it left off in the previous run. # The recommended way to achieve this behavior is setting the environment variable # `CRAWLEE_PURGE_ON_START=0` configuration = service_locator.get_configuration() configuration.purge_on_start = False async def main() -> None: crawler = BeautifulSoupCrawler( # Let's slow down the crawler for a demonstration concurrency_settings=ConcurrencySettings(max_tasks_per_minute=20) ) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # List of links for crawl requests = [ 'https://crawlee.dev', 'https://crawlee.dev/python/docs', 'https://crawlee.dev/python/docs/examples', 'https://crawlee.dev/python/docs/guides', 'https://crawlee.dev/python/docs/quick-start', ] await crawler.run(requests) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/run_parallel_crawlers.py ================================================ import asyncio from crawlee import ConcurrencySettings from crawlee.crawlers import ( ParselCrawler, ParselCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, ) from crawlee.sessions import SessionPool from crawlee.storages import RequestQueue async 
def main() -> None: # Open request queues for both crawlers with different aliases playwright_rq = await RequestQueue.open(alias='playwright-requests') parsel_rq = await RequestQueue.open(alias='parsel-requests') # Use a shared session pool between both crawlers async with SessionPool() as session_pool: playwright_crawler = PlaywrightCrawler( # Set the request queue for Playwright crawler request_manager=playwright_rq, session_pool=session_pool, # Configure concurrency settings for Playwright crawler concurrency_settings=ConcurrencySettings( max_concurrency=5, desired_concurrency=5 ), # Set `keep_alive`` so that the crawler does not stop working when there are # no requests in the queue. keep_alive=True, ) parsel_crawler = ParselCrawler( # Set the request queue for Parsel crawler request_manager=parsel_rq, session_pool=session_pool, # Configure concurrency settings for Parsel crawler concurrency_settings=ConcurrencySettings( max_concurrency=10, desired_concurrency=10 ), # Set maximum requests per crawl for Parsel crawler max_requests_per_crawl=50, ) @playwright_crawler.router.default_handler async def handle_playwright(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Playwright Processing {context.request.url}...') title = await context.page.title() # Push the extracted data to the dataset for Playwright crawler await context.push_data( {'title': title, 'url': context.request.url, 'source': 'playwright'}, dataset_name='playwright-data', ) @parsel_crawler.router.default_handler async def handle_parsel(context: ParselCrawlingContext) -> None: context.log.info(f'Parsel Processing {context.request.url}...') title = context.parsed_content.css('title::text').get() # Push the extracted data to the dataset for Parsel crawler await context.push_data( {'title': title, 'url': context.request.url, 'source': 'parsel'}, dataset_name='parsel-data', ) # Enqueue links to the Playwright request queue for blog pages await context.enqueue_links( 
selector='a[href*="/blog/"]', rq_alias='playwright-requests' ) # Enqueue other links to the Parsel request queue await context.enqueue_links(selector='a:not([href*="/blog/"])') # Start the Playwright crawler in the background background_crawler_task = asyncio.create_task(playwright_crawler.run([])) # Run the Parsel crawler with the initial URL and wait for it to finish await parsel_crawler.run(['https://crawlee.dev/blog']) # Wait for the Playwright crawler to finish processing all requests while not await playwright_rq.is_empty(): playwright_crawler.log.info('Waiting for Playwright crawler to finish...') await asyncio.sleep(5) # Stop the Playwright crawler after all requests are processed playwright_crawler.stop() # Wait for the background Playwright crawler task to complete await background_crawler_task if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/using_browser_profiles_chrome.py ================================================ import asyncio import shutil from pathlib import Path from tempfile import TemporaryDirectory from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext # Profile name to use (usually 'Default' for single profile setups) PROFILE_NAME = 'Default' # Paths to Chrome profiles in your system (example for Windows) # Use `chrome://version/` to find your profile path PROFILE_PATH = Path(Path.home(), 'AppData', 'Local', 'Google', 'Chrome', 'User Data') async def main() -> None: # Create a temporary folder to copy the profile to with TemporaryDirectory(prefix='crawlee-') as tmpdirname: tmp_profile_dir = Path(tmpdirname) # Copy the profile to a temporary folder shutil.copytree( PROFILE_PATH / PROFILE_NAME, tmp_profile_dir / PROFILE_NAME, dirs_exist_ok=True, ) crawler = PlaywrightCrawler( headless=False, # Use the installed Chrome browser browser_type='chrome', # Disable fingerprints to preserve profile identity fingerprint_generator=None, # Set user 
data directory to temp folder user_data_dir=tmp_profile_dir, browser_launch_options={ # Slow down actions to mimic human behavior 'slow_mo': 200, 'args': [ # Use the specified profile f'--profile-directory={PROFILE_NAME}', ], }, ) @crawler.router.default_handler async def default_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Visiting {context.request.url}') await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/using_browser_profiles_firefox.py ================================================ import asyncio from pathlib import Path from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext # Replace this with your actual Firefox profile name # Find it at about:profiles in Firefox PROFILE_NAME = 'your-profile-name-here' # Paths to Firefox profiles in your system (example for Windows) # Use `about:profiles` to find your profile path PROFILE_PATH = Path( Path.home(), 'AppData', 'Roaming', 'Mozilla', 'Firefox', 'Profiles', PROFILE_NAME ) async def main() -> None: crawler = PlaywrightCrawler( # Use Firefox browser type browser_type='firefox', # Disable fingerprints to use the profile as is fingerprint_generator=None, headless=False, # Path to your Firefox profile user_data_dir=PROFILE_PATH, browser_launch_options={ 'args': [ # Required to avoid version conflicts '--allow-downgrade' ] }, ) @crawler.router.default_handler async def default_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Visiting {context.request.url}') await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/code_examples/using_sitemap_request_loader.py ================================================ import asyncio from collections.abc import Callable from yarl import URL from crawlee import RequestOptions, 
RequestTransformAction from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from crawlee.http_clients import ImpitHttpClient from crawlee.request_loaders import SitemapRequestLoader # Create a transform_request_function that maps request options based on the host in # the URL def create_transform_request( data_mapper: dict[str, dict], ) -> Callable[[RequestOptions], RequestOptions | RequestTransformAction]: def transform_request( request_options: RequestOptions, ) -> RequestOptions | RequestTransformAction: # According to the Sitemap protocol, all URLs in a Sitemap must be from a single # host. request_host = URL(request_options['url']).host if request_host and (mapping_data := data_mapper.get(request_host)): # Set properties from the mapping data if 'label' in mapping_data: request_options['label'] = mapping_data['label'] if 'user_data' in mapping_data: request_options['user_data'] = mapping_data['user_data'] return request_options return 'unchanged' return transform_request async def main() -> None: # Prepare data mapping for hosts apify_host = URL('https://apify.com/sitemap.xml').host crawlee_host = URL('https://crawlee.dev/sitemap.xml').host if not apify_host or not crawlee_host: raise ValueError('Unable to extract host from URLs') data_map = { apify_host: { 'label': 'apify', 'user_data': {'source': 'apify'}, }, crawlee_host: { 'label': 'crawlee', 'user_data': {'source': 'crawlee'}, }, } # Initialize the SitemapRequestLoader with the transform function async with SitemapRequestLoader( # Set the sitemap URLs and the HTTP client sitemap_urls=['https://crawlee.dev/sitemap.xml', 'https://apify.com/sitemap.xml'], http_client=ImpitHttpClient(), transform_request_function=create_transform_request(data_map), ) as sitemap_loader: # Convert the sitemap loader to a request manager request_manager = await sitemap_loader.to_tandem() # Create and configure the crawler crawler = BeautifulSoupCrawler( request_manager=request_manager, 
max_requests_per_crawl=10, ) # Create default handler for requests without a specific label @crawler.router.default_handler async def handler(context: BeautifulSoupCrawlingContext) -> None: source = context.request.user_data.get('source', 'unknown') context.log.info( f'Processing request: {context.request.url} from source: {source}' ) # Create handler for requests labeled 'apify' @crawler.router.handler('apify') async def apify_handler(context: BeautifulSoupCrawlingContext) -> None: source = context.request.user_data.get('source', 'unknown') context.log.info( f'Apify handler processing: {context.request.url} from source: {source}' ) # Create handler for requests labeled 'crawlee' @crawler.router.handler('crawlee') async def crawlee_handler(context: BeautifulSoupCrawlingContext) -> None: source = context.request.user_data.get('source', 'unknown') context.log.info( f'Crawlee handler processing: {context.request.url} from source: {source}' ) await crawler.run() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/examples/crawl_all_links_on_website.mdx ================================================ --- id: crawl-all-links-on-website title: Crawl all links on website --- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_all_links_on_website_bs.py'; import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_all_links_on_website_pw.py'; This example uses the `enqueue_links` helper to add new links to the `RequestQueue` as the crawler navigates from page to page. By automatically discovering and enqueuing all links on a given page, the crawler can systematically scrape an entire website. 
This approach is ideal for web scraping tasks where you need to collect data from multiple interconnected pages. :::tip If no options are given, by default the method will only add links that are under the same subdomain. This behavior can be controlled with the `strategy` option, which is an instance of the `EnqueueStrategy` type alias. You can find more info about this option in the [Crawl website with relative links](./crawl-website-with-relative-links) example. ::: {BeautifulSoupExample} {PlaywrightExample} ================================================ FILE: docs/examples/crawl_multiple_urls.mdx ================================================ --- id: crawl-multiple-urls title: Crawl multiple URLs --- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_bs.py'; import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_pw.py'; This example demonstrates how to crawl a specified list of URLs using different crawlers. You'll learn how to set up the crawler, define a request handler, and run the crawler with multiple URLs. This setup is useful for scraping data from multiple pages or websites concurrently. 
{BeautifulSoupExample} {PlaywrightExample} ================================================ FILE: docs/examples/crawl_specific_links_on_website.mdx ================================================ --- id: crawl-specific-links-on-website title: Crawl specific links on website --- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_bs.py'; import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_pw.py'; import BeautifulSoupExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_bs.py'; import PlaywrightExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_pw.py'; This example demonstrates how to crawl a website while targeting specific patterns of links. By utilizing the `enqueue_links` helper, you can pass `include` or `exclude` parameters to improve your crawling strategy. This approach ensures that only the links matching the specified patterns are added to the `RequestQueue`. Both `include` and `exclude` support lists of globs or regular expressions. This functionality is great for focusing on relevant sections of a website and avoiding scraping unnecessary or irrelevant content. {BeautifulSoupExample} {PlaywrightExample} ## Even more control over the enqueued links `enqueue_links` is a convenience helper and internally it calls `extract_links` to find the links and `add_requests` to add them to the queue. 
If you need some additional custom filtering of the extracted links before enqueuing them, then consider using `extract_links` and `add_requests` instead of `enqueue_links`. {BeautifulSoupExampleExtractAndAdd} {PlaywrightExampleExtractAndAdd} ================================================ FILE: docs/examples/crawl_website_with_relative_links.mdx ================================================ --- id: crawl-website-with-relative-links title: Crawl website with relative links --- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import AllLinksExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_all_links.py'; import SameDomainExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_domain.py'; import SameHostnameExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_hostname.py'; import SameOriginExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_origin.py'; When crawling a website, you may encounter various types of links that you wish to include in your crawl. To facilitate this, we provide the `enqueue_links` method on the crawler context, which will automatically find and add these links to the crawler's `RequestQueue`. This method simplifies the process of handling different types of links, including relative links, by automatically resolving them based on the page's context. :::note For these examples, we are using the `BeautifulSoupCrawler`. However, the same method is available for other crawlers as well. You can use it in exactly the same way. ::: The `EnqueueStrategy` type alias provides four distinct strategies for crawling relative links: - `all` - Enqueues all links found, regardless of the domain they point to.
This strategy is useful when you want to follow every link, including those that navigate to external websites. - `same-domain` - Enqueues all links found that share the same domain name, including any possible subdomains. This strategy ensures that all links within the same top-level and base domain are included. - `same-hostname` - Enqueues all links found for the exact same hostname. This is the **default** strategy, and it restricts the crawl to links that have the same hostname as the current page, excluding subdomains. - `same-origin` - Enqueues all links found that share the same origin. The same origin refers to URLs that share the same protocol, domain, and port, ensuring a strict scope for the crawl. {AllLinksExample} {SameDomainExample} {SameHostnameExample} {SameOriginExample} ================================================ FILE: docs/examples/crawler_keep_alive.mdx ================================================ --- id: crawler-keep-alive title: Keep a Crawler alive waiting for more requests --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_keep_alive.py'; This example demonstrates how to keep a crawler alive even when there are no requests at the moment by using the `keep_alive=True` argument of `BasicCrawler.__init__`. This is available to all crawlers that inherit from `BasicCrawler` and in the example below it is shown on `BeautifulSoupCrawler`. To stop a crawler that was started with `keep_alive=True`, call `crawler.stop()`.
{BeautifulSoupExample} ================================================ FILE: docs/examples/crawler_stop.mdx ================================================ --- id: crawler-stop title: Stopping a Crawler with stop method --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_stop.py'; This example demonstrates how to use the `stop` method of `BasicCrawler` to stop the crawler once it finds what it is looking for. This method is available to all crawlers that inherit from `BasicCrawler` and in the example below it is shown on `BeautifulSoupCrawler`. Simply call `crawler.stop()` to stop the crawler. It will not continue to crawl through new requests. Requests that are already being processed concurrently will be allowed to finish. You can call the `stop` method with an optional `reason` argument, a string that is used in the logs and can improve log readability, especially if you have multiple different conditions for triggering `stop`. {BeautifulSoupExample} ================================================ FILE: docs/examples/export_entire_dataset_to_file.mdx ================================================ --- id: export-entire-dataset-to-file title: Export entire dataset to file --- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import JsonExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_json.py'; import CsvExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_csv.py'; This example demonstrates how to use the `BasicCrawler.export_data` method of the crawler to export the entire default dataset to a single file.
This method supports exporting data in either CSV or JSON format and also accepts additional keyword arguments so you can fine-tune the underlying `json.dump` or `csv.writer` behavior. :::note For these examples, we are using the `BeautifulSoupCrawler`. However, the same method is available for other crawlers as well. You can use it in exactly the same way. ::: {JsonExample} {CsvExample} ================================================ FILE: docs/examples/fill_and_submit_web_form.mdx ================================================ --- id: fill-and-submit-web-form title: Fill and submit web form --- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import RequestExample from '!!raw-loader!roa-loader!./code_examples/fill_and_submit_web_form_request.py'; import CrawlerExample from '!!raw-loader!roa-loader!./code_examples/fill_and_submit_web_form_crawler.py'; This example demonstrates how to fill and submit a web form using the `HttpCrawler` crawler. The same approach applies to any crawler that inherits from it, such as the `BeautifulSoupCrawler` or `ParselCrawler`. We are going to use the [httpbin.org](https://httpbin.org) website to demonstrate how it works. ## Investigate the form fields First, we need to examine the form fields and the form's action URL. You can do this by opening the [httpbin.org/forms/post](https://httpbin.org/forms/post) page in a browser and inspecting the form fields. In Chrome, right-click on the page and select "Inspect" or press `Ctrl+Shift+I`. Use the element selector (`Ctrl+Shift+C`) to click on the form element you want to inspect. ![HTML input element name](/img/fill-and-submit-web-form/00.jpg 'HTML input element name.') Identify the field names. For example, the customer name field is `custname`, the email field is `custemail`, and the phone field is `custtel`. 
Now navigate to the "Network" tab in developer tools and submit the form by clicking the "Submit order" button. ![Submitting the form](/img/fill-and-submit-web-form/01.jpg 'Submitting the form.') Find the form submission request and examine its details. The "Headers" tab will show the submission URL, in this case, it is `https://httpbin.org/post`. ![Network request investigation](/img/fill-and-submit-web-form/02.jpg 'Network request investigation.') The "Payload" tab will display the form fields and their submitted values. This method could be an alternative to inspecting the HTML source code directly. ![Network payload investigation](/img/fill-and-submit-web-form/03.jpg 'Network payload investigation.') ## Preparing a POST request Now, let's create a POST request with the form fields and their values using the `Request` class, specifically its `Request.from_url` constructor: {RequestExample} Alternatively, you can send form data as URL parameters using the `url` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` is generally a better approach. ## Implementing the crawler Finally, let's implement the crawler and run it with the prepared request. Although we are using the `HttpCrawler`, the process is the same for any crawler that inherits from it. {CrawlerExample} ## Running the crawler Finally, run your crawler. Your logs should show something like this: ```plaintext ... [crawlee.http_crawler._http_crawler] INFO Processing https://httpbin.org/post ... 
[crawlee.http_crawler._http_crawler] INFO Response: { "args": {}, "data": "", "files": {}, "form": { "comments": "Please ring the doorbell upon arrival.", "custemail": "johndoe@example.com", "custname": "John Doe", "custtel": "1234567890", "delivery": "13:00", "size": "large", "topping": [ "bacon", "cheese", "mushroom" ] }, "headers": { "Accept": "*/*", "Accept-Encoding": "gzip, deflate, br", "Content-Length": "190", "Content-Type": "application/x-www-form-urlencoded", "Host": "httpbin.org", "User-Agent": "python-httpx/0.27.0", "X-Amzn-Trace-Id": "Root=1-66c849d6-1ae432fb7b4156e6149ff37f" }, "json": null, "origin": "78.80.81.196", "url": "https://httpbin.org/post" } [crawlee._autoscaling.autoscaled_pool] INFO Waiting for remaining tasks to finish [crawlee.http_crawler._http_crawler] INFO Final request statistics: ┌───────────────────────────────┬──────────┐ │ requests_finished │ 1 │ │ requests_failed │ 0 │ │ retry_histogram │ [1] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 0.678442 │ │ requests_finished_per_minute │ 85 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 0.678442 │ │ requests_total │ 1 │ │ crawler_runtime │ 0.707666 │ └───────────────────────────────┴──────────┘ ``` This log output confirms that the crawler successfully submitted the form and processed the response. Congratulations! You have successfully filled and submitted a web form using the `HttpCrawler`. ================================================ FILE: docs/examples/json_logging.mdx ================================================ --- id: configure-json-logging title: Configure JSON logging --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import JsonLoggingExample from '!!raw-loader!roa-loader!./code_examples/configure_json_logging.py'; This example demonstrates how to configure JSON line (JSONL) logging with Crawlee.
By using the `use_table_logs=False` parameter, you can disable table-formatted statistics logs, which makes it easier to parse logs with external tools or to serialize them as JSON. The example shows how to integrate with the popular [`loguru`](https://github.com/delgan/loguru) library to capture Crawlee logs and format them as JSONL (one JSON object per line). This approach works well when you need to collect logs for analysis, monitoring, or when integrating with logging platforms like ELK Stack, Grafana Loki, or similar systems. {JsonLoggingExample} Here's an example of what a crawler statistics log entry looks like in JSONL format: ```json { "text": "[HttpCrawler] | INFO | - Final request statistics: {'requests_finished': 1, 'requests_failed': 0, 'retry_histogram': [1], 'request_avg_failed_duration': None, 'request_avg_finished_duration': 3.57098, 'requests_finished_per_minute': 17, 'requests_failed_per_minute': 0, 'request_total_duration': 3.57098, 'requests_total': 1, 'crawler_runtime': 3.59165}\n", "record": { "elapsed": { "repr": "0:00:05.604568", "seconds": 5.604568 }, "exception": null, "extra": { "requests_finished": 1, "requests_failed": 0, "retry_histogram": [1], "request_avg_failed_duration": null, "request_avg_finished_duration": 3.57098, "requests_finished_per_minute": 17, "requests_failed_per_minute": 0, "request_total_duration": 3.57098, "requests_total": 1, "crawler_runtime": 3.59165 }, "file": { "name": "_basic_crawler.py", "path": "/crawlers/_basic/_basic_crawler.py" }, "function": "run", "level": { "icon": "ℹ️", "name": "INFO", "no": 20 }, "line": 583, "message": "Final request statistics:", "module": "_basic_crawler", "name": "HttpCrawler", "process": { "id": 198383, "name": "MainProcess" }, "thread": { "id": 135312814966592, "name": "MainThread" }, "time": { "repr": "2025-03-17 17:14:45.339150+00:00", "timestamp": 1742231685.33915 } } } ``` ================================================ FILE: docs/examples/parsel_crawler.mdx
================================================ --- id: parsel-crawler title: Parsel crawler --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler.py'; This example shows how to use `ParselCrawler` to crawl a website or a list of URLs. Each URL is loaded using a plain HTTP request and the response is parsed using [Parsel](https://pypi.org/project/parsel/) library which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract page title, URL and emails found in the webpage. The default handler will scrape data from the current webpage and enqueue all the links found in the webpage for continuous scraping. It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request. {ParselCrawlerExample} ================================================ FILE: docs/examples/playwright_crawler.mdx ================================================ --- id: playwright-crawler title: Playwright crawler --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler.py'; This example demonstrates how to use `PlaywrightCrawler` to recursively scrape the Hacker news website using headless Chromium and Playwright. The `PlaywrightCrawler` manages the browser and page instances, simplifying the process of interacting with web pages. In the request handler, Playwright's API is used to extract data from each post on the page. Specifically, it retrieves the title, rank, and URL of each post. 
Additionally, the handler enqueues links to the next pages to ensure continuous scraping. This setup is ideal for scraping dynamic web pages where JavaScript execution is required to render the content. A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation. {PlaywrightCrawlerExample} ================================================ FILE: docs/examples/playwright_crawler_adaptive.mdx ================================================ --- id: adaptive-playwright-crawler title: Adaptive Playwright crawler --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import AdaptivePlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/adaptive_playwright_crawler.py'; This example demonstrates how to use `AdaptivePlaywrightCrawler`. An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and an implementation of an HTTP-based crawler such as `ParselCrawler` or `BeautifulSoupCrawler`. It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit. A [pre-navigation hook](/python/docs/guides/adaptive-playwright-crawler#page-configuration-with-pre-navigation-hooks) can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation. Hooks will be executed both for the pages crawled by the HTTP-based sub crawler and for those crawled by the Playwright-based sub crawler. Use `playwright_only=True` to mark hooks that should be executed only for the Playwright sub crawler.
For a more detailed description, please see the [Adaptive Playwright crawler guide](/python/docs/guides/adaptive-playwright-crawler). {AdaptivePlaywrightCrawlerExample} ================================================ FILE: docs/examples/playwright_crawler_with_block_requests.mdx ================================================ --- id: playwright-crawler-with-block-requests title: Playwright crawler with block requests --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import PlaywrightBlockRequests from '!!raw-loader!roa-loader!./code_examples/playwright_block_requests.py'; This example demonstrates how to optimize your `PlaywrightCrawler` performance by blocking unnecessary network requests. The primary use case is when you need to scrape or interact with web pages without loading non-essential resources like images, styles, or analytics scripts. This can significantly reduce bandwidth usage and improve crawling speed. The `block_requests` helper provides the most efficient way to block requests as it operates directly in the browser. By default, `block_requests` will block all URLs including the following patterns: ```python ['.css', '.webp', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip'] ``` You can also replace the default patterns list with your own by providing `url_patterns`, or extend it by passing additional patterns in `extra_url_patterns`.
{PlaywrightBlockRequests} ================================================ FILE: docs/examples/playwright_crawler_with_camoufox.mdx ================================================ --- id: playwright-crawler-with-camoufox title: Playwright crawler with Camoufox --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import PlaywrightCrawlerExampleWithCamoufox from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_camoufox.py'; This example demonstrates how to integrate Camoufox into `PlaywrightCrawler` using `BrowserPool` with a custom `PlaywrightBrowserPlugin`. Camoufox is a stealthy, minimalistic build of Firefox. For details, please visit its homepage https://camoufox.com/ . To run this example, you need to install Camoufox, as it is an external tool and is not part of Crawlee. For installation instructions, please see https://pypi.org/project/camoufox/. **Warning!** Camoufox uses a custom build of Firefox. This build can be hundreds of MB in size. You can either pre-download it using the command `python3 -m camoufox fetch`, or Camoufox will download it automatically when you first run it and it does not find an existing binary. For more details, please refer to: https://github.com/daijro/camoufox/tree/main/pythonlib#camoufox-python-interface **Project template -** It is possible to generate a project with Python code that includes the Camoufox integration into Crawlee through the Crawlee CLI. Call `crawlee create` and pick `Playwright-camoufox` when asked for the crawler type. The example code after the `PlaywrightCrawler` instantiation is similar to the example describing the use of the Playwright crawler. The main difference is that in this example Camoufox will be used as the browser through `BrowserPool`.
{PlaywrightCrawlerExampleWithCamoufox} ================================================ FILE: docs/examples/playwright_crawler_with_fingerprint_generator.mdx ================================================ --- id: playwright-crawler-with-fingerprint-generator title: Playwright crawler with fingerprint generator --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_fingerprint_generator.py'; This example demonstrates how to use `PlaywrightCrawler` together with `FingerprintGenerator`, which populates several browser attributes to mimic a real browser fingerprint. To read more about fingerprints, please see: https://docs.apify.com/academy/anti-scraping/techniques/fingerprinting. You can implement your own fingerprint generator or use `DefaultFingerprintGenerator`. To use the generator, initialize it with the desired fingerprint options. The generator will try to create a fingerprint based on those options. Unspecified options will be automatically selected by the generator from a set of reasonable values. If some option is important for you, do not rely on the default and explicitly define it.
{PlaywrightCrawlerExample} ================================================ FILE: docs/examples/respect_robots_txt_file.mdx ================================================ --- id: respect-robots-txt-file title: Respect robots.txt file --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py'; import OnSkippedRequest from '!!raw-loader!roa-loader!./code_examples/respect_robots_on_skipped_request.py'; This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file. To configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in `BasicCrawlerOptions`. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file. As an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped. The code below demonstrates this behavior using the `BeautifulSoupCrawler`: {RespectRobotsTxt} ## Handle with `on_skipped_request` If you want to process URLs skipped according to the `robots.txt` rules, for example for further analysis, you should use the `on_skipped_request` handler from `BasicCrawler`. 
Let's update the code by adding the `on_skipped_request` handler: {OnSkippedRequest} ================================================ FILE: docs/examples/resuming_paused_crawl.mdx ================================================ --- id: resuming-paused-crawl title: Resuming a paused crawl --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ResumeCrawl from '!!raw-loader!roa-loader!./code_examples/resuming_paused_crawl.py'; This example demonstrates how to resume crawling from its last state when running locally, if for some reason it was unexpectedly terminated. If each run should continue crawling from the previous state, you can configure this using `purge_on_start` in `Configuration`. Use the code below and perform 2 sequential runs. During the 1st run, stop the crawler by pressing `CTRL+C`, and the 2nd run will resume crawling from where it stopped. {ResumeCrawl} Perform the 1st run, interrupting the crawler with `CTRL+C` after 2 links have been processed. ![Run with interruption](/img/resuming-paused-crawl/00.webp 'Run with interruption.') Now resume crawling after the pause to process the remaining 3 links. ![Resuming crawling](/img/resuming-paused-crawl/01.webp 'Resuming crawling.') Alternatively, use the environment variable `CRAWLEE_PURGE_ON_START=0` instead of using `configuration.purge_on_start = False`. 
For example, when running code: ```bash CRAWLEE_PURGE_ON_START=0 python -m best_crawler ``` ================================================ FILE: docs/examples/run_parallel_crawlers.mdx ================================================ --- id: run-parallel-crawlers title: Run parallel crawlers --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import RunParallelCrawlersExample from '!!raw-loader!roa-loader!./code_examples/run_parallel_crawlers.py'; This example demonstrates how to run two parallel crawlers where one crawler processes links discovered by another crawler. In some situations, you may need different approaches for scraping data from a website. For example, you might use `PlaywrightCrawler` for navigating JavaScript-heavy pages and a faster, more lightweight `ParselCrawler` for processing static pages. One way to solve this is to use `AdaptivePlaywrightCrawler`, see the [Adaptive Playwright crawler example](./adaptive-playwright-crawler) to learn more. The code below demonstrates an alternative approach using two separate crawlers. Links are passed between crawlers via `RequestQueue` aliases. The `keep_alive` option allows the Playwright crawler to run in the background and wait for incoming links without stopping when its queue is empty. You can also use different storage clients for each crawler without losing the ability to pass links between queues. Learn more about available storage clients in this [guide](/python/docs/guides/storage-clients). 
{RunParallelCrawlersExample} ================================================ FILE: docs/examples/using_browser_profile.mdx ================================================ --- id: using_browser_profile title: Using browser profile --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import ChromeProfileExample from '!!raw-loader!./code_examples/using_browser_profiles_chrome.py'; import FirefoxProfileExample from '!!raw-loader!./code_examples/using_browser_profiles_firefox.py'; This example demonstrates how to run `PlaywrightCrawler` using your local browser profile from [Chrome](https://www.google.com/intl/us/chrome/) or [Firefox](https://www.firefox.com/). Using browser profiles allows you to leverage existing login sessions, saved passwords, bookmarks, and other personalized browser data during crawling. This can be particularly useful for testing scenarios or when you need to access content that requires authentication. ## Chrome browser To run `PlaywrightCrawler` with your Chrome profile, you need to know the path to your profile files. You can find this information by entering `chrome://version/` as a URL in your Chrome browser. If you have multiple profiles, pay attention to the profile name - if you only have one profile, it's always `Default`. :::warning Profile access limitation Due to [Chrome's security policies](https://developer.chrome.com/blog/remote-debugging-port), automation cannot use your main browsing profile directly. The example copies your profile to a temporary location as a workaround. ::: Make sure you don't have any running Chrome browser processes before running this code: {ChromeProfileExample} ## Firefox browser To find the path to your Firefox profile, enter `about:profiles` as a URL in your Firefox browser. Unlike Chrome, you can use your standard profile path directly without copying it first. 
Make sure you don't have any running Firefox browser processes before running this code: {FirefoxProfileExample} ================================================ FILE: docs/examples/using_sitemap_request_loader.mdx ================================================ --- id: using-sitemap-request-loader title: Using sitemap request loader --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import SitemapRequestLoaderExample from '!!raw-loader!roa-loader!./code_examples/using_sitemap_request_loader.py'; This example demonstrates how to use `SitemapRequestLoader` to crawl websites that provide `sitemap.xml` files following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). The `SitemapRequestLoader` processes sitemaps in a streaming fashion without loading them entirely into memory, making it suitable for large sitemaps. The example shows how to use the `transform_request_function` parameter to configure request options based on URL patterns. This allows you to modify request properties such as labels and user data based on the source URL, enabling different handling logic for different websites or sections. The following code example implements processing of sitemaps from two different domains (Apify and Crawlee), with different labels assigned to requests based on their host. The `create_transform_request` function maps each host to the corresponding request configuration, while the crawler uses different handlers based on the assigned labels. {SitemapRequestLoaderExample} For more information about request loaders, see the [Request loaders guide](../guides/request-loaders). ================================================ FILE: docs/guides/architecture_overview.mdx ================================================ --- id: architecture-overview title: Architecture overview description: An overview of the core components of the Crawlee library and its architecture. 
--- import ApiLink from '@site/src/components/ApiLink'; Crawlee is a modern and modular web scraping framework. It is designed for both HTTP-only and browser-based scraping. In this guide, we will provide a high-level overview of its architecture and the main components that make up the system. ## Crawler The main user-facing component of Crawlee is the crawler, which orchestrates the crawling process and takes care of all other components. It manages storages, executes user-defined request handlers, handles retries, manages concurrency, and coordinates all other components. All crawlers inherit from the `BasicCrawler` class, which provides the basic functionality. There are two main groups of specialized crawlers: HTTP crawlers and browser crawlers. :::info You will learn more about the request handlers in the request router section. ::: ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Abstract classes %% ======================== class BasicCrawler { <<abstract>> } class AbstractHttpCrawler { <<abstract>> } %% ======================== %% Specific classes %% ======================== class HttpCrawler class ParselCrawler class BeautifulSoupCrawler class PlaywrightCrawler class AdaptivePlaywrightCrawler %% ======================== %% Inheritance arrows %% ======================== BasicCrawler --|> AbstractHttpCrawler BasicCrawler --|> PlaywrightCrawler BasicCrawler --|> AdaptivePlaywrightCrawler AbstractHttpCrawler --|> HttpCrawler AbstractHttpCrawler --|> ParselCrawler AbstractHttpCrawler --|> BeautifulSoupCrawler ``` ### HTTP crawlers HTTP crawlers use HTTP clients to fetch pages and parse them with HTML parsing libraries. They are fast and efficient for sites that do not require JavaScript rendering.
HTTP clients are Crawlee components that wrap around HTTP libraries like [httpx](https://www.python-httpx.org/), [curl-impersonate](https://github.com/lwthiker/curl-impersonate) or [impit](https://apify.github.io/impit) and handle HTTP communication for requests and responses. You can learn more about them in the [HTTP clients guide](./http-clients). HTTP crawlers inherit from `AbstractHttpCrawler` and there are three crawlers that belong to this category: - `BeautifulSoupCrawler` utilizes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) HTML parser. - `ParselCrawler` utilizes [Parsel](https://github.com/scrapy/parsel) for parsing HTML. - `HttpCrawler` does not parse HTTP responses at all and is used when no content parsing is required. You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawlers). ### Browser crawlers Browser crawlers use a real browser to render pages, enabling scraping of sites that require JavaScript. They manage browser instances, pages, and context lifecycles. Currently, the only browser crawler is `PlaywrightCrawler`, which utilizes the [Playwright](https://playwright.dev/) library. Playwright provides a high-level API for controlling and navigating browsers. You can learn more about `PlaywrightCrawler`, its features, and how it internally manages browser instances in the [Playwright crawler guide](./playwright-crawler). ### Adaptive crawler The `AdaptivePlaywrightCrawler` sits between HTTP and browser crawlers. It can automatically decide whether to use HTTP or browser crawling for each request based on heuristics or user configuration. This allows for optimal performance and compatibility. It also provides a uniform interface for both crawling types (modes). You can learn more about adaptive crawling in the [Adaptive Playwright crawler guide](./adaptive-playwright-crawler). 
## Crawling contexts Crawling contexts are objects that encapsulate the state and data for each request being processed by the crawler. They provide access to the request, response, session, and helper methods for handling the request. Crawling contexts are used to pass data between different parts of the crawler and to manage the lifecycle of each request. These contexts are provided to user-defined request handlers, which can then use them to access request data, response data, or use helper methods to interact with storages, and extract and enqueue new requests. ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Classes %% ======================== class BasicCrawlingContext class HttpCrawlingContext class HttpCrawlingResult class ParsedHttpCrawlingContext class ParselCrawlingContext class BeautifulSoupCrawlingContext class PlaywrightPreNavCrawlingContext class PlaywrightCrawlingContext class AdaptivePlaywrightPreNavCrawlingContext class AdaptivePlaywrightCrawlingContext %% ======================== %% Inheritance arrows %% ======================== BasicCrawlingContext --|> HttpCrawlingContext HttpCrawlingResult --|> HttpCrawlingContext HttpCrawlingContext --|> ParsedHttpCrawlingContext ParsedHttpCrawlingContext --|> ParselCrawlingContext ParsedHttpCrawlingContext --|> BeautifulSoupCrawlingContext BasicCrawlingContext --|> PlaywrightPreNavCrawlingContext PlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext BasicCrawlingContext --|> AdaptivePlaywrightPreNavCrawlingContext ParsedHttpCrawlingContext --|> AdaptivePlaywrightCrawlingContext ``` They have a similar inheritance structure as the crawlers, with the base class being `BasicCrawlingContext`. The specific crawling contexts are: - `HttpCrawlingContext` for HTTP crawlers. - `ParsedHttpCrawlingContext` for HTTP crawlers with parsed responses. - `ParselCrawlingContext` for HTTP crawlers that use [Parsel](https://github.com/scrapy/parsel) for parsing. 
- `BeautifulSoupCrawlingContext` for HTTP crawlers that use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing. - `PlaywrightPreNavCrawlingContext` for Playwright crawlers before the page is navigated. - `PlaywrightCrawlingContext` for Playwright crawlers. - `AdaptivePlaywrightPreNavCrawlingContext` for Adaptive Playwright crawlers before the page is navigated. - `AdaptivePlaywrightCrawlingContext` for Adaptive Playwright crawlers. ## Storages Storages are the components that manage data in Crawlee. They provide a way to store and retrieve data during the crawling process. Crawlee's storage system consists of two main layers: - **Storages**: High-level interfaces for interacting with different storage types - **Storage clients**: Backend implementations that handle the actual data persistence and management (you will learn more about them in the next section) Crawlee provides three built-in storage types for managing data: - `Dataset` - Append-only, tabular storage for structured data. It is ideal for storing scraping results. - `KeyValueStore` - Storage for arbitrary data like JSON documents, images or configs. It supports get and set operations with key-value pairs; updates are only possible by replacement. - `RequestQueue` - A managed queue for pending and completed requests, with automatic deduplication and dynamic addition of new items. It is used to track URLs for crawling. See the [Storages guide](./storages) for more details. 
```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Abstract classes %% ======================== class Storage { <<abstract>> } %% ======================== %% Specific classes %% ======================== class Dataset class KeyValueStore class RequestQueue %% ======================== %% Inheritance arrows %% ======================== Storage --|> Dataset Storage --|> KeyValueStore Storage --|> RequestQueue ``` ## Storage clients Storage clients are the backend implementations for storages that handle interactions with different storage systems. They provide a unified interface for `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying storage implementation. Crawlee provides several built-in storage client implementations: - `MemoryStorageClient` - Stores data in memory with no persistence (ideal for testing and fast operations). - `FileSystemStorageClient` - Provides persistent file system storage with caching (default client). - [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com/) (cloud-based). It is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). You can find more information about it in the [Apify SDK documentation](https://docs.apify.com/sdk/python/docs/overview/introduction).
```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Abstract classes %% ======================== class StorageClient { <<abstract>> } %% ======================== %% Specific classes %% ======================== class MemoryStorageClient class FileSystemStorageClient class ApifyStorageClient %% ======================== %% Inheritance arrows %% ======================== StorageClient --|> MemoryStorageClient StorageClient --|> FileSystemStorageClient StorageClient --|> ApifyStorageClient ``` Storage clients can be registered globally with the `ServiceLocator` (you will learn more about the `ServiceLocator` in the next section), passed directly to crawlers, or specified when opening individual storage instances. You can also create custom storage clients by implementing the `StorageClient` interface. See the [Storage clients guide](./storage-clients) for more details. ## Request router The request `Router` is a central component that manages the flow of requests and responses in Crawlee. It is responsible for routing requests to the appropriate request handlers, managing the crawling context, and coordinating the execution of user-defined logic. ### Request handlers Request handlers are user-defined functions that process requests and responses in Crawlee. They are the core of the crawling logic and are responsible for handling data extraction, processing, and storage. Each request handler receives a crawling context as an argument, which provides access to request data, response data, and other information related to the request. Request handlers can be registered with the `Router`. The request routing in Crawlee supports: - Default handlers - Fallback handlers for requests without specific labels. - Label-based routing - Handlers for specific request types based on labels. - Error handlers - Handle errors during request processing. - Failed request handlers - Handle requests that exceed retry limits.
- Pre-navigation hooks - Execute logic before navigating to URLs. See the [Request router guide](./request-router) for detailed information and examples. ## Service locator The `ServiceLocator` is a central registry for global services in Crawlee. It manages and provides access to core services throughout the framework, ensuring consistent configuration across all components. The service locator coordinates these three services: - `Configuration` - Application-wide settings and parameters that control various aspects of Crawlee behavior. - `StorageClient` - Backend implementation for data storage across datasets, key-value stores, and request queues. - `EventManager` - Event coordination system for internal framework events and custom user hooks. Services can be registered globally through the `service_locator` singleton instance, passed to crawler constructors, or provided when opening individual storage instances. The service locator includes conflict prevention mechanisms to ensure configuration consistency and prevent accidental service conflicts during runtime. See the [Service locator guide](./service-locator) for detailed information about service registration and configuration options. ## Request loaders Request loaders provide a subset of `RequestQueue` functionality, focusing specifically on reading and accessing streams of requests from various sources. They define how requests are fetched and processed, enabling use cases such as reading URLs from files, external APIs, sitemaps, or combining multiple sources together. Unlike request queues, they do not handle storage or persistence—they only provide request reading capabilities. - `RequestLoader` - Base interface for read-only access to a stream of requests, with capabilities like fetching the next request, marking as handled, and status checking. - `RequestList` - Lightweight in-memory implementation of `RequestLoader` for managing static lists of URLs. 
- `SitemapRequestLoader` - A specialized loader that reads URLs from XML and plain-text sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html) with filtering capabilities. ### Request managers `RequestManager` extends `RequestLoader` with write capabilities for adding and reclaiming requests, providing full request management functionality. `RequestQueue` is the primary concrete implementation of `RequestManager`. `RequestManagerTandem` combines a read-only `RequestLoader` with a writable `RequestManager`, transferring requests from the loader to the manager for hybrid scenarios. This is useful when you want to start with a predefined set of URLs (from a file or sitemap) but also need to add new requests dynamically during crawling. The tandem first processes all requests from the loader, then handles any additional requests added to the manager. Request loaders are useful when you need to start with a predefined set of URLs. The tandem approach allows processing requests from static sources (like files or sitemaps) while maintaining the ability to add new requests dynamically. See the [Request loaders guide](./request-loaders) for detailed information. ## Event manager The `EventManager` is responsible for coordinating internal events throughout Crawlee and enabling custom hooks. It provides a system for registering event listeners, emitting events, and managing their execution lifecycle. Crawlee provides several implementations of the event manager: - `EventManager` is the base class for event management in Crawlee. - `LocalEventManager` extends the base event manager for local environments by automatically emitting `SYSTEM_INFO` events at regular intervals. This provides real-time system metrics including CPU usage and memory consumption, which are essential for internal components like the `Snapshotter` and `AutoscaledPool`. 
- [`ApifyEventManager`](https://docs.apify.com/sdk/python/reference/class/PlatformEventManager) - Manages events on the [Apify platform](https://apify.com/) (cloud-based). It is implemented in the [Apify SDK](https://docs.apify.com/sdk/python/). :::info You can learn more about `Snapshotter` and `AutoscaledPool` and their configuration in the [Scaling crawlers guide](./scaling-crawlers). ::: Crawlee defines several built-in event types: - `PERSIST_STATE` - Emitted periodically to trigger state persistence. - `SYSTEM_INFO` - Contains CPU and memory usage information. - `MIGRATING` - Signals that the crawler is migrating to a different environment. - `ABORTING` - Indicates the crawler is aborting execution. - `EXIT` - Emitted when the crawler is exiting. - `CRAWLER_STATUS` - Provides status updates from crawlers. Additional specialized events for browser and session management are also available. The event manager operates as an async context manager, automatically starting periodic tasks when entered and ensuring all listeners complete before exiting. Event listeners can be either synchronous or asynchronous functions and are executed safely without blocking the main event loop. ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Abstract classes %% ======================== class EventManager { <<abstract>> } %% ======================== %% Specific classes %% ======================== class LocalEventManager class ApifyEventManager %% ======================== %% Inheritance arrows %% ======================== EventManager --|> LocalEventManager EventManager --|> ApifyEventManager ``` ## Session management The core component of session management in Crawlee is `SessionPool`. It manages a collection of sessions that simulate individual users with unique attributes like cookies, IP addresses (via proxies), and browser fingerprints.
Sessions help avoid blocking by rotating user identities and maintaining realistic browsing patterns. :::info You can learn more about fingerprints and how to avoid getting blocked in the [Avoid blocking guide](./avoid-blocking). ::: ### Session A session is represented as a `Session` object, which contains components like cookies, error tracking, usage limits, and expiration handling. Sessions can be marked as good (`Session.mark_good`), bad (`Session.mark_bad`), or retired (`Session.retire`) based on their performance, and they automatically become unusable when they exceed error thresholds or usage limits. ### Session pool The session pool provides automated session lifecycle management: - Automatic rotation - Retrieves random sessions from the pool and creates new ones as needed. - Pool maintenance - Removes retired sessions and maintains the pool at maximum capacity. - State persistence - Persists session state to enable recovery across restarts. - Configurable limits - Supports custom pool sizes, session settings, and creation functions. The pool operates as an async context manager, automatically initializing with sessions and cleaning up on exit. It ensures proper session management by rotating sessions based on usage count, expiration time, and custom rules while maintaining optimal pool size. See the [Session management guide](./session-management) for more information. ## Statistics The `Statistics` class provides runtime monitoring for crawler operations, tracking performance metrics like request counts, processing times, retry attempts, and error patterns. It operates as an async context manager, automatically persisting data across crawler restarts and migrations using `KeyValueStore`. The system includes error tracking through the `ErrorTracker` class, which groups similar errors by type and message patterns using wildcard matching. It can capture HTML snapshots and screenshots for debugging and separately track retry-specific errors. 
Statistics are logged at configurable intervals in both table and inline formats, with final summary data returned from the `crawler.run` method available through `FinalStatistics`. ## Conclusion In this guide, we provided a high-level overview of the core components of the Crawlee library and its architecture. We covered the main components like crawlers, crawling contexts, storages, request routers, service locator, request loaders, event manager, session management, and statistics. Check out other guides, the [API reference](https://crawlee.dev/python/api), and [Examples](../examples) for more details on how to use these components in your own projects. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! ================================================ FILE: docs/guides/avoid_blocking.mdx ================================================ --- id: avoid-blocking title: Avoid getting blocked description: How to avoid getting blocked when scraping --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import PlaywrightDefaultFingerprintGenerator from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_fingerprint_generator.py'; import PlaywrightWithCamoufox from '!!raw-loader!roa-loader!../examples/code_examples/playwright_crawler_with_camoufox.py'; import PlaywrightDefaultFingerprintGeneratorWithArgs from '!!raw-loader!./code_examples/avoid_blocking/default_fingerprint_generator_with_args.py'; A scraper might get blocked for numerous reasons. Let's narrow it down to the two main ones. The first is a bad or blocked IP address. You can learn about this topic in the [proxy management guide](./proxy-management). 
The second reason is [browser fingerprints](https://pixelprivacy.com/resources/browser-fingerprinting/) (or signatures), which we will explore more in this guide. Check the [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) to gain a deeper theoretical understanding of blocking and learn a few tips and tricks. A browser fingerprint is a collection of browser attributes and significant features that can show if our browser is a bot or a real user. Moreover, most browsers have these unique features that allow the website to track the browser even across different IP addresses. This is the main reason why scrapers should change browser fingerprints while doing browser-based scraping. In return, this should significantly reduce blocking. ## Using browser fingerprints Changing browser fingerprints can be a tedious job. Luckily, Crawlee provides this feature with minimal configuration necessary - the usage of fingerprints in `PlaywrightCrawler` is enabled by default. You can customize the fingerprints by using the `fingerprint_generator` argument of the `PlaywrightCrawler.__init__`: either pass your own implementation of `FingerprintGenerator` or use `DefaultFingerprintGenerator`. {PlaywrightDefaultFingerprintGenerator} In certain cases we want to narrow down the fingerprints used - e.g. specify a certain operating system, locale or browser. This is also possible with Crawlee - the crawler can have the generation algorithm customized to reflect a particular browser version and more. For a description of the fingerprint generation options, please see `HeaderGeneratorOptions`, `ScreenOptions` and `DefaultFingerprintGenerator.__init__`. See the example below: {PlaywrightDefaultFingerprintGeneratorWithArgs} If you do not want to use fingerprints, pass the `fingerprint_generator=None` argument to the `PlaywrightCrawler.__init__`. ## Using Camoufox In some cases even `PlaywrightCrawler` with fingerprints is not enough.
You can try using `PlaywrightCrawler` together with [Camoufox](https://camoufox.com/). See the example integration below: {PlaywrightWithCamoufox} **Related links** - [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite) - [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) ================================================ FILE: docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py ================================================ import asyncio from crawlee.fingerprint_suite import ( DefaultFingerprintGenerator, HeaderGeneratorOptions, ScreenOptions, ) async def main() -> None: fingerprint_generator = DefaultFingerprintGenerator( header_options=HeaderGeneratorOptions(browsers=['chrome']), screen_options=ScreenOptions(min_width=400), ) # ... if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: # Fingerprint generator is used by default. crawler = PlaywrightCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Find a link to the next page and enqueue it if it exists. await context.enqueue_links(selector='.morelink') # Run the crawler with the initial list of URLs. 
await crawler.run(['https://news.ycombinator.com/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py ================================================ import asyncio import io from pathlib import Path from warcio.statusandheaders import StatusAndHeaders from warcio.warcwriter import WARCWriter from crawlee.crawlers import ParselCrawler, ParselCrawlingContext async def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None: """Helper function for archiving response in WARC format. Reads the body of `context.http_response`, wraps it together with the response status code and headers in a WARC 'response' record keyed by `context.request.url`, and writes that record with `writer`.""" # Create WARC records for response response_body = await context.http_response.read() response_payload_stream = io.BytesIO(response_body) response_headers = StatusAndHeaders( str(context.http_response.status_code), context.http_response.headers, protocol='HTTP/1.1', ) response_record = writer.create_warc_record( context.request.url, 'response', payload=response_payload_stream, length=len(response_body), http_headers=response_headers, ) writer.write_record(response_record) async def main() -> None: """Run a `ParselCrawler` (at most 10 requests) starting from https://crawlee.dev/ and archive each handled response into the gzipped WARC file `example.warc.gz`.""" crawler = ParselCrawler( max_requests_per_crawl=10, ) # Create a WARC archive file and prepare the writer. archive = Path('example.warc.gz') with archive.open('wb') as output: writer = WARCWriter(output, gzip=True) # Create a WARC info record to store metadata about the archive. warcinfo_payload = { 'software': 'Crawlee', 'format': 'WARC/1.1', 'description': 'Example archive created with ParselCrawler', } writer.write_record(writer.create_warcinfo_record(archive.name, warcinfo_payload)) # Define the default request handler, which will be called for every request.
@crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Archiving {context.request.url} ...') await archive_response(context=context, writer=writer) await context.enqueue_links(strategy='same-domain') await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/creating_web_archive/manual_archiving_playwright_crawler.py ================================================ import asyncio import io import logging from functools import partial from pathlib import Path from playwright.async_api import Request from warcio.statusandheaders import StatusAndHeaders from warcio.warcwriter import WARCWriter from crawlee.crawlers import ( PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext, ) async def archive_response( request: Request, writer: WARCWriter, logger: logging.Logger ) -> None: """Helper function for archiving response in WARC format. Awaits the response of the given Playwright `request` and writes it with `writer` as a WARC 'response' record. If there is no response, or reading its body raises, a warning is logged through `logger` and nothing is written.""" response = await request.response() if not response: logger.warning(f'Could not get response {request.url}') return try: response_body = await response.body() except Exception as e: logger.warning(f'Could not get response body for {response.url}: {e}') return logger.info(f'Archiving resource {response.url}') response_payload_stream = io.BytesIO(response_body) response_headers = StatusAndHeaders( str(response.status), response.headers, protocol='HTTP/1.1' ) response_record = writer.create_warc_record( response.url, 'response', payload=response_payload_stream, length=len(response_body), http_headers=response_headers, ) writer.write_record(response_record) async def main() -> None: """Run a non-headless `PlaywrightCrawler` (at most 1 request) starting from https://crawlee.dev/ and archive the responses of finished browser requests into the gzipped WARC file `example.warc.gz`.""" crawler = PlaywrightCrawler( max_requests_per_crawl=1, headless=False, ) # Create a WARC archive file and prepare the writer.
archive = Path('example.warc.gz') with archive.open('wb') as output: writer = WARCWriter(output, gzip=True) # Create a WARC info record to store metadata about the archive. warcinfo_payload = { 'software': 'Crawlee', 'format': 'WARC/1.1', 'description': 'Example archive created with PlaywrightCrawler', } writer.write_record(writer.create_warcinfo_record(archive.name, warcinfo_payload)) @crawler.pre_navigation_hook async def archiving_hook(context: PlaywrightPreNavCrawlingContext) -> None: # Ensure that all responses with additional resources are archived context.page.on( 'requestfinished', partial(archive_response, logger=context.log, writer=writer), ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: # For some sites, where the content loads dynamically, # it is needed to scroll the page to load all content. # It slows down the crawling, but ensures that all content is loaded. await context.infinite_scroll() await context.enqueue_links(strategy='same-domain') await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.proxy_configuration import ProxyConfiguration async def main() -> None: crawler = PlaywrightCrawler( # Use the local wayback server as a proxy proxy_configuration=ProxyConfiguration(proxy_urls=['http://localhost:8080/']), # Ignore the HTTPS errors if you have not followed pywb CA setup instructions browser_launch_options={'ignore_https_errors': True}, max_requests_per_crawl=10, headless=False, ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Archiving {context.request.url} ...') # For some sites, where the 
content loads dynamically, # it is needed to scroll the page to load all content. # It slows down the crawling, but ensures that all content is loaded. await context.infinite_scroll() await context.enqueue_links(strategy='same-domain') await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/error_handling/change_handle_error_status.py ================================================ import asyncio import json from crawlee import HttpHeaders from crawlee.crawlers import HttpCrawler, HttpCrawlingContext from crawlee.errors import HttpStatusCodeError from crawlee.sessions import SessionPool # Using a placeholder refresh token for this example REFRESH_TOKEN = 'PLACEHOLDER' UNAUTHORIZED_CODE = 401 async def main() -> None: crawler = HttpCrawler( max_request_retries=2, # Only treat 403 as a blocking status code, not 401 session_pool=SessionPool(create_session_settings={'blocked_status_codes': [403]}), # Don't treat 401 responses as errors ignore_http_error_status_codes=[UNAUTHORIZED_CODE], ) @crawler.router.default_handler async def default_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Now we can handle 401 responses ourselves if context.http_response.status_code == UNAUTHORIZED_CODE: # Get a fresh access token headers = {'authorization': f'Bearer {REFRESH_TOKEN}'} response = await context.send_request( 'https://placeholder.org/refresh', headers=headers ) data = json.loads(await response.read()) # Add the new token to our `Request` headers context.request.headers |= HttpHeaders( {'authorization': f'Bearer {data["access_token"]}'}, ) # Trigger a retry with our updated headers raise HttpStatusCodeError('Unauthorized', status_code=UNAUTHORIZED_CODE) await crawler.run(['http://httpbingo.org/status/401']) if __name__ == '__main__': asyncio.run(main()) ================================================ 
FILE: docs/guides/code_examples/error_handling/disable_retry.py ================================================ import asyncio from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext from crawlee.errors import HttpStatusCodeError, SessionError async def main() -> None: crawler = HttpCrawler(max_request_retries=5) # Create a parsing error for demonstration @crawler.router.default_handler async def default_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') raise ValueError('Simulated parsing error') # This handler runs before any retry attempts @crawler.error_handler async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None: context.log.error(f'Failed request {context.request.url}') # Only allow retries for network-related errors if not isinstance(error, (SessionError, HttpStatusCodeError)): context.log.error('Non-network error detected') # Stop further retry attempts for this `Request` context.request.no_retry = True await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/error_handling/handle_proxy_error.py ================================================ import asyncio from crawlee import Request from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext from crawlee.errors import ProxyError async def main() -> None: # Set how many session rotations will happen before calling the error handler # when ProxyError occurs crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6) # For this example, we'll create a proxy error in our handler @crawler.router.default_handler async def default_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') raise ProxyError('Simulated proxy error') # This handler runs after all retry attempts are exhausted 
@crawler.failed_request_handler async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None: context.log.error(f'Failed request {context.request.url}, after 5 rotations') request = context.request # For proxy errors, we can add a new `Request` to try again if isinstance(error, ProxyError) and not request.unique_key.startswith('retry'): context.log.info(f'Retrying {request.url} ...') # Create a new `Request` with a modified key to avoid deduplication new_request = Request.from_url( request.url, unique_key=f'retry{request.unique_key}' ) # Add the new `Request` to the `Queue` rq = await crawler.get_request_manager() await rq.add_request(new_request) await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_clients/parsel_curl_impersonate_example.py ================================================ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.http_clients import CurlImpersonateHttpClient async def main() -> None: http_client = CurlImpersonateHttpClient( # Optional additional keyword arguments for `curl_cffi.requests.AsyncSession`. timeout=10, impersonate='chrome131', ) crawler = ParselCrawler( http_client=http_client, # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Enqueue all links from the page. await context.enqueue_links() # Extract data from the page. data = { 'url': context.request.url, 'title': context.selector.css('title::text').get(), } # Push the extracted data to the default dataset. await context.push_data(data) # Run the crawler with the initial list of URLs. 
await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_clients/parsel_httpx_example.py ================================================ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.http_clients import HttpxHttpClient async def main() -> None: http_client = HttpxHttpClient( # Optional additional keyword arguments for `httpx.AsyncClient`. timeout=10, follow_redirects=True, ) crawler = ParselCrawler( http_client=http_client, # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Enqueue all links from the page. await context.enqueue_links() # Extract data from the page. data = { 'url': context.request.url, 'title': context.selector.css('title::text').get(), } # Push the extracted data to the default dataset. await context.push_data(data) # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_clients/parsel_impit_example.py ================================================ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.http_clients import ImpitHttpClient async def main() -> None: http_client = ImpitHttpClient( # Optional additional keyword arguments for `impit.AsyncClient`. http3=True, browser='firefox', verify=True, ) crawler = ParselCrawler( http_client=http_client, # Limit the crawl to max requests. Remove or increase it for crawling all links. 
max_requests_per_crawl=10, ) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Enqueue all links from the page. await context.enqueue_links() # Extract data from the page. data = { 'url': context.request.url, 'title': context.selector.css('title::text').get(), } # Push the extracted data to the default dataset. await context.push_data(data) # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/__init__.py ================================================ ================================================ FILE: docs/guides/code_examples/http_crawlers/beautifulsoup_example.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: # Create a BeautifulSoupCrawler instance crawler = BeautifulSoupCrawler( # Limit the crawl to 10 requests max_requests_per_crawl=10, ) # Define the default request handler @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # Extract data using BeautifulSoup data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, } # Push extracted data to the dataset await context.push_data(data) # Enqueue links found on the page for further crawling await context.enqueue_links() # Run the crawler await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/custom_crawler_example.py 
================================================ ================================================ FILE: docs/guides/code_examples/http_crawlers/http_example.py ================================================ import asyncio import re from crawlee.crawlers import HttpCrawler, HttpCrawlingContext async def main() -> None: # Create an HttpCrawler instance - no automatic parsing crawler = HttpCrawler( # Limit the crawl to 10 requests max_requests_per_crawl=10, ) # Define the default request handler @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # Get the raw response content response_body = await context.http_response.read() response_text = response_body.decode('utf-8') # Extract title manually using regex (since we don't have a parser) title_match = re.search( r']*>([^<]+)', response_text, re.IGNORECASE ) title = title_match.group(1).strip() if title_match else None # Extract basic information data = { 'url': context.request.url, 'title': title, } # Push extracted data to the dataset await context.push_data(data) # Simple link extraction for further crawling href_pattern = r'href=["\']([^"\']+)["\']' matches = re.findall(href_pattern, response_text, re.IGNORECASE) # Enqueue first few links found (limit to avoid too many requests) for href in matches[:3]: if href.startswith('http') and 'crawlee.dev' in href: await context.add_requests([href]) # Run the crawler await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/lexbor_parser.py ================================================ import asyncio from pydantic import ValidationError from selectolax.lexbor import LexborHTMLParser from yarl import URL from crawlee import Request from crawlee.crawlers import HttpCrawler, HttpCrawlingContext async def main() -> None: crawler = HttpCrawler( 
max_request_retries=1, max_requests_per_crawl=10, ) @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Parse the HTML content using Selectolax with Lexbor backend. parsed_html = LexborHTMLParser(await context.http_response.read()) # Extract data from the page. data = { 'url': context.request.url, 'title': parsed_html.css_first('title').text(), 'h1s': [h1.text() for h1 in parsed_html.css('h1')], 'h2s': [h2.text() for h2 in parsed_html.css('h2')], 'h3s': [h3.text() for h3 in parsed_html.css('h3')], } await context.push_data(data) # Css selector to extract valid href attributes. links_selector = ( 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' ) base_url = URL(context.request.url) extracted_requests = [] # Extract links. for item in parsed_html.css(links_selector): href = item.attributes.get('href') if not href: continue # Convert relative URLs to absolute if needed. url = str(base_url.join(URL(href))) try: request = Request.from_url(url) except ValidationError as exc: context.log.warning(f'Skipping invalid URL "{url}": {exc}') continue extracted_requests.append(request) # Add extracted requests to the queue with the same-domain strategy. 
await context.add_requests(extracted_requests, strategy='same-domain') await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/lxml_parser.py ================================================ import asyncio from lxml import html from pydantic import ValidationError from crawlee import Request from crawlee.crawlers import HttpCrawler, HttpCrawlingContext async def main() -> None: crawler = HttpCrawler( max_request_retries=1, max_requests_per_crawl=10, ) @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Parse the HTML content using lxml. parsed_html = html.fromstring(await context.http_response.read()) # Extract data from the page. data = { 'url': context.request.url, 'title': parsed_html.findtext('.//title'), 'h1s': [h1.text_content() for h1 in parsed_html.findall('.//h1')], 'h2s': [h2.text_content() for h2 in parsed_html.findall('.//h2')], 'h3s': [h3.text_content() for h3 in parsed_html.findall('.//h3')], } await context.push_data(data) # Convert relative URLs to absolute before extracting links. parsed_html.make_links_absolute(context.request.url, resolve_base_href=True) # Xpath 1.0 selector for extracting valid href attributes. links_xpath = ( '//a/@href[not(starts-with(., "#")) ' 'and not(starts-with(., "javascript:")) ' 'and not(starts-with(., "mailto:"))]' ) extracted_requests = [] # Extract links. for url in parsed_html.xpath(links_xpath): try: request = Request.from_url(url) except ValidationError as exc: context.log.warning(f'Skipping invalid URL "{url}": {exc}') continue extracted_requests.append(request) # Add extracted requests to the queue with the same-domain strategy. 
await context.add_requests(extracted_requests, strategy='same-domain') await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py ================================================ import asyncio from lxml import html from pydantic import ValidationError from saxonche import PySaxonProcessor from crawlee import Request from crawlee.crawlers import HttpCrawler, HttpCrawlingContext async def main() -> None: crawler = HttpCrawler( max_request_retries=1, max_requests_per_crawl=10, ) # Create Saxon processor once and reuse across requests. saxon_proc = PySaxonProcessor(license=False) xpath_proc = saxon_proc.new_xpath_processor() @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Parse HTML with lxml. parsed_html = html.fromstring(await context.http_response.read()) # Convert relative URLs to absolute before extracting links. parsed_html.make_links_absolute(context.request.url, resolve_base_href=True) # Convert parsed HTML to XML for Saxon processing. xml = html.tostring(parsed_html, encoding='unicode', method='xml') # Parse XML with Saxon. parsed_xml = saxon_proc.parse_xml(xml_text=xml) # Set the parsed context for XPath evaluation. xpath_proc.set_context(xdm_item=parsed_xml) # Extract data using XPath 2.0 string() function. data = { 'url': context.request.url, 'title': xpath_proc.evaluate_single('.//title/string()'), 'h1s': [str(h) for h in (xpath_proc.evaluate('//h1/string()') or [])], 'h2s': [str(h) for h in (xpath_proc.evaluate('//h2/string()') or [])], 'h3s': [str(h) for h in (xpath_proc.evaluate('//h3/string()') or [])], } await context.push_data(data) # XPath 2.0 with distinct-values() to get unique links and remove fragments. 
links_xpath = """ distinct-values( for $href in //a/@href[ not(starts-with(., "#")) and not(starts-with(., "javascript:")) and not(starts-with(., "mailto:")) ] return replace($href, "#.*$", "") ) """ extracted_requests = [] # Extract links. for item in xpath_proc.evaluate(links_xpath) or []: url = item.string_value try: request = Request.from_url(url) except ValidationError as exc: context.log.warning(f'Skipping invalid URL "{url}": {exc}') continue extracted_requests.append(request) # Add extracted requests to the queue with the same-domain strategy. await context.add_requests(extracted_requests, strategy='same-domain') await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/parsel_example.py ================================================ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext async def main() -> None: # Create a ParselCrawler instance crawler = ParselCrawler( # Limit the crawl to 10 requests max_requests_per_crawl=10, ) # Define the default request handler @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # Extract data using Parsel's XPath and CSS selectors data = { 'url': context.request.url, 'title': context.selector.xpath('//title/text()').get(), } # Push extracted data to the dataset await context.push_data(data) # Enqueue links found on the page for further crawling await context.enqueue_links() # Run the crawler await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/pyquery_parser.py ================================================ import asyncio from pydantic import ValidationError from pyquery import PyQuery from yarl import URL from crawlee import Request 
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext async def main() -> None: crawler = HttpCrawler( max_request_retries=1, max_requests_per_crawl=10, ) @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Parse the HTML content using PyQuery. parsed_html = PyQuery(await context.http_response.read()) # Extract data using jQuery-style selectors. data = { 'url': context.request.url, 'title': parsed_html('title').text(), 'h1s': [h1.text() for h1 in parsed_html('h1').items()], 'h2s': [h2.text() for h2 in parsed_html('h2').items()], 'h3s': [h3.text() for h3 in parsed_html('h3').items()], } await context.push_data(data) # Css selector to extract valid href attributes. links_selector = ( 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' ) base_url = URL(context.request.url) extracted_requests = [] # Extract links. for item in parsed_html(links_selector).items(): href = item.attr('href') if not href: continue # Convert relative URLs to absolute if needed. url = str(base_url.join(URL(str(href)))) try: request = Request.from_url(url) except ValidationError as exc: context.log.warning(f'Skipping invalid URL "{url}": {exc}') continue extracted_requests.append(request) # Add extracted requests to the queue with the same-domain strategy. 
await context.add_requests(extracted_requests, strategy='same-domain') await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/scrapling_parser.py ================================================ import asyncio from pydantic import ValidationError from scrapling.parser import Selector from yarl import URL from crawlee import Request from crawlee.crawlers import HttpCrawler, HttpCrawlingContext async def main() -> None: crawler = HttpCrawler( max_request_retries=1, max_requests_per_crawl=10, ) @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Parse the HTML content using Scrapling. page = Selector(await context.http_response.read(), url=context.request.url) # Extract data using Xpath selectors with .get_all_text method for full text # content. title_el = page.xpath_first('//title') data = { 'url': context.request.url, 'title': title_el.text if isinstance(title_el, Selector) else title_el, 'h1s': [ h1.get_all_text() if isinstance(h1, Selector) else h1 for h1 in page.xpath('//h1') ], 'h2s': [ h2.get_all_text() if isinstance(h2, Selector) else h2 for h2 in page.xpath('//h2') ], 'h3s': [ h3.get_all_text() if isinstance(h3, Selector) else h3 for h3 in page.xpath('//h3') ], } await context.push_data(data) # Css selector to extract valid href attributes. links_selector = ( 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' ) base_url = URL(context.request.url) extracted_requests = [] # Extract links. for item in page.css(links_selector): href = item.attrib.get('href') if isinstance(item, Selector) else None if not href: continue # Convert relative URLs to absolute if needed. 
url = str(base_url.join(URL(href))) try: request = Request.from_url(url) except ValidationError as exc: context.log.warning(f'Skipping invalid URL "{url}": {exc}') continue extracted_requests.append(request) # Add extracted requests to the queue with the same-domain strategy. await context.add_requests(extracted_requests, strategy='same-domain') await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py ================================================ import asyncio from crawlee.crawlers import ( AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext, ) from .selectolax_parser import SelectolaxLexborParser async def main() -> None: crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler( max_requests_per_crawl=10, # Use custom Selectolax parser for static content parsing. static_parser=SelectolaxLexborParser(), ) @crawler.router.default_handler async def handle_request(context: AdaptivePlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') data = { 'url': context.request.url, 'title': await context.query_selector_one('title'), } await context.push_data(data) await context.enqueue_links() await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/selectolax_context.py ================================================ from dataclasses import dataclass, fields from selectolax.lexbor import LexborHTMLParser from typing_extensions import Self from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext # Custom context for Selectolax parser, you can add your own methods here # to facilitate working with the parsed document. 
@dataclass(frozen=True) class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]): """Crawling context providing access to the parsed page. This context is passed to request handlers and includes all standard context methods (push_data, enqueue_links, etc.) plus custom helpers. """ @property def parser(self) -> LexborHTMLParser: """Convenient alias for accessing the parsed document.""" return self.parsed_content @classmethod def from_parsed_http_crawling_context( cls, context: ParsedHttpCrawlingContext[LexborHTMLParser] ) -> Self: """Create custom context from the base context. Copies all fields from the base context to preserve framework functionality while adding custom interface. """ return cls( **{field.name: getattr(context, field.name) for field in fields(context)} ) ================================================ FILE: docs/guides/code_examples/http_crawlers/selectolax_crawler.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING from selectolax.lexbor import LexborHTMLParser, LexborNode from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions from .selectolax_context import SelectolaxLexborContext from .selectolax_parser import SelectolaxLexborParser if TYPE_CHECKING: from collections.abc import AsyncGenerator from typing_extensions import Unpack from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext # Custom crawler using custom context, It is optional and you can use # AbstractHttpCrawler directly with SelectolaxLexborParser if you don't need # any custom context methods. class SelectolaxLexborCrawler( AbstractHttpCrawler[SelectolaxLexborContext, LexborHTMLParser, LexborNode] ): """Custom crawler using Selectolax Lexbor for HTML parsing.""" def __init__( self, **kwargs: Unpack[HttpCrawlerOptions[SelectolaxLexborContext]], ) -> None: # Final step converts the base context to custom context type. 
async def final_step( context: ParsedHttpCrawlingContext[LexborHTMLParser], ) -> AsyncGenerator[SelectolaxLexborContext, None]: # Yield custom context wrapping with additional functionality around the base # context. yield SelectolaxLexborContext.from_parsed_http_crawling_context(context) # Build context pipeline: HTTP request -> parsing -> custom context. kwargs['_context_pipeline'] = ( self._create_static_content_crawler_pipeline().compose(final_step) ) super().__init__( parser=SelectolaxLexborParser(), **kwargs, ) ================================================ FILE: docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py ================================================ import asyncio from .selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler async def main() -> None: crawler = SelectolaxLexborCrawler( max_requests_per_crawl=10, ) @crawler.router.default_handler async def handle_request(context: SelectolaxLexborContext) -> None: context.log.info(f'Processing {context.request.url} ...') data = { 'url': context.request.url, 'title': context.parser.css_first('title').text(), } await context.push_data(data) await context.enqueue_links() await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/http_crawlers/selectolax_parser.py ================================================ from __future__ import annotations import asyncio from typing import TYPE_CHECKING from selectolax.lexbor import LexborHTMLParser, LexborNode from typing_extensions import override from crawlee.crawlers._abstract_http import AbstractHttpParser if TYPE_CHECKING: from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse class SelectolaxLexborParser(AbstractHttpParser[LexborHTMLParser, LexborNode]): """Parser for parsing HTTP response using Selectolax Lexbor.""" @override async def parse(self, response: HttpResponse) 
-> LexborHTMLParser: """Parse HTTP response body into a document object.""" response_body = await response.read() # Run parsing in a thread to avoid blocking the event loop. return await asyncio.to_thread(LexborHTMLParser, response_body) @override async def parse_text(self, text: str) -> LexborHTMLParser: """Parse raw HTML string into a document object.""" return LexborHTMLParser(text) @override async def select( self, parsed_content: LexborHTMLParser, selector: str ) -> Sequence[LexborNode]: """Select elements matching a CSS selector.""" return tuple(item for item in parsed_content.css(selector)) @override def is_matching_selector( self, parsed_content: LexborHTMLParser, selector: str ) -> bool: """Check if any element matches the selector.""" return parsed_content.css_first(selector) is not None @override def find_links( self, parsed_content: LexborHTMLParser, selector: str, attribute: str ) -> Iterable[str]: """Extract href attributes from elements matching the selector. Used by `enqueue_links` helper to discover URLs. 
""" link: LexborNode urls: list[str] = [] for link in parsed_content.css(selector): url = link.attributes.get(attribute) if url: urls.append(url.strip()) return urls ================================================ FILE: docs/guides/code_examples/login_crawler/http_login.py ================================================ import asyncio import json from datetime import datetime, timedelta from crawlee import ConcurrencySettings, Request from crawlee.crawlers import ( HttpCrawler, HttpCrawlingContext, ) from crawlee.sessions import SessionPool async def main() -> None: crawler = HttpCrawler( max_requests_per_crawl=10, # Configure to use a single persistent session throughout the crawl max_session_rotations=0, # Limit request rate to avoid triggering anti-scraping measures concurrency_settings=ConcurrencySettings(max_tasks_per_minute=30), session_pool=SessionPool( max_pool_size=1, create_session_settings={ # Set high value to ensure the session isn't replaced during crawling 'max_usage_count': 999_999, # Set high value to prevent session expiration during crawling 'max_age': timedelta(hours=999_999), # Higher error tolerance before the session is considered blocked # Make sure you implement proper error handling in your code 'max_error_score': 100, }, ), ) # Default request handler for normal page processing @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Specialized handler for the login API request @crawler.router.handler('login') async def login_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing login at {context.request.url} ...') # Verify that a session is available before proceeding if not context.session: raise RuntimeError('Session not found') # Parse the API response containing authentication tokens and user data data = json.loads(await context.http_response.read()) # Extract authentication data from the response token = 
data['token'] expires = data['expires'].replace('Z', '+00:00') expires_int = int(datetime.fromisoformat(expires).timestamp()) user_id = data['userId'] username = data['username'] # Set authentication cookies in the session that will be used # for subsequent requests context.session.cookies.set(name='token', value=token, expires=expires_int) context.session.cookies.set(name='userID', value=user_id) context.session.cookies.set(name='userName', value=username) # After successful authentication, continue crawling with the # authenticated session await context.add_requests(['https://demoqa.com/BookStore/v1/Books']) # Create a POST request to the authentication API endpoint # This will trigger the login_handler when executed request = Request.from_url( 'https://demoqa.com/Account/v1/Login', label='login', method='POST', payload=json.dumps( {'userName': 'crawlee_test', 'password': 'Test1234!'} ).encode(), headers={'Content-Type': 'application/json'}, ) # Start the crawling process with the login request await crawler.run([request]) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/login_crawler/playwright_login.py ================================================ import asyncio from datetime import timedelta from crawlee import ConcurrencySettings, Request from crawlee.crawlers import ( PlaywrightCrawler, PlaywrightCrawlingContext, ) from crawlee.sessions import SessionPool async def main() -> None: crawler = PlaywrightCrawler( max_requests_per_crawl=10, headless=True, browser_type='chromium', # We only have one session and it shouldn't rotate max_session_rotations=0, # Limit crawling intensity to avoid blocking concurrency_settings=ConcurrencySettings(max_tasks_per_minute=30), session_pool=SessionPool( # Limit the pool to one session max_pool_size=1, create_session_settings={ # High value for session usage limit 'max_usage_count': 999_999, # High value for session lifetime 'max_age': 
timedelta(hours=999_999), # High score allows the session to encounter more errors # before crawlee decides the session is blocked # Make sure you know how to handle these errors 'max_error_score': 100, }, ), ) # The main handler for processing requests @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # A handler for the login page @crawler.router.handler('login') async def login_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing login {context.request.url} ...') # Check if the session is available if not context.session: raise RuntimeError('Session not found') # Entering data into the form, `delay` to simulate human typing # Without this, the data will be entered instantly await context.page.type('#userName', 'crawlee_test', delay=100) await context.page.type('#password', 'Test1234!', delay=100) await context.page.click('#login', delay=100) # Wait for an element confirming that we have successfully # logged in to the site await context.page.locator('#userName-value').first.wait_for(state='visible') context.log.info('Login successful!') # Moving on to the basic flow of crawling await context.add_requests(['https://demoqa.com/books']) # We start crawling with login. 
This is necessary to access the rest of the pages await crawler.run([Request.from_url('https://demoqa.com/login', label='login')]) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler/browser_configuration_example.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler( headless=False, browser_type='chromium', # Browser launch options browser_launch_options={ # For support `msedge` channel you need to install it # `playwright install msedge` 'channel': 'msedge', 'slow_mo': 200, }, # Context launch options, applied to each page as it is created browser_new_context_options={ 'color_scheme': 'dark', # Set headers 'extra_http_headers': { 'Custom-Header': 'my-header', 'Accept-Language': 'en', }, # Set only User Agent 'user_agent': 'My-User-Agent', }, # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') await context.enqueue_links() # Run the crawler with the initial list of URLs. 
await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler/browser_pool_page_hooks_example.py ================================================ from __future__ import annotations import asyncio import logging from typing import TYPE_CHECKING, Any from crawlee.browsers import BrowserPool from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.storages import KeyValueStore if TYPE_CHECKING: from crawlee.browsers._browser_controller import BrowserController from crawlee.browsers._types import CrawleePage from crawlee.proxy_configuration import ProxyInfo logger = logging.getLogger(__name__) async def main() -> None: async with BrowserPool() as browser_pool: @browser_pool.pre_page_create_hook async def log_page_init( page_id: str, _browser_controller: BrowserController, _browser_new_context_options: dict[str, Any], _proxy_info: ProxyInfo | None, ) -> None: """Log when a new page is about to be created.""" logger.info(f'Creating page {page_id}...') @browser_pool.post_page_create_hook async def set_viewport( crawlee_page: CrawleePage, _browser_controller: BrowserController ) -> None: """Set a fixed viewport size on each newly created page.""" await crawlee_page.page.set_viewport_size({'width': 1280, 'height': 1024}) @browser_pool.pre_page_close_hook async def save_screenshot( crawlee_page: CrawleePage, _browser_controller: BrowserController ) -> None: """Save a screenshot to KeyValueStore before each page is closed.""" kvs = await KeyValueStore.open() screenshot = await crawlee_page.page.screenshot() await kvs.set_value( key=f'screenshot-{crawlee_page.id}', value=screenshot, content_type='image/png', ) logger.info(f'Saved screenshot for page {crawlee_page.id}.') @browser_pool.post_page_close_hook async def log_page_closed( page_id: str, _browser_controller: BrowserController ) -> None: """Log after each page is 
closed.""" logger.info(f'Page {page_id} closed successfully.') crawler = PlaywrightCrawler( browser_pool=browser_pool, max_requests_per_crawl=5, ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') await context.enqueue_links() # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler/multiple_launch_example.py ================================================ import asyncio from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: # Create a plugin for each required browser. plugin_chromium = PlaywrightBrowserPlugin( browser_type='chromium', max_open_pages_per_browser=1 ) plugin_firefox = PlaywrightBrowserPlugin( browser_type='firefox', max_open_pages_per_browser=1 ) crawler = PlaywrightCrawler( browser_pool=BrowserPool(plugins=[plugin_chromium, plugin_firefox]), # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: browser_name = ( context.page.context.browser.browser_type.name if context.page.context.browser else 'undefined' ) context.log.info(f'Processing {context.request.url} with {browser_name} ...') await context.enqueue_links() # Run the crawler with the initial list of URLs. 
await crawler.run(['https://crawlee.dev', 'https://apify.com/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler/navigation_hooks_example.py ================================================ import asyncio from crawlee.crawlers import ( PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPostNavCrawlingContext, PlaywrightPreNavCrawlingContext, ) from crawlee.errors import SessionError async def main() -> None: crawler = PlaywrightCrawler(max_requests_per_crawl=10) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') await context.enqueue_links() @crawler.pre_navigation_hook async def configure_page(context: PlaywrightPreNavCrawlingContext) -> None: context.log.info(f'Navigating to {context.request.url} ...') # block stylesheets, images, fonts and other static assets # to speed up page loading await context.block_requests() @crawler.post_navigation_hook async def custom_captcha_check(context: PlaywrightPostNavCrawlingContext) -> None: # check if the page contains a captcha captcha_element = context.page.locator('input[name="captcha"]').first if await captcha_element.is_visible(): context.log.warning('Captcha detected! Skipping the page.') raise SessionError('Captcha detected') # Run the crawler with the initial list of URLs. 
await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py ================================================ import asyncio from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin from crawlee.crawlers import PlaywrightCrawler async def main() -> None: crawler = PlaywrightCrawler( browser_pool=BrowserPool( plugins=[ PlaywrightBrowserPlugin( browser_type='chromium', browser_launch_options={ 'headless': False, 'channel': 'msedge', 'slow_mo': 200, }, browser_new_context_options={ 'color_scheme': 'dark', 'extra_http_headers': { 'Custom-Header': 'my-header', 'Accept-Language': 'en', }, 'user_agent': 'My-User-Agent', }, ) ] ) ) # ... if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler_adaptive/handler.py ================================================ import asyncio from datetime import timedelta from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext async def main() -> None: crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser() @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: # Locate element h2 within 5 seconds h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000)) # Do stuff with element found by the selector context.log.info(h2) await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py ================================================ import asyncio from crawlee.crawlers import AdaptivePlaywrightCrawler async def main() -> None: crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( # Arguments 
relevant only for PlaywrightCrawler playwright_crawler_specific_kwargs={ 'headless': False, 'browser_type': 'chromium', }, # Common arguments relevant to all crawlers max_crawl_depth=5, ) # ... if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler_adaptive/init_parsel.py ================================================ import asyncio from crawlee.crawlers import AdaptivePlaywrightCrawler async def main() -> None: crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( # Arguments relevant only for PlaywrightCrawler playwright_crawler_specific_kwargs={ 'headless': False, 'browser_type': 'chromium', }, # Common arguments relevant to all crawlers max_crawl_depth=5, ) # ... if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler_adaptive/init_prediction.py ================================================ import asyncio from crawlee import Request from crawlee._types import RequestHandlerRunResult from crawlee.crawlers import ( AdaptivePlaywrightCrawler, RenderingType, RenderingTypePrediction, RenderingTypePredictor, ) class CustomRenderingTypePredictor(RenderingTypePredictor): def __init__(self) -> None: super().__init__() self._learning_data = list[tuple[Request, RenderingType]]() def predict(self, request: Request) -> RenderingTypePrediction: # Some custom logic that produces some `RenderingTypePrediction` # based on the `request` input. rendering_type: RenderingType = ( 'static' if 'abc' in request.url else 'client only' ) return RenderingTypePrediction( # Recommends `static` rendering type -> HTTP-based sub crawler will be used. rendering_type=rendering_type, # Recommends that both sub crawlers should run with 20% chance. When both sub # crawlers are running, the predictor can compare results and learn. 
# High number means that predictor is not very confident about the # `rendering_type`, low number means that predictor is very confident. detection_probability_recommendation=0.2, ) def store_result(self, request: Request, rendering_type: RenderingType) -> None: # This function allows predictor to store new learning data and retrain itself # if needed. `request` is input for prediction and `rendering_type` is the correct # prediction. self._learning_data.append((request, rendering_type)) # retrain def result_checker(result: RequestHandlerRunResult) -> bool: # Some function that inspects produced `result` and returns `True` if the result # is correct. return bool(result) # Check something on result def result_comparator( result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult ) -> bool: # Some function that inspects two results and returns `True` if they are # considered equivalent. It is used when comparing results produced by HTTP-based # sub crawler and playwright based sub crawler. return ( result_1.push_data_calls == result_2.push_data_calls ) # For example compare `push_data` calls. async def main() -> None: crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( rendering_type_predictor=CustomRenderingTypePredictor(), result_checker=result_checker, result_comparator=result_comparator, ) # ... if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py ================================================ import asyncio from playwright.async_api import Route from crawlee.crawlers import ( AdaptivePlaywrightCrawler, AdaptivePlaywrightPreNavCrawlingContext, ) async def main() -> None: crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() @crawler.pre_navigation_hook async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: """Hook executed both in static sub crawler and playwright sub crawler. 
Trying to access `context.page` in this hook would raise `AdaptiveContextError` for pages crawled without playwright. """ context.log.info(f'pre navigation hook for: {context.request.url}') @crawler.pre_navigation_hook(playwright_only=True) async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: """Hook executed only in playwright sub crawler.""" async def some_routing_function(route: Route) -> None: await route.continue_() await context.page.route('*/**', some_routing_function) context.log.info( f'Playwright only pre navigation hook for: {context.request.url}' ) await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler_stagehand/__init__.py ================================================ ================================================ FILE: docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py ================================================ from __future__ import annotations from datetime import datetime, timezone from typing import TYPE_CHECKING, Any, cast from stagehand.context import StagehandContext from typing_extensions import override from crawlee.browsers import ( PlaywrightBrowserController, PlaywrightBrowserPlugin, PlaywrightPersistentBrowser, ) from .support_classes import CrawleeStagehandPage if TYPE_CHECKING: from collections.abc import Mapping from playwright.async_api import Page from stagehand import Stagehand from crawlee.proxy_configuration import ProxyInfo class StagehandBrowserController(PlaywrightBrowserController): @override def __init__( self, browser: PlaywrightPersistentBrowser, stagehand: Stagehand, **kwargs: Any ) -> None: # Initialize with browser context instead of browser instance super().__init__(browser, **kwargs) self._stagehand = stagehand self._stagehand_context: StagehandContext | None = None @override async def new_page( self, 
browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None, ) -> Page: # Initialize browser context if not already done if not self._browser_context: self._browser_context = await self._create_browser_context( browser_new_context_options=browser_new_context_options, proxy_info=proxy_info, ) # Initialize Stagehand context if not already done if not self._stagehand_context: self._stagehand_context = await StagehandContext.init( self._browser_context, self._stagehand ) # Create a new page using Stagehand context page = await self._stagehand_context.new_page() pw_page = page._page # noqa: SLF001 # Handle page close event pw_page.on(event='close', f=self._on_page_close) # Update internal state self._pages.append(pw_page) self._last_page_opened_at = datetime.now(timezone.utc) self._total_opened_pages += 1 # Wrap StagehandPage to provide Playwright Page interface return cast('Page', CrawleeStagehandPage(page)) class StagehandPlugin(PlaywrightBrowserPlugin): """Browser plugin that integrates Stagehand with Crawlee's browser management.""" @override def __init__(self, stagehand: Stagehand, **kwargs: Any) -> None: super().__init__(**kwargs) self._stagehand = stagehand @override async def new_browser(self) -> StagehandBrowserController: if not self._playwright: raise RuntimeError('Playwright browser plugin is not initialized.') browser = PlaywrightPersistentBrowser( # Stagehand can run only on a Chromium-based browser. 
self._playwright.chromium, self._user_data_dir, self._browser_launch_options, ) # Return custom controller with Stagehand return StagehandBrowserController( browser=browser, stagehand=self._stagehand, header_generator=None, fingerprint_generator=None, ) ================================================ FILE: docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py ================================================ from __future__ import annotations import asyncio import os from typing import cast from stagehand import StagehandConfig, StagehandPage from crawlee import ConcurrencySettings from crawlee.browsers import BrowserPool from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from .browser_classes import StagehandPlugin from .support_classes import CrawleeStagehand async def main() -> None: # Configure local Stagehand with Gemini model config = StagehandConfig( env='LOCAL', model_name='google/gemini-2.5-flash-preview-05-20', model_api_key=os.getenv('GEMINI_API_KEY'), ) # Create Stagehand instance stagehand = CrawleeStagehand(config) # Create crawler with custom browser pool using Stagehand crawler = PlaywrightCrawler( # Limit the crawl to max requests. Remove or increase it for crawling all links. max_requests_per_crawl=10, # Custom browser pool. Gives users full control over browsers used by the crawler. concurrency_settings=ConcurrencySettings(max_tasks_per_minute=10), browser_pool=BrowserPool( plugins=[ StagehandPlugin(stagehand, browser_launch_options={'headless': True}) ], ), ) # Define the default request handler, which will be called for every request. 
@crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Cast to StagehandPage for proper type hints in IDE page = cast('StagehandPage', context.page) # Use regular Playwright method playwright_title = await page.title() context.log.info(f'Playwright page title: {playwright_title}') # highlight-start # Use AI-powered extraction with natural language gemini_title = await page.extract('Extract page title') context.log.info(f'Gemini page title: {gemini_title}') # highlight-end await context.enqueue_links() # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Any from stagehand import Stagehand, StagehandPage if TYPE_CHECKING: from types import TracebackType class CrawleeStagehandPage: """StagehandPage wrapper for Crawlee.""" def __init__(self, page: StagehandPage) -> None: self._page = page async def goto( self, url: str, *, referer: str | None = None, timeout: int | None = None, wait_until: str | None = None, ) -> Any: """Navigate to the specified URL.""" # Override goto to return navigation result that `PlaywrightCrawler` expects return await self._page._page.goto( # noqa: SLF001 url, referer=referer, timeout=timeout, wait_until=wait_until, ) def __getattr__(self, name: str) -> Any: """Delegate all other methods to the underlying StagehandPage.""" return getattr(self._page, name) async def __aenter__(self) -> CrawleeStagehandPage: """Enter the context manager.""" return self async def __aexit__( self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None, ) -> None: await 
self._page.close() class CrawleeStagehand(Stagehand): """Stagehand wrapper for Crawlee to disable the launch of Playwright.""" async def init(self) -> None: # Skip Stagehand's own Playwright initialization # Let Crawlee's PlaywrightBrowserPlugin manage the browser lifecycle self._initialized = True ================================================ FILE: docs/guides/code_examples/proxy_management/inspecting_bs_example.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from crawlee.proxy_configuration import ProxyConfiguration async def main() -> None: # Create a ProxyConfiguration object and pass it to the crawler. proxy_configuration = ProxyConfiguration( proxy_urls=[ 'http://proxy-1.com/', 'http://proxy-2.com/', ] ) crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def default_handler(context: BeautifulSoupCrawlingContext) -> None: # Log the proxy used for the current request. context.log.info(f'Proxy for the current request: {context.proxy_info}') # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/proxy_management/inspecting_pw_example.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.proxy_configuration import ProxyConfiguration async def main() -> None: # Create a ProxyConfiguration object and pass it to the crawler. 
proxy_configuration = ProxyConfiguration( proxy_urls=[ 'http://proxy-1.com/', 'http://proxy-2.com/', ] ) crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def default_handler(context: PlaywrightCrawlingContext) -> None: # Log the proxy used for the current request. context.log.info(f'Proxy for the current request: {context.proxy_info}') # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/proxy_management/integration_bs_example.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from crawlee.proxy_configuration import ProxyConfiguration async def main() -> None: # Create a ProxyConfiguration object and pass it to the crawler. proxy_configuration = ProxyConfiguration( proxy_urls=[ 'http://proxy-1.com/', 'http://proxy-2.com/', ] ) crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def default_handler(context: BeautifulSoupCrawlingContext) -> None: # Extract data from the page. data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, } context.log.info(f'Extracted data: {data}') # Run the crawler with the initial list of requests. 
await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/proxy_management/integration_pw_example.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.proxy_configuration import ProxyConfiguration async def main() -> None: # Create a ProxyConfiguration object and pass it to the crawler. proxy_configuration = ProxyConfiguration( proxy_urls=[ 'http://proxy-1.com/', 'http://proxy-2.com/', ] ) crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def default_handler(context: PlaywrightCrawlingContext) -> None: # Extract data from the page. data = { 'url': context.request.url, 'title': await context.page.title(), } context.log.info(f'Extracted data: {data}') # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/proxy_management/quick_start_example.py ================================================ import asyncio from crawlee.proxy_configuration import ProxyConfiguration async def main() -> None: proxy_configuration = ProxyConfiguration( proxy_urls=[ 'http://proxy-1.com/', 'http://proxy-2.com/', ] ) # The proxy URLs are rotated in a round-robin. 
proxy_url_1 = await proxy_configuration.new_url() # http://proxy-1.com/ proxy_url_2 = await proxy_configuration.new_url() # http://proxy-2.com/ proxy_url_3 = await proxy_configuration.new_url() # http://proxy-1.com/ if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/proxy_management/session_bs_example.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler from crawlee.proxy_configuration import ProxyConfiguration async def main() -> None: # Create a ProxyConfiguration object and pass it to the crawler. proxy_configuration = ProxyConfiguration( proxy_urls=[ 'http://proxy-1.com/', 'http://proxy-2.com/', ] ) crawler = BeautifulSoupCrawler( proxy_configuration=proxy_configuration, use_session_pool=True, ) # ... if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/proxy_management/session_pw_example.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler from crawlee.proxy_configuration import ProxyConfiguration async def main() -> None: # Create a ProxyConfiguration object and pass it to the crawler. proxy_configuration = ProxyConfiguration( proxy_urls=[ 'http://proxy-1.com/', 'http://proxy-2.com/', ] ) crawler = PlaywrightCrawler( proxy_configuration=proxy_configuration, use_session_pool=True, ) # ... if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/proxy_management/tiers_bs_example.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from crawlee.proxy_configuration import ProxyConfiguration async def main() -> None: # Create a ProxyConfiguration object and pass it to the crawler. 
proxy_configuration = ProxyConfiguration( tiered_proxy_urls=[ # No proxy tier. # Optional in case you do not want to use any proxy on lowest tier. [None], # lower tier, cheaper, preferred as long as they work [ 'http://cheap-datacenter-proxy-1.com/', 'http://cheap-datacenter-proxy-2.com/', ], # higher tier, more expensive, used as a fallback [ 'http://expensive-residential-proxy-1.com/', 'http://expensive-residential-proxy-2.com/', ], ] ) crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def default_handler(context: BeautifulSoupCrawlingContext) -> None: # Log the proxy used for the current request. context.log.info(f'Proxy for the current request: {context.proxy_info}') # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/proxy_management/tiers_pw_example.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.proxy_configuration import ProxyConfiguration async def main() -> None: # Create a ProxyConfiguration object and pass it to the crawler. proxy_configuration = ProxyConfiguration( tiered_proxy_urls=[ # No proxy tier. # Optional in case you do not want to use any proxy on lowest tier. [None], # lower tier, cheaper, preferred as long as they work [ 'http://cheap-datacenter-proxy-1.com/', 'http://cheap-datacenter-proxy-2.com/', ], # higher tier, more expensive, used as a fallback [ 'http://expensive-residential-proxy-1.com/', 'http://expensive-residential-proxy-2.com/', ], ] ) crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration) # Define the default request handler, which will be called for every request. 
@crawler.router.default_handler async def default_handler(context: PlaywrightCrawlingContext) -> None: # Log the proxy used for the current request. context.log.info(f'Proxy for the current request: {context.proxy_info}') # Run the crawler with the initial list of requests. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_loaders/rl_basic_example.py ================================================ import asyncio from crawlee.request_loaders import RequestList async def main() -> None: # Open the request list, if it does not exist, it will be created. # Leave name empty to use the default request list. request_list = RequestList( name='my-request-list', requests=[ 'https://apify.com/', 'https://crawlee.dev/', 'https://crawlee.dev/python/', ], ) # Fetch and process requests from the queue. while request := await request_list.fetch_next_request(): # Do something with it... print(f'Processing {request.url}') # And mark it as handled. await request_list.mark_request_as_handled(request) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_loaders/rl_basic_example_with_persist.py ================================================ import asyncio import logging from crawlee import service_locator from crawlee.request_loaders import RequestList logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s') logger = logging.getLogger(__name__) # Disable clearing the `KeyValueStore` on each run. # This is necessary so that the state keys are not cleared at startup. 
# The recommended way to achieve this behavior is setting the environment variable # `CRAWLEE_PURGE_ON_START=0` configuration = service_locator.get_configuration() configuration.purge_on_start = False async def main() -> None: # Open the request list, if it does not exist, it will be created. # Leave name empty to use the default request list. request_list = RequestList( name='my-request-list', requests=[ 'https://apify.com/', 'https://crawlee.dev/', 'https://crawlee.dev/python/', ], # Enable persistence persist_state_key='my-persist-state', persist_requests_key='my-persist-requests', ) # We receive only one request. # Each time you run it, it will be a new request until you exhaust the `RequestList`. request = await request_list.fetch_next_request() if request: logger.info(f'Processing request: {request.url}') # Do something with it... # And mark it as handled. await request_list.mark_request_as_handled(request) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_loaders/rl_tandem_example.py ================================================ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.request_loaders import RequestList async def main() -> None: # Create a static request list. request_list = RequestList(['https://crawlee.dev', 'https://apify.com']) # highlight-start # Convert the request list to a request manager using the to_tandem method. # It is a tandem with the default request queue. request_manager = await request_list.to_tandem() # highlight-end # Create a crawler and pass the request manager to it. crawler = ParselCrawler( request_manager=request_manager, max_requests_per_crawl=10, # Limit the max requests per crawl. ) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # New links will be enqueued directly to the queue. 
await context.enqueue_links() # Extract data using Parsel's XPath and CSS selectors. data = { 'url': context.request.url, 'title': context.selector.xpath('//title/text()').get(), } # Push extracted data to the dataset. await context.push_data(data) await crawler.run() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py ================================================ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.request_loaders import RequestList, RequestManagerTandem from crawlee.storages import RequestQueue async def main() -> None: # Create a static request list. request_list = RequestList(['https://crawlee.dev', 'https://apify.com']) # Open the default request queue. request_queue = await RequestQueue.open() # And combine them together to a sinhle request manager. request_manager = RequestManagerTandem(request_list, request_queue) # Create a crawler and pass the request manager to it. crawler = ParselCrawler( request_manager=request_manager, max_requests_per_crawl=10, # Limit the max requests per crawl. ) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # New links will be enqueued directly to the queue. await context.enqueue_links() # Extract data using Parsel's XPath and CSS selectors. data = { 'url': context.request.url, 'title': context.selector.xpath('//title/text()').get(), } # Push extracted data to the dataset. 
await context.push_data(data) await crawler.run() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_loaders/sitemap_basic_example.py ================================================ import asyncio import re from crawlee.http_clients import ImpitHttpClient from crawlee.request_loaders import SitemapRequestLoader async def main() -> None: # Create an HTTP client for fetching the sitemap. http_client = ImpitHttpClient() # Create a sitemap request loader with filtering rules. sitemap_loader = SitemapRequestLoader( sitemap_urls=['https://crawlee.dev/sitemap.xml'], http_client=http_client, include=[re.compile(r'.*docs.*')], # Only include URLs containing 'docs'. max_buffer_size=500, # Keep up to 500 URLs in memory before processing. ) # We work with the loader until we process all relevant links from the sitemap. while request := await sitemap_loader.fetch_next_request(): # Do something with it... print(f'Processing {request.url}') # And mark it as handled. await sitemap_loader.mark_request_as_handled(request) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_loaders/sitemap_example_with_persist.py ================================================ import asyncio import logging from crawlee import service_locator from crawlee.http_clients import ImpitHttpClient from crawlee.request_loaders import SitemapRequestLoader logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s') logger = logging.getLogger(__name__) # Disable clearing the `KeyValueStore` on each run. # This is necessary so that the state keys are not cleared at startup. 
# The recommended way to achieve this behavior is setting the environment variable # `CRAWLEE_PURGE_ON_START=0` configuration = service_locator.get_configuration() configuration.purge_on_start = False async def main() -> None: # Create an HTTP client for fetching sitemaps # Use the context manager for `SitemapRequestLoader` to correctly save the state when # the work is completed. async with ( ImpitHttpClient() as http_client, SitemapRequestLoader( sitemap_urls=['https://crawlee.dev/sitemap.xml'], http_client=http_client, # Enable persistence persist_state_key='my-persist-state', ) as sitemap_loader, ): # We receive only one request. # Each time you run it, it will be a new request until you exhaust the sitemap. request = await sitemap_loader.fetch_next_request() if request: logger.info(f'Processing request: {request.url}') # Do something with it... # And mark it as handled. await sitemap_loader.mark_request_as_handled(request) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_loaders/sitemap_tandem_example.py ================================================ import asyncio import re from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.http_clients import ImpitHttpClient from crawlee.request_loaders import SitemapRequestLoader async def main() -> None: # Create an HTTP client for fetching the sitemap. http_client = ImpitHttpClient() # Create a sitemap request loader with filtering rules. sitemap_loader = SitemapRequestLoader( sitemap_urls=['https://crawlee.dev/sitemap.xml'], http_client=http_client, include=[re.compile(r'.*docs.*')], # Only include URLs containing 'docs'. max_buffer_size=500, # Keep up to 500 URLs in memory before processing. ) # highlight-start # Convert the sitemap loader into a request manager linked # to the default request queue. 
request_manager = await sitemap_loader.to_tandem() # highlight-end # Create a crawler and pass the request manager to it. crawler = ParselCrawler( request_manager=request_manager, max_requests_per_crawl=10, # Limit the max requests per crawl. ) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # New links will be enqueued directly to the queue. await context.enqueue_links() # Extract data using Parsel's XPath and CSS selectors. data = { 'url': context.request.url, 'title': context.selector.xpath('//title/text()').get(), } # Push extracted data to the dataset. await context.push_data(data) await crawler.run() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py ================================================ import asyncio import re from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.http_clients import ImpitHttpClient from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader from crawlee.storages import RequestQueue async def main() -> None: # Create an HTTP client for fetching the sitemap. http_client = ImpitHttpClient() # Create a sitemap request loader with filtering rules. sitemap_loader = SitemapRequestLoader( sitemap_urls=['https://crawlee.dev/sitemap.xml'], http_client=http_client, include=[re.compile(r'.*docs.*')], # Only include URLs containing 'docs'. max_buffer_size=500, # Keep up to 500 URLs in memory before processing. ) # Open the default request queue. request_queue = await RequestQueue.open() # And combine them together to a single request manager. request_manager = RequestManagerTandem(sitemap_loader, request_queue) # Create a crawler and pass the request manager to it. 
crawler = ParselCrawler( request_manager=request_manager, max_requests_per_crawl=10, # Limit the max requests per crawl. ) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # New links will be enqueued directly to the queue. await context.enqueue_links() # Extract data using Parsel's XPath and CSS selectors. data = { 'url': context.request.url, 'title': context.selector.xpath('//title/text()').get(), } # Push extracted data to the dataset. await context.push_data(data) await crawler.run() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_router/adaptive_crawler_handlers.py ================================================ import asyncio from crawlee import HttpHeaders from crawlee.crawlers import ( AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) async def main() -> None: crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_requests_per_crawl=10, # Limit the max requests per crawl. ) @crawler.pre_navigation_hook async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: # Common pre-navigation hook - runs for both HTTP and browser requests. context.request.headers |= HttpHeaders( {'Accept': 'text/html,application/xhtml+xml'}, ) @crawler.pre_navigation_hook(playwright_only=True) async def browser_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: # Playwright-specific pre-navigation hook - runs only when browser is used. await context.page.set_viewport_size({'width': 1280, 'height': 720}) if context.block_requests: await context.block_requests(extra_url_patterns=['*.css', '*.js']) @crawler.router.default_handler async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None: # Extract title using the unified context interface. 
title_tag = context.parsed_content.find('title') title = title_tag.get_text() if title_tag else None # Extract other data consistently across both modes. links = [a.get('href') for a in context.parsed_content.find_all('a', href=True)] await context.push_data( { 'url': context.request.url, 'title': title, 'links': links, } ) await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_router/basic_request_handlers.py ================================================ import asyncio from crawlee import Request from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.router import Router async def main() -> None: # Create a custom router instance router = Router[ParselCrawlingContext]() # Define the default handler (fallback for requests without specific labels) @router.default_handler async def default_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing homepage: {context.request.url}') # Extract page title title = context.selector.css('title::text').get() or 'No title found' await context.push_data( { 'url': context.request.url, 'title': title, 'page_type': 'homepage', } ) # Find and enqueue collection/category links await context.enqueue_links(selector='a[href*="/collections/"]', label='CATEGORY') # Define a handler for category pages @router.handler('CATEGORY') async def category_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing category page: {context.request.url}') # Extract category information category_title = context.selector.css('h1::text').get() or 'Unknown Category' product_count = len(context.selector.css('.product-item').getall()) await context.push_data( { 'url': context.request.url, 'type': 'category', 'category_title': category_title, 'product_count': product_count, 'handler': 'category', } ) # Enqueue product links from this category await 
context.enqueue_links(selector='a[href*="/products/"]', label='PRODUCT') # Define a handler for product detail pages @router.handler('PRODUCT') async def product_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing product page: {context.request.url}') # Extract detailed product information product_data = { 'url': context.request.url, 'name': context.selector.css('h1::text').get(), 'price': context.selector.css('.price::text').get(), 'description': context.selector.css('.product-description p::text').get(), 'images': context.selector.css('.product-gallery img::attr(src)').getall(), 'in_stock': bool(context.selector.css('.add-to-cart-button').get()), 'handler': 'product', } await context.push_data(product_data) # Create crawler with the router crawler = ParselCrawler( request_handler=router, max_requests_per_crawl=10, # Limit the max requests per crawl. ) # Start crawling with some initial requests await crawler.run( [ # Will use default handler 'https://warehouse-theme-metal.myshopify.com/', # Will use category handler Request.from_url( 'https://warehouse-theme-metal.myshopify.com/collections/all', label='CATEGORY', ), ] ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_router/custom_router_default_only.py ================================================ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.router import Router async def main() -> None: # Create a custom router instance router = Router[ParselCrawlingContext]() # Define only a default handler @router.default_handler async def default_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # Extract page title title = context.selector.css('title::text').get() or 'No title found' # Extract and save basic page data await context.push_data( { 'url': context.request.url, 'title': title, } ) # Find and 
enqueue product links for further crawling await context.enqueue_links( selector='a[href*="/products/"]', label='PRODUCT', # Note: no handler for this label, will use default ) # Create crawler with the custom router crawler = ParselCrawler( request_handler=router, max_requests_per_crawl=10, # Limit the max requests per crawl. ) # Start crawling await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_router/error_handler.py ================================================ import asyncio from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext from crawlee.errors import HttpStatusCodeError # HTTP status code constants TOO_MANY_REQUESTS = 429 async def main() -> None: # Create a crawler instance crawler = ParselCrawler( max_requests_per_crawl=10, # Limit the max requests per crawl. ) @crawler.router.default_handler async def default_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # Extract product information (might fail for some pages) product_name = context.selector.css('h1[data-testid="product-title"]::text').get() if not product_name: raise ValueError('Product name not found - might be a non-product page') price = context.selector.css('.price::text').get() await context.push_data( { 'url': context.request.url, 'product_name': product_name, 'price': price, } ) # Error handler - called when an error occurs during request processing @crawler.error_handler async def error_handler(context: BasicCrawlingContext, error: Exception) -> None: error_name = type(error).__name__ context.log.warning(f'Error occurred for {context.request.url}: {error_name}') # You can modify the request or context here before retry if ( isinstance(error, HttpStatusCodeError) and error.status_code == TOO_MANY_REQUESTS ): context.log.info('Rate limited - will retry 
with delay') # You could modify headers, add delay, etc. elif isinstance(error, ValueError): context.log.info('Parse error - marking request as no retry') context.request.no_retry = True # Start crawling await crawler.run( [ 'https://warehouse-theme-metal.myshopify.com/products/on-running-cloudmonster-2-mens', # Might cause parse error 'https://warehouse-theme-metal.myshopify.com/collections/mens-running', ] ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_router/failed_request_handler.py ================================================ import asyncio from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext async def main() -> None: # Create a crawler instance with retry settings crawler = ParselCrawler( max_requests_per_crawl=10, # Limit the max requests per crawl. max_request_retries=2, # Allow 2 retries before failing ) @crawler.router.default_handler async def default_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # Extract product information product_name = context.selector.css('h1[data-testid="product-title"]::text').get() if not product_name: product_name = context.selector.css('h1::text').get() or 'Unknown Product' price = context.selector.css('.price::text').get() or 'Price not available' await context.push_data( { 'url': context.request.url, 'product_name': product_name, 'price': price, 'status': 'success', } ) # Failed request handler - called when request has exhausted all retries @crawler.failed_request_handler async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None: context.log.error( f'Failed to process {context.request.url} after all retries: {error}' ) # Save failed request information for analysis await context.push_data( { 'failed_url': context.request.url, 'label': context.request.label, 'error_type': type(error).__name__, 'error_message': 
str(error), 'retry_count': context.request.retry_count, 'status': 'failed', } ) # Start crawling with some URLs that might fail await crawler.run( [ 'https://warehouse-theme-metal.myshopify.com/products/on-running-cloudmonster-2-mens', # This will likely fail 'https://warehouse-theme-metal.myshopify.com/invalid-url', 'https://warehouse-theme-metal.myshopify.com/products/valid-product', ] ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_router/http_pre_navigation.py ================================================ import asyncio from crawlee import HttpHeaders from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext async def main() -> None: crawler = ParselCrawler( max_requests_per_crawl=10, # Limit the max requests per crawl. ) @crawler.pre_navigation_hook async def setup_request(context: BasicCrawlingContext) -> None: # Add custom headers before making the request context.request.headers |= HttpHeaders( { 'User-Agent': 'Crawlee Bot 1.0', 'Accept': 'text/html,application/xhtml+xml', }, ) @crawler.router.default_handler async def default_handler(context: ParselCrawlingContext) -> None: # Extract basic page information title = context.selector.css('title::text').get() await context.push_data( { 'url': context.request.url, 'title': title, } ) await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/request_router/playwright_pre_navigation.py ================================================ import asyncio from crawlee.crawlers import ( PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext, ) async def main() -> None: crawler = PlaywrightCrawler( max_requests_per_crawl=10, # Limit the max requests per crawl. 
)

    @crawler.pre_navigation_hook
    async def setup_page(context: PlaywrightPreNavCrawlingContext) -> None:
        """Prepare the Playwright page before navigation.

        Sets the viewport size, blocks requests matching the extra URL patterns
        listed below, and sets an extra `User-Agent` HTTP header on the page.
        """
        # Set viewport size for consistent rendering
        await context.page.set_viewport_size({'width': 1280, 'height': 720})

        # Block unnecessary resources to speed up crawling
        await context.block_requests(
            extra_url_patterns=[
                '*.png',
                '*.jpg',
                '*.jpeg',
                '*.gif',
                '*.svg',
                '*.css',
                '*.woff',
                '*.woff2',
                '*.ttf',
                '*google-analytics*',
                '*facebook*',
                '*twitter*',
            ]
        )

        # Set custom user agent
        await context.page.set_extra_http_headers(
            {
                'User-Agent': 'Mozilla/5.0 (compatible; Crawlee Bot)',
            }
        )

    @crawler.router.default_handler
    async def default_handler(context: PlaywrightCrawlingContext) -> None:
        """Push the request URL and the page title to the dataset."""
        title = await context.page.title()
        await context.push_data(
            {
                'url': context.request.url,
                'title': title,
            }
        )

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/request_router/simple_default_handler.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # Create a crawler instance
    crawler = ParselCrawler(
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
) # Use the crawler's built-in router to define a default handler @crawler.router.default_handler async def default_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # Extract page title title = context.selector.css('title::text').get() or 'No title found' # Extract and save basic page data await context.push_data( { 'url': context.request.url, 'title': title, } ) # Find and enqueue product links for further crawling await context.enqueue_links(selector='a[href*="/products/"]', label='PRODUCT') # Start crawling await crawler.run(['https://warehouse-theme-metal.myshopify.com/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/running_in_web_server/__init__.py ================================================ ================================================ FILE: docs/guides/code_examples/running_in_web_server/crawler.py ================================================ import asyncio from collections.abc import AsyncIterator from contextlib import asynccontextmanager from typing import TypedDict from fastapi import FastAPI from crawlee.crawlers import ParselCrawler, ParselCrawlingContext class State(TypedDict): """State available in the app.""" crawler: ParselCrawler requests_to_results: dict[str, asyncio.Future[dict[str, str]]] @asynccontextmanager async def lifespan(app: FastAPI) -> AsyncIterator[State]: # Start up code that runs once when the app starts # Results will be stored in this dictionary requests_to_results = dict[str, asyncio.Future[dict[str, str]]]() crawler = ParselCrawler( # Keep the crawler alive even when there are no more requests to process now. # This makes the crawler wait for more requests to be added later. keep_alive=True ) # Define the default request handler, which will be called for every request. 
@crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') title = context.selector.xpath('//title/text()').get() or '' # Extract data from the page and save it to the result dictionary. requests_to_results[context.request.unique_key].set_result( { 'title': title, } ) # Start the crawler without awaiting it to finish crawler.log.info(f'Starting crawler for the {app.title}') run_task = asyncio.create_task(crawler.run([])) # Make the crawler and the result dictionary available in the app state yield {'crawler': crawler, 'requests_to_results': requests_to_results} # Cleanup code that runs once when the app shuts down crawler.stop() # Wait for the crawler to finish await run_task ================================================ FILE: docs/guides/code_examples/running_in_web_server/server.py ================================================ from __future__ import annotations import asyncio from uuid import uuid4 from fastapi import FastAPI from starlette.requests import Request from starlette.responses import HTMLResponse import crawlee from .crawler import lifespan app = FastAPI(lifespan=lifespan, title='Crawler app') @app.get('/', response_class=HTMLResponse) def index() -> str: return """

Scraper server

To scrape some page, visit "scrape" endpoint with url parameter. For example: /scrape?url=https://www.example.com

""" @app.get('/scrape') async def scrape_url(request: Request, url: str | None = None) -> dict: if not url: return {'url': 'missing', 'scrape result': 'no results'} # Generate random unique key for the request unique_key = str(uuid4()) # Set the result future in the result dictionary so that it can be awaited request.state.requests_to_results[unique_key] = asyncio.Future[dict[str, str]]() # Add the request to the crawler queue await request.state.crawler.add_requests( [crawlee.Request.from_url(url, unique_key=unique_key)] ) # Wait for the result future to be finished result = await request.state.requests_to_results[unique_key] # Clean the result from the result dictionary to free up memory request.state.requests_to_results.pop(unique_key) # Return the result return {'url': url, 'scrape result': result} ================================================ FILE: docs/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py ================================================ import asyncio from crawlee import ConcurrencySettings from crawlee.crawlers import BeautifulSoupCrawler async def main() -> None: concurrency_settings = ConcurrencySettings( # Set the maximum number of concurrent requests the crawler can run to 100. max_concurrency=100, # Limit the total number of requests to 10 per minute to avoid overwhelming # the target website. max_tasks_per_minute=10, ) crawler = BeautifulSoupCrawler( # Apply the defined concurrency settings to the crawler. concurrency_settings=concurrency_settings, ) # ... 
if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py
================================================
import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import BeautifulSoupCrawler


async def main() -> None:
    """Create a BeautifulSoupCrawler with desired, minimum and maximum concurrency set."""
    concurrency_settings = ConcurrencySettings(
        # Start with 8 concurrent tasks, as long as resources are available.
        desired_concurrency=8,
        # Maintain a minimum of 5 concurrent tasks to ensure steady crawling.
        min_concurrency=5,
        # Limit the maximum number of concurrent tasks to 10 to prevent
        # overloading the system.
        max_concurrency=10,
    )

    crawler = BeautifulSoupCrawler(
        # Use the configured concurrency settings for the crawler.
        concurrency_settings=concurrency_settings,
    )

    # ...


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_conflicts.py
================================================
import asyncio

from crawlee import service_locator
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient


async def main() -> None:
    # Register the storage client via service locator.
    memory_storage_client = MemoryStorageClient()
    service_locator.set_storage_client(memory_storage_client)

    # Retrieve the storage client.
    current_storage_client = service_locator.get_storage_client()

    # Try to set a different storage client, which will raise ServiceConflictError
    # if storage client was already retrieved.
file_system_storage_client = FileSystemStorageClient()
    service_locator.set_storage_client(file_system_storage_client)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_crawler_configuration.py
================================================
import asyncio
from datetime import timedelta

from crawlee.configuration import Configuration
from crawlee.crawlers import ParselCrawler


async def main() -> None:
    """Build a custom `Configuration` and hand it to a ParselCrawler."""
    configuration = Configuration(
        log_level='DEBUG',
        headless=False,
        persist_state_interval=timedelta(seconds=30),
    )

    # Register configuration via crawler.
    crawler = ParselCrawler(
        configuration=configuration,
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_crawler_event_manager.py
================================================
import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler
from crawlee.events import LocalEventManager


async def main() -> None:
    """Build a `LocalEventManager` with a 5 second interval and hand it to a crawler."""
    event_manager = LocalEventManager(
        system_info_interval=timedelta(seconds=5),
    )

    # Register event manager via crawler.
    crawler = ParselCrawler(
        event_manager=event_manager,
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_crawler_storage_client.py
================================================
import asyncio

from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    storage_client = MemoryStorageClient()

    # Register storage client via crawler.
crawler = ParselCrawler(
        storage_client=storage_client,
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_locator_configuration.py
================================================
import asyncio
from datetime import timedelta

from crawlee import service_locator
from crawlee.configuration import Configuration


async def main() -> None:
    """Build a custom `Configuration` and register it with the service locator."""
    configuration = Configuration(
        log_level='DEBUG',
        headless=False,
        persist_state_interval=timedelta(seconds=30),
    )

    # Register configuration via service locator.
    service_locator.set_configuration(configuration)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_locator_event_manager.py
================================================
import asyncio
from datetime import timedelta

from crawlee import service_locator
from crawlee.events import LocalEventManager


async def main() -> None:
    """Build a `LocalEventManager` and register it with the service locator."""
    event_manager = LocalEventManager(
        system_info_interval=timedelta(seconds=5),
    )

    # Register event manager via service locator.
    service_locator.set_event_manager(event_manager)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_locator_storage_client.py
================================================
import asyncio

from crawlee import service_locator
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    storage_client = MemoryStorageClient()

    # Register storage client via service locator.
service_locator.set_storage_client(storage_client)


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_storage_configuration.py
================================================
import asyncio
from datetime import timedelta

from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    """Open one dataset with the global configuration and one with explicit arguments."""
    configuration = Configuration(
        log_level='DEBUG',
        headless=False,
        persist_state_interval=timedelta(seconds=30),
    )
    # Set the custom configuration as the global default configuration.
    service_locator.set_configuration(configuration)

    # Use the global defaults when creating the dataset (or other storage).
    dataset_1 = await Dataset.open()

    # Or set explicitly specific configuration if
    # you do not want to rely on global defaults.
    dataset_2 = await Dataset.open(
        storage_client=MemoryStorageClient(), configuration=configuration
    )


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/service_locator/service_storage_storage_client.py
================================================
import asyncio

from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    storage_client = MemoryStorageClient()

    # Pass the storage client to the dataset (or other storage) when opening it.
dataset = await Dataset.open( storage_client=storage_client, ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/session_management/multi_sessions_http.py ================================================ import asyncio from collections.abc import Callable from datetime import timedelta from itertools import count from crawlee import ConcurrencySettings, Request from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext from crawlee.errors import RequestCollisionError from crawlee.sessions import Session, SessionPool # Define a function for creating sessions with simple logic for unique `id` generation. # This is necessary if you need to specify a particular session for the first request, # for example during authentication def create_session_function() -> Callable[[], Session]: counter = count() def create_session() -> Session: return Session( id=str(next(counter)), max_usage_count=999_999, max_age=timedelta(hours=999_999), max_error_score=100, blocked_status_codes=[403], ) return create_session async def main() -> None: crawler = HttpCrawler( # Adjust request limits according to your pool size concurrency_settings=ConcurrencySettings(max_tasks_per_minute=500), # Requests are bound to specific sessions, no rotation needed max_session_rotations=0, session_pool=SessionPool( max_pool_size=10, create_session_function=create_session_function() ), ) @crawler.router.default_handler async def basic_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # Initialize the session and bind the next request to this session if needed @crawler.router.handler(label='session_init') async def session_init(context: HttpCrawlingContext) -> None: next_requests = [] if context.session: context.log.info(f'Init session {context.session.id}') next_request = Request.from_url( 'https://a.placeholder.com', session_id=context.session.id ) 
next_requests.append(next_request) await context.add_requests(next_requests) # Handle errors when a session is blocked and no longer available in the pool # when attempting to execute requests bound to it @crawler.failed_request_handler async def error_processing(context: BasicCrawlingContext, error: Exception) -> None: if isinstance(error, RequestCollisionError) and context.session: context.log.error( f'Request {context.request.url} failed, because the bound ' 'session is unavailable' ) # Create a pool of requests bound to their respective sessions # Use `always_enqueue=True` if session initialization happens on a non-unique address, # such as the site's main page init_requests = [ Request.from_url( 'https://example.org/', label='session_init', session_id=str(session_id), use_extended_unique_key=True, ) for session_id in range(1, 11) ] await crawler.run(init_requests) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/session_management/one_session_http.py ================================================ import asyncio from datetime import timedelta from crawlee import ConcurrencySettings, Request from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext from crawlee.errors import SessionError from crawlee.sessions import SessionPool async def main() -> None: crawler = HttpCrawler( # Limit requests per minute to reduce the chance of being blocked concurrency_settings=ConcurrencySettings(max_tasks_per_minute=50), # Disable session rotation max_session_rotations=0, session_pool=SessionPool( # Only one session in the pool max_pool_size=1, create_session_settings={ # High value for session usage limit 'max_usage_count': 999_999, # High value for session lifetime 'max_age': timedelta(hours=999_999), # High score allows the session to encounter more errors # before crawlee decides the session is blocked # Make sure you know how to handle these errors 'max_error_score': 
100, # 403 status usually indicates you're already blocked 'blocked_status_codes': [403], }, ), ) # Basic request handling logic @crawler.router.default_handler async def basic_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # Handler for session initialization (authentication, initial cookies, etc.) @crawler.router.handler(label='session_init') async def session_init(context: HttpCrawlingContext) -> None: if context.session: context.log.info(f'Init session {context.session.id}') # Monitor if our session gets blocked and explicitly stop the crawler @crawler.error_handler async def error_processing(context: BasicCrawlingContext, error: Exception) -> None: if isinstance(error, SessionError) and context.session: context.log.info(f'Session {context.session.id} blocked') crawler.stop() await crawler.run([Request.from_url('https://example.org/', label='session_init')]) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/session_management/sm_basic.py ================================================ import asyncio import re from crawlee.crawlers import BasicCrawler, BasicCrawlingContext from crawlee.proxy_configuration import ProxyConfiguration from crawlee.sessions import SessionPool async def main() -> None: # To use the proxy IP session rotation logic, you must turn the proxy usage on. proxy_configuration = ProxyConfiguration( # options ) # Initialize crawler with a custom SessionPool configuration # to manage concurrent sessions and proxy rotation crawler = BasicCrawler( proxy_configuration=proxy_configuration, # Activates the Session pool (default is true). use_session_pool=True, # Overrides default Session pool configuration. 
session_pool=SessionPool(max_pool_size=100), ) # Define the default request handler that manages session states @crawler.router.default_handler async def default_handler(context: BasicCrawlingContext) -> None: # Send request, BasicCrawler automatically selects a session from the pool # and sets a proxy for it. You can check it with `context.session` # and `context.proxy_info`. response = await context.send_request(context.request.url) page_content = (await response.read()).decode() title_match = re.search(r'(.*?)', page_content) if context.session and (title := title_match.group(1) if title_match else None): if title == 'Blocked': context.session.retire() elif title == 'Not sure if blocked, might also be a connection error': context.session.mark_bad() else: context.session.mark_good() # BasicCrawler handles this automatically. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/session_management/sm_beautifulsoup.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from crawlee.proxy_configuration import ProxyConfiguration from crawlee.sessions import SessionPool async def main() -> None: # To use the proxy IP session rotation logic, you must turn the proxy usage on. proxy_configuration = ProxyConfiguration( # options ) # Initialize crawler with a custom SessionPool configuration # to manage concurrent sessions and proxy rotation crawler = BeautifulSoupCrawler( proxy_configuration=proxy_configuration, # Activates the Session pool (default is true). use_session_pool=True, # Overrides default Session pool configuration. 
session_pool=SessionPool(max_pool_size=100),
    )

    # Define the default request handler that manages session states
    # based on the response content and potential blocking
    @crawler.router.default_handler
    async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Update the session state based on the page title.

        The title is `None` when the parsed page has no `<title>`; in that case the
        final `else` branch marks the session good.
        """
        title = context.soup.title.get_text() if context.soup.title else None

        if context.session:
            if title == 'Blocked':
                context.session.retire()
            elif title == 'Not sure if blocked, might also be a connection error':
                context.session.mark_bad()
            else:
                context.session.mark_good()  # BasicCrawler handles this automatically.

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/sm_http.py
================================================
import asyncio
import re

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    # To use the proxy IP session rotation logic, you must turn the proxy usage on.
    proxy_configuration = ProxyConfiguration(
        # options
    )

    # Initialize crawler with a custom SessionPool configuration
    # to manage concurrent sessions and proxy rotation
    crawler = HttpCrawler(
        proxy_configuration=proxy_configuration,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        # Overrides default Session pool configuration.
session_pool=SessionPool(max_pool_size=100), ) # Define the default request handler that manages session states # based on the response content and potential blocking @crawler.router.default_handler async def default_handler(context: HttpCrawlingContext) -> None: page_content = (await context.http_response.read()).decode() title_match = re.search(r'(.*?)', page_content) if context.session and (title := title_match.group(1) if title_match else None): if title == 'Blocked': context.session.retire() elif title == 'Not sure if blocked, might also be a connection error': context.session.mark_bad() else: context.session.mark_good() # BasicCrawler handles this automatically. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/session_management/sm_parsel.py ================================================ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from crawlee.proxy_configuration import ProxyConfiguration from crawlee.sessions import SessionPool async def main() -> None: # To use the proxy IP session rotation logic, you must turn the proxy usage on. proxy_configuration = ProxyConfiguration( # options ) # Initialize crawler with a custom SessionPool configuration # to manage concurrent sessions and proxy rotation crawler = ParselCrawler( proxy_configuration=proxy_configuration, # Activates the Session pool (default is true). use_session_pool=True, # Overrides default Session pool configuration. 
session_pool=SessionPool(max_pool_size=100),
    )

    # Define the default request handler that manages session states
    # based on the response content and potential blocking
    @crawler.router.default_handler
    async def default_handler(context: ParselCrawlingContext) -> None:
        """Update the session state based on the text selected by `title::text`."""
        title = context.selector.css('title::text').get()

        if context.session:
            if title == 'Blocked':
                context.session.retire()
            elif title == 'Not sure if blocked, might also be a connection error':
                context.session.mark_bad()
            else:
                context.session.mark_good()  # BasicCrawler handles this automatically.

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/sm_playwright.py
================================================
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    # To use the proxy IP session rotation logic, you must turn the proxy usage on.
    proxy_configuration = ProxyConfiguration(
        # options
    )

    # Initialize crawler with a custom SessionPool configuration
    # to manage concurrent sessions and proxy rotation
    crawler = PlaywrightCrawler(
        proxy_configuration=proxy_configuration,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        # Overrides default Session pool configuration.
session_pool=SessionPool(max_pool_size=100),
    )

    # Define the default request handler that manages session states
    # based on the response content and potential blocking
    @crawler.router.default_handler
    async def default_handler(context: PlaywrightCrawlingContext) -> None:
        """Update the session state based on the title of the Playwright page."""
        title = await context.page.title()

        if context.session:
            if title == 'Blocked':
                context.session.retire()
            elif title == 'Not sure if blocked, might also be a connection error':
                context.session.mark_bad()
            else:
                context.session.mark_good()  # BasicCrawler handles this automatically.

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/session_management/sm_standalone.py
================================================
import asyncio

from crawlee.sessions import SessionPool


async def main() -> None:
    """Use a `SessionPool` on its own and call the session state methods in turn."""
    # Override the default Session pool configuration.
    async with SessionPool(
        max_pool_size=100,
        create_session_settings={'max_usage_count': 10, 'blocked_status_codes': [403]},
    ) as session_pool:
        session = await session_pool.get_session()

        # Increase the error_score.
        session.mark_bad()

        # Throw away the session.
        session.retire()

        # Lower the error_score and mark the session good.
        session.mark_good()


if __name__ == '__main__':
    asyncio.run(main())


================================================
FILE: docs/guides/code_examples/storage_clients/custom_storage_client_example.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee.storage_clients import StorageClient
from crawlee.storage_clients._base import (
    DatasetClient,
    KeyValueStoreClient,
    RequestQueueClient,
)

if TYPE_CHECKING:
    from crawlee.configuration import Configuration

# Implement the storage type clients with your backend logic.


class CustomDatasetClient(DatasetClient):
    # Implement methods like push_data, get_data, iterate_items, etc.
pass


class CustomKeyValueStoreClient(KeyValueStoreClient):
    # Implement methods like get_value, set_value, delete, etc.
    pass


class CustomRequestQueueClient(RequestQueueClient):
    # Implement methods like add_request, fetch_next_request, etc.
    pass


# Implement the storage client factory.


class CustomStorageClient(StorageClient):
    # Skeleton of a storage client factory: each method below is a placeholder whose
    # body is only `pass`, to be replaced with code that returns the custom client.

    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        configuration: Configuration | None = None,
    ) -> CustomDatasetClient:
        # Create and return your custom dataset client.
        pass

    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        configuration: Configuration | None = None,
    ) -> CustomKeyValueStoreClient:
        # Create and return your custom key-value store client.
        pass

    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        configuration: Configuration | None = None,
    ) -> CustomRequestQueueClient:
        # Create and return your custom request queue client.
        pass


================================================
FILE: docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py
================================================
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import FileSystemStorageClient

# Create a new instance of storage client.
storage_client = FileSystemStorageClient()

# And pass it to the crawler.
crawler = ParselCrawler(storage_client=storage_client)


================================================
FILE: docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py
================================================
from crawlee.configuration import Configuration
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import FileSystemStorageClient

# Create a new instance of storage client.
storage_client = FileSystemStorageClient()

# Create a configuration with custom settings.
configuration = Configuration( storage_dir='./my_storage', purge_on_start=False, ) # And pass them to the crawler. crawler = ParselCrawler( storage_client=storage_client, configuration=configuration, ) ================================================ FILE: docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py ================================================ from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import MemoryStorageClient # Create a new instance of storage client. storage_client = MemoryStorageClient() # And pass it to the crawler. crawler = ParselCrawler(storage_client=storage_client) ================================================ FILE: docs/guides/code_examples/storage_clients/redis_storage_client_basic_example.py ================================================ from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import RedisStorageClient # Create a new instance of storage client using connection string. # 'redis://localhost:6379' is the just placeholder, replace it with your actual # connection string. storage_client = RedisStorageClient(connection_string='redis://localhost:6379') # And pass it to the crawler. crawler = ParselCrawler(storage_client=storage_client) ================================================ FILE: docs/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py ================================================ from redis.asyncio import Redis from crawlee.configuration import Configuration from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import RedisStorageClient # Create a new instance of storage client using a Redis client with custom settings. # Replace host and port with your actual Redis server configuration. # Other Redis client settings can be adjusted as needed. 
storage_client = RedisStorageClient( redis=Redis( host='localhost', port=6379, retry_on_timeout=True, socket_keepalive=True, socket_connect_timeout=10, ) ) # Create a configuration with custom settings. configuration = Configuration(purge_on_start=False) # And pass them to the crawler. crawler = ParselCrawler( storage_client=storage_client, configuration=configuration, ) ================================================ FILE: docs/guides/code_examples/storage_clients/registering_storage_clients_example.py ================================================ import asyncio from crawlee import service_locator from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import MemoryStorageClient from crawlee.storages import Dataset async def main() -> None: # Create custom storage client, MemoryStorageClient for example. storage_client = MemoryStorageClient() # Register it globally via the service locator. service_locator.set_storage_client(storage_client) # Or pass it directly to the crawler, it will be registered globally # to the service locator under the hood. crawler = ParselCrawler(storage_client=storage_client) # Or just provide it when opening a storage (e.g. dataset), it will be used # for this storage only, not globally. dataset = await Dataset.open( name='my-dataset', storage_client=storage_client, ) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storage_clients/sql_storage_client_basic_example.py ================================================ from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import SqlStorageClient async def main() -> None: # Create a new instance of storage client. # This will create an SQLite database file crawlee.db or created tables in your # database if you pass `connection_string` or `engine` # Use the context manager to ensure that connections are properly cleaned up. 
async with SqlStorageClient() as storage_client: # And pass it to the crawler. crawler = ParselCrawler(storage_client=storage_client) ================================================ FILE: docs/guides/code_examples/storage_clients/sql_storage_client_configuration_example.py ================================================ from sqlalchemy.ext.asyncio import create_async_engine from crawlee.configuration import Configuration from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import SqlStorageClient async def main() -> None: # Create a new instance of storage client. # On first run, also creates tables in your PostgreSQL database. # Use the context manager to ensure that connections are properly cleaned up. async with SqlStorageClient( # Create an `engine` with the desired configuration engine=create_async_engine( 'postgresql+asyncpg://myuser:mypassword@localhost:5432/postgres', future=True, pool_size=5, max_overflow=10, pool_recycle=3600, pool_pre_ping=True, echo=False, ) ) as storage_client: # Create a configuration with custom settings. configuration = Configuration( purge_on_start=False, ) # And pass them to the crawler. crawler = ParselCrawler( storage_client=storage_client, configuration=configuration, ) ================================================ FILE: docs/guides/code_examples/storages/cleaning_do_not_purge_example.py ================================================ import asyncio from crawlee.configuration import Configuration from crawlee.crawlers import HttpCrawler, HttpCrawlingContext async def main() -> None: # Set the purge_on_start field to False to avoid purging the storage on start. # highlight-next-line configuration = Configuration(purge_on_start=False) # Pass the configuration to the crawler. 
crawler = HttpCrawler(configuration=configuration) @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py ================================================ import asyncio from crawlee.storages import Dataset async def main() -> None: # Open the named dataset. dataset = await Dataset.open(name='my-dataset') # Purge the dataset explicitly - purging removes all items from the dataset # but keeps the dataset itself and its metadata. await dataset.purge() # Or you can drop the dataset completely, which will remove the dataset # and all its items. await dataset.drop() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/dataset_basic_example.py ================================================ import asyncio from crawlee.storages import Dataset async def main() -> None: # Open the dataset, if it does not exist, it will be created. # Leave name empty to use the default dataset. dataset = await Dataset.open(name='my-dataset') # Push a single row of data. await dataset.push_data({'foo': 'bar'}) # Push multiple rows of data (anything JSON-serializable can be pushed). await dataset.push_data([{'foo': 'bar2', 'col2': 'val2'}, {'col3': 123}]) # Fetch all data from the dataset. data = await dataset.get_data() # Do something with it... # Remove the dataset.
await dataset.drop() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/dataset_with_crawler_example.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: # Create a new crawler (it can be any subclass of BasicCrawler). crawler = BeautifulSoupCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, } # Push the extracted data to the (default) dataset. await context.push_data(data) # Run the crawler with the initial URLs. await crawler.run(['https://crawlee.dev']) # Export the dataset to a file. await crawler.export_data(path='dataset.csv') if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from crawlee.storages import Dataset async def main() -> None: # Open the dataset, if it does not exist, it will be created. # Leave name empty to use the default dataset. dataset = await Dataset.open(name='my-dataset') # Create a new crawler (it can be any subclass of BasicCrawler). crawler = BeautifulSoupCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. 
data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, } # Push the extracted data to the dataset. await dataset.push_data(data) # Run the crawler with the initial URLs. await crawler.run(['https://crawlee.dev']) # Export the dataset to the key-value store. await dataset.export_to(key='dataset', content_type='csv') if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/helper_add_requests_example.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # highlight-next-line await context.add_requests(['https://apify.com/']) await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/helper_enqueue_links_example.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # highlight-next-line await context.enqueue_links() await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/kvs_basic_example.py ================================================ import asyncio from crawlee.storages import KeyValueStore async def main() -> None: # Open the key-value store, if it does not exist, it will 
be created. # Leave name empty to use the default KVS. kvs = await KeyValueStore.open(name='my-key-value-store') # Set a value associated with 'some-key'. await kvs.set_value(key='some-key', value={'foo': 'bar'}) # Get the value associated with 'some-key'. value = kvs.get_value('some-key') # Do something with it... # Delete the value associated with 'some-key' by setting it to None. await kvs.set_value(key='some-key', value=None) # Remove the key-value store. await kvs.drop() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/kvs_with_crawler_example.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: # Create a new Playwright crawler. crawler = PlaywrightCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Capture the screenshot of the page using Playwright's API. screenshot = await context.page.screenshot() name = context.request.url.split('/')[-1] # Get the key-value store from the context. # If it does not exist, # it will be created. Leave name empty to use the default KVS. kvs = await context.get_key_value_store() # Store the screenshot in the key-value store. await kvs.set_value( key=f'screenshot-{name}', value=screenshot, content_type='image/png', ) # Run the crawler with the initial URLs. 
await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.storages import KeyValueStore async def main() -> None: # Open the key-value store, if it does not exist, it will be created. # Leave name empty to use the default KVS. kvs = await KeyValueStore.open(name='my-key-value-store') # Create a new Playwright crawler. crawler = PlaywrightCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Capture the screenshot of the page using Playwright's API. screenshot = await context.page.screenshot() name = context.request.url.split('/')[-1] # Store the screenshot in the key-value store. await kvs.set_value( key=f'screenshot-{name}', value=screenshot, content_type='image/png', ) # Run the crawler with the initial URLs. 
await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/opening.py ================================================ import asyncio from crawlee.storages import Dataset async def main() -> None: # Named storage (persists across runs) dataset_named = await Dataset.open(name='my-persistent-dataset') # Unnamed storage with alias (purged on start) dataset_unnamed = await Dataset.open(alias='temporary-results') # Default unnamed storage (purged on start) dataset_default = await Dataset.open() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/rq_basic_example.py ================================================ import asyncio from crawlee.storages import RequestQueue async def main() -> None: # Open the request queue, if it does not exist, it will be created. # Leave name empty to use the default request queue. request_queue = await RequestQueue.open(name='my-request-queue') # Add a single request. await request_queue.add_request('https://apify.com/') # Add multiple requests as a batch. await request_queue.add_requests( ['https://crawlee.dev/', 'https://crawlee.dev/python/'] ) # Fetch and process requests from the queue. while request := await request_queue.fetch_next_request(): # Do something with it... # And mark it as handled. await request_queue.mark_request_as_handled(request) # Remove the request queue. await request_queue.drop() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/rq_with_crawler_example.py ================================================ import asyncio from crawlee.crawlers import HttpCrawler, HttpCrawlingContext async def main() -> None: # Create a new crawler (it can be any subclass of BasicCrawler). 
Request queue is # a default request manager, it will be opened, and fully managed if not specified. crawler = HttpCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Use context's add_requests method helper to add new requests from the handler. await context.add_requests(['https://crawlee.dev/python/']) # Use crawler's add_requests method helper to add new requests. await crawler.add_requests(['https://apify.com/']) # Run the crawler. You can optionally pass the list of initial requests. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py ================================================ import asyncio from crawlee.crawlers import HttpCrawler, HttpCrawlingContext from crawlee.storages import RequestQueue async def main() -> None: # Open the request queue, if it does not exist, it will be created. # Leave name empty to use the default request queue. request_queue = await RequestQueue.open(name='my-request-queue') # Interact with the request queue directly, e.g. add a batch of requests. await request_queue.add_requests(['https://apify.com/', 'https://crawlee.dev/']) # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request # queue as request manager to it. It will be managed by the crawler. crawler = HttpCrawler(request_manager=request_queue) # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # And execute the crawler. 
await crawler.run() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/guides/code_examples/trace_and_monitor_crawlers/instrument_crawler.py ================================================ import asyncio from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import SimpleSpanProcessor from opentelemetry.trace import set_tracer_provider from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext from crawlee.otel import CrawlerInstrumentor from crawlee.storages import Dataset, KeyValueStore, RequestQueue def instrument_crawler() -> None: """Add instrumentation to the crawler.""" resource = Resource.create( { 'service.name': 'ExampleCrawler', 'service.version': '1.0.0', 'environment': 'development', } ) # Set up the OpenTelemetry tracer provider and exporter provider = TracerProvider(resource=resource) otlp_exporter = OTLPSpanExporter(endpoint='localhost:4317', insecure=True) provider.add_span_processor(SimpleSpanProcessor(otlp_exporter)) set_tracer_provider(provider) # Instrument the crawler with OpenTelemetry CrawlerInstrumentor( instrument_classes=[RequestQueue, KeyValueStore, Dataset] ).instrument() async def main() -> None: """Run the crawler.""" instrument_crawler() crawler = ParselCrawler(max_requests_per_crawl=100) kvs = await KeyValueStore.open() @crawler.pre_navigation_hook async def pre_nav_hook(_: BasicCrawlingContext) -> None: # Simulate some pre-navigation processing await asyncio.sleep(0.01) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: await context.push_data({'url': context.request.url}) await kvs.set_value(key='url', value=context.request.url) await context.enqueue_links() await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': 
asyncio.run(main()) ================================================ FILE: docs/guides/crawler_login.mdx ================================================ --- id: logging-in-with-a-crawler title: Logging in with a crawler description: How to log in to websites with Crawlee. --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import PlaywrightLogin from '!!raw-loader!roa-loader!./code_examples/login_crawler/playwright_login.py'; import HttpLogin from '!!raw-loader!roa-loader!./code_examples/login_crawler/http_login.py'; Many websites require authentication to access their content. This guide demonstrates how to implement login functionality using both `PlaywrightCrawler` and `HttpCrawler`. ## Session management for authentication When implementing authentication, you'll typically want to maintain the same `Session` throughout your crawl to preserve login state. This requires proper configuration of the `SessionPool`. For more details, see our [session management guide](./session-management). If your use case requires multiple authenticated sessions with different credentials, you can: - Use the `new_session_function` parameter in `SessionPool` to customize session creation. - Specify the `session_id` parameter in `Request` to bind specific requests to particular sessions. For this guide, we'll use [demoqa.com](https://demoqa.com/login), a testing site designed for automation practice that provides a login form and protected content. ## Login with Playwright crawler The following example demonstrates how to authenticate on a website using `PlaywrightCrawler`, which provides browser automation capabilities for filling out login forms. {PlaywrightLogin} ## Login with HTTP crawler You can also use `HttpCrawler` (or its more specific variants like `ParselCrawler` or `BeautifulSoupCrawler`) to authenticate by sending a POST `Request` with your credentials directly to the authentication endpoint.
HTTP-based authentication often varies significantly between websites. Using browser [DevTools](https://developer.chrome.com/docs/devtools/overview) to analyze the `Network` tab during manual login can help you understand the specific authentication flow, required headers, and body parameters for your target website. {HttpLogin} ================================================ FILE: docs/guides/creating_web_archive.mdx ================================================ --- id: creating-web-archive title: Creating web archive description: How to create a Web ARChive (WARC) with Crawlee --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import PlaywrightCrawlerRecordThroughProxy from '!!raw-loader!./code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py'; import ParselCrawlerRecordManual from '!!raw-loader!./code_examples/creating_web_archive/manual_archiving_parsel_crawler.py'; import PlaywrightCrawlerRecordManual from '!!raw-loader!./code_examples/creating_web_archive/manual_archiving_playwright_crawler.py'; Archiving webpages is one of the tasks that a web crawler can be used for. There are various use cases, such as archiving for future reference, speeding up web crawler development, creating top-level regression tests for web crawlers and so on. There are various existing libraries of web archives with massive amount of data stored during their years of existence, for example [Wayback Machine](https://web.archive.org/) or [Common Crawl](https://commoncrawl.org/). There are also dedicated tools for archiving web pages, to name some: simple browser extensions such as [Archive Webpage](https://archiveweb.page/), open source tools such as [pywb](https://pypi.org/project/pywb/) or [warcio](https://pypi.org/project/warcio/), or even web crawlers specialized in archiving such as [Browsertrix](https://webrecorder.net/browsertrix/). 
The common file format used for archiving is [WARC](https://www.iso.org/standard/68004.html). Crawlee does not offer any out-of-the-box functionality to create WARC files, but in this guide, we will show examples of approaches that can be easily used in your use case to create WARC files with Crawlee. ## Crawling through proxy recording server This approach can be especially attractive as it requires almost no code changes to the crawler itself, and the correct WARC creation is handled by code from the well-maintained [pywb](https://pypi.org/project/pywb/) package. The trick is to run a properly configured [wayback proxy server](https://pywb.readthedocs.io/en/latest/manual/usage.html#using-pywb-recorder), use it as a proxy for the crawler and record any traffic. Another advantage of this approach is that it is language agnostic. This way, you can record both your Python-based crawler and your JavaScript-based crawler. This is very straightforward and a good place to start. This approach expects that you have already created your crawler, and that you just want to archive all the pages it is visiting during its crawl. Install [pywb](https://pypi.org/project/pywb/), which will allow you to use the `wb-manager` and `wayback` commands. Create a new collection that will be used for this archiving session and start the wayback server: ```bash wb-manager init example-collection wayback --record --live -a --auto-interval 10 --proxy example-collection --proxy-record ``` Instead of passing many configuration arguments to the `wayback` command, you can configure the server by adding configuration options to `config.yaml`. See the details in the [documentation](https://pywb.readthedocs.io/en/latest/manual/configuring.html#configuring-the-web-archive). ### Configure the crawler Now you should use this locally hosted server as a proxy in your crawler. There are two more steps before starting the crawler: - Make the crawler use the proxy server.
- Deal with the [pywb Certificate Authority](https://pywb.readthedocs.io/en/latest/manual/configuring.html#https-proxy-and-pywb-certificate-authority). For example, in `PlaywrightCrawler`, this is the simplest setup, which takes the shortcut and ignores the CA-related errors: {PlaywrightCrawlerRecordThroughProxy} After you run the crawler, you will be able to see the archived data in the wayback collection directory, for example `.../collections/example-collection/archive`. You can then access the recorded pages directly in the proxy recording server or use them with any other WARC-compatible tool. ## Manual WARC creation A different approach is to create WARC files manually in the crawler, which gives you full control over the WARC files. This is a far more complex and low-level approach, as you have to ensure that all the relevant data is collected and correctly stored, and that the archiving functions are called at the right time. This is by no means a trivial task, and the example archiving functions below are just the simplest examples that will be insufficient for many real-world use cases. You will need to extend and improve them to properly fit your specific needs. ### Simple crawlers With non-browser crawlers such as `ParselCrawler` you will not be able to create a high-fidelity archive of the page, as you will be missing all the JavaScript dynamic content. However, you can still create a WARC file with the HTML content of the page, which can be sufficient for some use cases. Let's take a look at the example below: {ParselCrawlerRecordManual} The example above is calling an archiving function on each request using the `request_handler`. ### Browser-based crawlers With browser crawlers such as `PlaywrightCrawler` you should be able to create a high-fidelity archive of a web page. Let's take a look at the example below: {PlaywrightCrawlerRecordManual} The example above is adding an archiving callback on each response in the pre_navigation `archiving_hook`.
This ensures that additional resources requested by the browser are also archived. ## Using the archived data In the following section, we will describe an example use case how you can use the recorded WARC files to speed up the development of your web crawler. The idea is to use the archived data as a source of responses for your crawler so that you can test it against the real data without having to crawl the web again. It is assumed that you already have the WARC files. If not, please read the previous sections on how to create them first. Let's use pywb again. This time we will not use it as a recording server, but as a proxy server that will serve the previously archived pages to your crawler in development. ```bash wb-manager init example-collection wb-manager add example-collection /your_path_to_warc_file/example.warc.gz wayback --proxy example-collection ``` Previous commands start the wayback server that allows crawler requests to be served from the archived pages in the `example-collection` instead of sending requests to the real website. This is again [proxy mode of the wayback server](https://pywb.readthedocs.io/en/latest/manual/usage.html#http-s-proxy-mode-access), but without recording capability. Now you need to [configure your crawler](#configure-the-crawler) to use this proxy server, which was already described above. Once everything is finished, you can just run your crawler, and it will crawl the offline archived version of the website from your WARC file. You can also manually browse the archived pages in the wayback server by going to the locally hosted server and entering the collection and URL of the archived page, for example: `http://localhost:8080/example-collection/https:/crawlee.dev/`. The wayback server will serve the page from the WARC file if it exists, or it will return a 404 error if it does not. 
For more detail about the server please refer to the [pywb documentation](https://pywb.readthedocs.io/en/latest/manual/usage.html#getting-started). If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord](https://discord.com/invite/jyEM2PRvMU) community. ================================================ FILE: docs/guides/error_handling.mdx ================================================ --- id: error-handling title: Error handling description: How to handle errors that occur during web crawling. --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_handling/handle_proxy_error.py'; import ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py'; import DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py'; This guide demonstrates techniques for handling common errors encountered during web crawling operations. ## Handling proxy errors Low-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in `BasicCrawlerOptions`. If you can't get data because of proxy errors, you might want to try again. You can do this using `failed_request_handler`: {HandleProxyError} You can use this same approach when testing different proxy providers. To better manage this process, you can count proxy errors and [stop the crawler](../examples/crawler-stop) if you get too many. 
## Changing how error status codes are handled By default, when `Sessions` get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee retires the `Session` and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [Session management guide](./session-management). Here's an example of how to change this behavior: {ChangeHandleErrorStatus} ## Turning off retries for non-network errors Sometimes you might get unexpected errors when parsing data, like when a website has an unusual structure. Crawlee normally tries again based on your `max_request_retries` setting, but sometimes you don't want that. Here's how to turn off retries for non-network errors using `error_handler`, which runs before Crawlee tries again: {DisableRetry} ================================================ FILE: docs/guides/http_clients.mdx ================================================ --- id: http-clients title: HTTP clients description: Learn about Crawlee's HTTP client architecture, how to switch between different implementations, and create custom HTTP clients for specialized web scraping needs.
--- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ParselHttpxExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_httpx_example.py'; import ParselCurlImpersonateExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_curl_impersonate_example.py'; import ParselImpitExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_impit_example.py'; HTTP clients are utilized by HTTP-based crawlers (e.g., `ParselCrawler` and `BeautifulSoupCrawler`) to communicate with web servers. They use external HTTP libraries for communication rather than a browser. Examples of such libraries include [httpx](https://pypi.org/project/httpx/), [aiohttp](https://pypi.org/project/aiohttp/), [curl-cffi](https://pypi.org/project/curl-cffi/), and [impit](https://apify.github.io/impit/). After retrieving page content, an HTML parsing library is typically used to facilitate data extraction. Examples of such libraries include [beautifulsoup](https://pypi.org/project/beautifulsoup4/), [parsel](https://pypi.org/project/parsel/), [selectolax](https://pypi.org/project/selectolax/), and [pyquery](https://pypi.org/project/pyquery/). These crawlers are faster than browser-based crawlers but cannot execute client-side JavaScript. 
```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Abstract classes %% ======================== class HttpClient { <<abstract>> } %% ======================== %% Specific classes %% ======================== class ImpitHttpClient class HttpxHttpClient class CurlImpersonateHttpClient %% ======================== %% Inheritance arrows %% ======================== HttpClient <|-- ImpitHttpClient HttpClient <|-- HttpxHttpClient HttpClient <|-- CurlImpersonateHttpClient ``` ## Switching between HTTP clients Crawlee currently provides three main HTTP clients: `ImpitHttpClient`, which uses the `impit` library, `HttpxHttpClient`, which uses the `httpx` library with `browserforge` for custom HTTP headers and fingerprints, and `CurlImpersonateHttpClient`, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is `ImpitHttpClient`. For more details on anti-blocking features, see our [avoid getting blocked guide](./avoid-blocking). Below are examples of how to configure the HTTP client for the `ParselCrawler`: {ParselHttpxExample} {ParselCurlImpersonateExample} {ParselImpitExample} ## Installation requirements Since `ImpitHttpClient` is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages. For `CurlImpersonateHttpClient`, you need to install Crawlee with the `curl-impersonate` extra: ```sh python -m pip install 'crawlee[curl-impersonate]' ``` For `HttpxHttpClient`, you need to install Crawlee with the `httpx` extra: ```sh python -m pip install 'crawlee[httpx]' ``` Alternatively, you can install all available extras to get access to all HTTP clients and features: ```sh python -m pip install 'crawlee[all]' ``` ## Creating custom HTTP clients Crawlee provides an abstract base class, `HttpClient`, which defines the interface that all HTTP clients must implement.
This allows you to create custom HTTP clients tailored to your specific requirements. HTTP clients are responsible for several key operations: - sending HTTP requests and receiving responses, - managing cookies and sessions, - handling headers and authentication, - managing proxy configurations, - connection pooling with timeout management. To create a custom HTTP client, you need to inherit from the `HttpClient` base class and implement all required abstract methods. Your implementation must be async-compatible and include proper cleanup and resource management to work seamlessly with Crawlee's concurrent processing model. ## Conclusion This guide introduced you to the HTTP clients available in Crawlee and demonstrated how to switch between them, including their installation requirements and usage examples. You also learned about the responsibilities of HTTP clients and how to implement your own custom HTTP client by inheriting from the `HttpClient` base class. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! ================================================ FILE: docs/guides/http_crawlers.mdx ================================================ --- id: http-crawlers title: HTTP crawlers description: Learn about Crawlee's HTTP crawlers including BeautifulSoup, Parsel, and raw HTTP crawlers for efficient server-rendered content extraction without JavaScript execution. 
--- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import CodeBlock from '@theme/CodeBlock'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/beautifulsoup_example.py'; import ParselExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/parsel_example.py'; import HttpExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/http_example.py'; import LxmlParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_parser.py'; import LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_saxonche_parser.py'; import LexborParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lexbor_parser.py'; import PyqueryParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/pyquery_parser.py'; import ScraplingParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/scrapling_parser.py'; import SelectolaxParserSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_parser.py'; import SelectolaxContextSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_context.py'; import SelectolaxCrawlerSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler.py'; import SelectolaxCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler_run.py'; import AdaptiveCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_adaptive_run.py'; HTTP crawlers are ideal for extracting data from server-rendered websites that don't require JavaScript execution. These crawlers make requests via HTTP clients to fetch HTML content and then parse it using various parsing libraries. 
For client-side rendered content, where you need to execute JavaScript, consider using the [Playwright crawler](https://crawlee.dev/python/docs/guides/playwright-crawler) instead. ## Overview All HTTP crawlers share a common architecture built around the `AbstractHttpCrawler` base class. The main differences lie in the parsing strategy and the context provided to request handlers. There are `BeautifulSoupCrawler`, `ParselCrawler`, and `HttpCrawler`. `AbstractHttpCrawler` can also be extended to create custom crawlers with specialized parsing requirements. These crawlers use HTTP clients to fetch page content and parsing libraries to extract data from the HTML. Check out the [HTTP clients guide](./http-clients) to learn about the HTTP clients used by these crawlers, how to switch between them, and how to create custom HTTP clients tailored to your specific requirements. ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Abstract classes %% ======================== class BasicCrawler { <<abstract>> } class AbstractHttpCrawler { <<abstract>> } %% ======================== %% Specific classes %% ======================== class HttpCrawler class ParselCrawler class BeautifulSoupCrawler %% ======================== %% Inheritance arrows %% ======================== BasicCrawler --|> AbstractHttpCrawler AbstractHttpCrawler --|> HttpCrawler AbstractHttpCrawler --|> ParselCrawler AbstractHttpCrawler --|> BeautifulSoupCrawler ``` ## BeautifulSoupCrawler The `BeautifulSoupCrawler` uses the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) library for HTML parsing. It provides fault-tolerant parsing that handles malformed HTML, automatic character encoding detection, and supports CSS selectors, tag navigation, and custom search functions. Use this crawler when working with imperfect HTML structures, when you prefer BeautifulSoup's intuitive API, or when prototyping web scraping solutions. 
{BeautifulSoupExample} ## ParselCrawler The `ParselCrawler` uses the [Parsel](https://parsel.readthedocs.io/) library, which provides XPath 1.0 and CSS selector support built on `lxml` for high performance. It includes built-in regex support for pattern matching, proper XML namespace handling, and offers better performance than BeautifulSoup while maintaining a clean API. Use this crawler when you need XPath functionality, require high-performance parsing, or need to extract data using regular expressions. {ParselExample} ## HttpCrawler The `HttpCrawler` provides direct access to HTTP response body and headers without automatic parsing, offering maximum performance with no parsing overhead. It supports any content type (JSON, XML, binary) and allows complete control over response processing, including memory-efficient handling of large responses. Use this crawler when working with non-HTML content, requiring maximum performance, implementing custom parsing logic, or needing access to raw response data. {HttpExample} ### Using custom parsers Since `HttpCrawler` provides raw HTTP responses, you can integrate any parsing library. Note that helpers like `enqueue_links` and `extract_links` are not available with this approach. The following examples demonstrate how to integrate with several popular parsing libraries, including [lxml](https://lxml.de/) (high-performance parsing with XPath 1.0), [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) (XPath 3.1 support), [selectolax](https://github.com/rushter/selectolax) (high-speed CSS selectors), [PyQuery](https://pyquery.readthedocs.io/) (jQuery-like syntax), and [scrapling](https://github.com/D4Vinci/Scrapling) (a Scrapy/Parsel-style API offering BeautifulSoup-like methods). {LxmlParser} {LxmlSaxoncheParser} {LexborParser} {PyqueryParser} {ScraplingParser} ## Custom HTTP crawler While the built-in crawlers cover most use cases, you might need a custom HTTP crawler for specialized parsing requirements. 
To create a custom HTTP crawler, inherit directly from `AbstractHttpCrawler`. This approach requires implementing: 1. **Custom parser class**: Inherit from `AbstractHttpParser`. 2. **Custom context class**: Define what data and helpers are available to handlers. 3. **Custom crawler class**: Tie everything together. This approach is recommended when you need tight integration between parsing and the crawling context, or when you're building a reusable crawler for a specific technology or format. The following example demonstrates how to create a custom crawler using `selectolax` with the `Lexbor` engine. ### Parser implementation The parser converts HTTP responses into a parsed document and provides methods for element selection. Implement `AbstractHttpParser` using `selectolax` with required methods for parsing and querying: {SelectolaxParserSource} This is enough to use your parser with `AbstractHttpCrawler.create_parsed_http_crawler_class` factory method. For more control, continue with custom context and crawler classes below. ### Crawling context definition (optional) The crawling context is passed to request handlers and provides access to the parsed content. Extend `ParsedHttpCrawlingContext` to define the interface your handlers will work with. Here you can implement additional helpers for the crawler context. {SelectolaxContextSource} ### Crawler composition The crawler class connects the parser and context. Extend `AbstractHttpCrawler` and configure the context pipeline to use your custom components: {SelectolaxCrawlerSource} ### Crawler usage The custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like `enqueue_links`. Additionally, the custom parser can be used with `AdaptivePlaywrightCrawler` for adaptive crawling: {SelectolaxCrawlerRunSource} {AdaptiveCrawlerRunSource} ## Conclusion This guide provided a comprehensive overview of HTTP crawlers in Crawlee. 
You learned about the three main crawler types - `BeautifulSoupCrawler` for fault-tolerant HTML parsing, `ParselCrawler` for high-performance extraction with XPath and CSS selectors, and `HttpCrawler` for raw response processing. You also discovered how to integrate third-party parsing libraries with `HttpCrawler` and how to create fully custom crawlers using `AbstractHttpCrawler` for specialized parsing requirements. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! ================================================ FILE: docs/guides/playwright_crawler.mdx ================================================ --- id: playwright-crawler title: Playwright crawler description: Learn how to use PlaywrightCrawler for browser-based web scraping. --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py'; import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py'; import NavigationHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/navigation_hooks_example.py'; import BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py'; import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py'; A `PlaywrightCrawler` is a browser-based crawler. In contrast to HTTP-based crawlers like `ParselCrawler` or `BeautifulSoupCrawler`, it uses a real browser to render pages and extract data. 
It is built on top of the [Playwright](https://playwright.dev/python/) browser automation library. While browser-based crawlers are typically slower and less efficient than HTTP-based crawlers, they can handle dynamic, client-side rendered sites that standard HTTP-based crawlers cannot manage. ## When to use Playwright crawler Use `PlaywrightCrawler` in scenarios that require full browser capabilities, such as: - **Dynamic content rendering**: Required when pages rely on heavy JavaScript to load or modify content in the browser. - **Anti-scraping protection**: Helpful for sites using JavaScript-based security or advanced anti-automation measures. - **Complex cookie management**: Necessary for sites with session or cookie requirements that standard HTTP-based crawlers cannot handle easily. If [HTTP-based crawlers](https://crawlee.dev/python/docs/guides/http-crawlers) are insufficient, `PlaywrightCrawler` can address these challenges. See a [basic example](../examples/playwright-crawler) for a typical usage demonstration. ## Advanced configuration The `PlaywrightCrawler` uses other Crawlee components under the hood, notably `BrowserPool` and `PlaywrightBrowserPlugin`. These components let you configure the browser and context settings, launch multiple browsers, and apply pre-navigation hooks. You can create your own instances of these components and pass them to the `PlaywrightCrawler` constructor. - The `PlaywrightBrowserPlugin` manages how browsers are launched and how browser contexts are created. It accepts [browser launch](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch) and [new context](https://playwright.dev/python/docs/api/class-browser#browser-new-context) options. - The `BrowserPool` manages the lifecycle of browser instances (launching, recycling, etc.). You can customize its behavior to suit your needs. ## Managing multiple browsers The `BrowserPool` allows you to manage multiple browsers. 
Each browser instance is managed by a separate `PlaywrightBrowserPlugin` and can be configured independently. This is useful for scenarios like testing multiple configurations or implementing browser rotation to help avoid blocks or detect different site behaviors. {MultipleLaunchExample} ## Browser launch and context configuration The `PlaywrightBrowserPlugin` provides access to all relevant Playwright configuration options for both [browser launches](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch) and [new browser contexts](https://playwright.dev/python/docs/api/class-browser#browser-new-context). You can specify these options in the constructor of `PlaywrightBrowserPlugin` or `PlaywrightCrawler`: {BrowserConfigurationExample} You can also configure each plugin used by `BrowserPool`: {PluginBrowserConfigExample} For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with `PlaywrightCrawler`. ## Page configuration with lifecycle page hooks For additional setup or event-driven actions around page creation and closure, the `BrowserPool` exposes four lifecycle hooks: `pre_page_create_hook`, `post_page_create_hook`, `pre_page_close_hook`, and `post_page_close_hook`. To use them, create a `BrowserPool` instance and pass it to `PlaywrightCrawler` via the `browser_pool` argument. {BrowserPoolPageHooksExample} ## Navigation hooks Navigation hooks allow for additional configuration at specific points during page navigation. The `pre_navigation_hook` is called before each navigation and provides `PlaywrightPreNavCrawlingContext` - including the [page](https://playwright.dev/python/docs/api/class-page) instance and a `block_requests` helper for filtering unwanted resource types and URL patterns. 
See the [block requests example](https://crawlee.dev/python/docs/examples/playwright-crawler-with-block-requests) for a dedicated walkthrough. Similarly, the `post_navigation_hook` is called after each navigation and provides `PlaywrightPostNavCrawlingContext` - useful for post-load checks such as detecting CAPTCHAs or verifying page state. {NavigationHooksExample} ## Conclusion This guide introduced the `PlaywrightCrawler` and explained how to configure it using `BrowserPool` and `PlaywrightBrowserPlugin`. You learned how to launch multiple browsers, configure browser and context settings, use `BrowserPool` lifecycle page hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! ================================================ FILE: docs/guides/playwright_crawler_adaptive.mdx ================================================ --- id: adaptive-playwright-crawler title: Adaptive Playwright crawler description: Learn how to use the Adaptive Playwright crawler to automatically switch between browser-based and HTTP-only crawling. 
--- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import AdaptivePlaywrightCrawlerHandler from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_adaptive/handler.py'; import AdaptivePlaywrightCrawlerPreNavHooks from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_adaptive/pre_nav_hooks.py'; import AdaptivePlaywrightCrawlerInitBeautifulSoup from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_beautifulsoup.py'; import AdaptivePlaywrightCrawlerInitParsel from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_parsel.py'; import AdaptivePlaywrightCrawlerInitPrediction from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_prediction.py'; An `AdaptivePlaywrightCrawler` is a combination of `PlaywrightCrawler` and an HTTP-based crawler such as `ParselCrawler` or `BeautifulSoupCrawler`. It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit. Detection is done based on the `RenderingTypePredictor` with default implementation `DefaultRenderingTypePredictor`. It predicts which crawling method should be used and learns from already crawled pages. ## When to use AdaptivePlaywrightCrawler Use `AdaptivePlaywrightCrawler` in scenarios where some target pages have to be crawled with `PlaywrightCrawler`, but for others a faster HTTP-based crawler is sufficient. This way, you can achieve lower costs when crawling multiple different websites. Another use case is performing selector-based data extraction without prior knowledge of whether the selector exists in the static page or is dynamically added by code executed in a browsing client. 
## Request handler and adaptive context helpers Request handler for `AdaptivePlaywrightCrawler` works on special context type - `AdaptivePlaywrightCrawlingContext`. This context is sometimes created by HTTP-based sub crawler and sometimes by playwright based sub crawler. Due to its dynamic nature, you can't always access [page](https://playwright.dev/python/docs/api/class-page) object. To overcome this limitation, there are four helper methods on this context that can be called regardless of how the context was created. `wait_for_selector` accepts `css` selector as first argument and timeout as second argument. The function will try to locate this selector and return once it is found (within timeout). In practice this means that if HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the playwright sub crawler and will try to locate the selector within the timeout using playwright. `query_selector_one` accepts `css` selector as first argument and timeout as second argument. This function acts similar to `wait_for_selector`, but it also returns one selector if any selector is found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for `ParselCrawler` and `Tag` for `BeautifulSoupCrawler`. `query_selector_all` same as `query_selector_one`, but returns all found selectors. `parse_with_static_parser` will re-parse the whole page. Return value type is determined by used HTTP-based sub crawler. It has optional arguments: `selector` and `timeout`. If those optional arguments are used then the function first calls `wait_for_selector` and then does the parsing. This can be used in scenario where some specific element can signal that the page is already complete. 
See the following example about how to create a request handler and use context helpers: {AdaptivePlaywrightCrawlerHandler} ## Crawler configuration To use `AdaptivePlaywrightCrawler` it is recommended to use one of the prepared factory methods that will create the crawler with specific HTTP-based sub crawler variant: `AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser` or `AdaptivePlaywrightCrawler.with_parsel_static_parser`. `AdaptivePlaywrightCrawler` is internally composed of two sub crawlers and you can do a detailed configuration of both of them. For detailed configuration options of the sub crawlers, please refer to their pages: `PlaywrightCrawler`, `ParselCrawler`, `BeautifulSoupCrawler`. In the following example you can see how to create and configure `AdaptivePlaywrightCrawler` with two different HTTP-based sub crawlers: {AdaptivePlaywrightCrawlerInitBeautifulSoup} {AdaptivePlaywrightCrawlerInitParsel} ### Prediction related arguments To control which pages are crawled by which method you can use the following arguments: `RenderingTypePredictor` - Class that can give recommendations about which sub crawler should be used for specific url. Predictor will also recommend using both sub crawlers for some page from time to time, to check that the given recommendation was correct. Predictor should be able to learn from previous results and gradually give more reliable recommendations. `result_checker` - Is a function that checks result created from crawling a page. By default, it always returns `True`. `result_comparator` - Is a function that compares two results (HTTP-based sub crawler result and playwright based sub crawler result) and returns `True` if they are considered the same. By default, this function compares calls of context helper `push_data` by each sub crawler. This function is used by `rendering_type_predictor` to evaluate whether HTTP-based crawler has the same results as playwright based sub crawler. 
See the following example about how to pass prediction related arguments: {AdaptivePlaywrightCrawlerInitPrediction} ## Page configuration with pre-navigation hooks In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the `pre_navigation_hook` method of the `AdaptivePlaywrightCrawler`. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of `AdaptivePlaywrightCrawler` it is possible that the hook will be executed for HTTP-based sub crawler or playwright-based sub crawler. Using [page](https://playwright.dev/python/docs/api/class-page) object for hook that will be executed on HTTP-based sub crawler will raise an exception. To overcome this you can use optional argument `playwright_only` = `True` when registering the hook. See the following example about how to register the pre navigation hooks: {AdaptivePlaywrightCrawlerPreNavHooks} ================================================ FILE: docs/guides/playwright_crawler_stagehand.mdx ================================================ --- id: playwright-crawler-stagehand title: Playwright with Stagehand description: How to integrate Stagehand AI-powered automation with PlaywrightCrawler. 
--- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import SupportClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/support_classes.py'; import BrowserClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/browser_classes.py'; import StagehandRun from '!!raw-loader!./code_examples/playwright_crawler_stagehand/stagehand_run.py'; [Stagehand](https://docs.stagehand.dev/) is a framework that combines [Playwright](https://playwright.dev/python/) with AI-driven natural language understanding and decision-making capabilities. With Stagehand, you can use natural language instructions to interact with web pages instead of writing complex selectors and automation logic. Stagehand supports multiple AI models through [`LiteLLM`](https://docs.litellm.ai/docs/). This guide demonstrates how to integrate Stagehand with `PlaywrightCrawler` using [Gemini](https://ai.google.dev/gemini-api/docs) as the AI model provider. :::info This guide is based on stagehand-python v0.4.0 with local configuration settings and may not be compatible with newer versions. ::: ## Get Gemini API key You need to register with [Google AI Studio](https://aistudio.google.com/) and navigate to [Get API key](https://aistudio.google.com/app/apikey) to obtain your API key. ## Create support classes for Stagehand To integrate Stagehand with Crawlee, you need to create wrapper classes that allow `PlaywrightBrowserPlugin` to manage the Playwright lifecycle. Create `CrawleeStagehand` - a custom Stagehand subclass that overrides the `init` method to prevent Stagehand from launching its own Playwright instance. Create `CrawleeStagehandPage` - a wrapper class for `StagehandPage` that implements the [Playwright Page](https://playwright.dev/python/docs/next/api/class-page) behavior expected by `PlaywrightCrawler`. 
{SupportClasses} ## Create browser integration classes You need to create a custom browser plugin and controller that properly initialize Stagehand and obtain browser pages from `StagehandContext`. Create `StagehandPlugin` - a subclass of `PlaywrightBrowserPlugin` that holds the Stagehand instance and creates `PlaywrightPersistentBrowser` instances. Create `StagehandBrowserController` - a subclass of `PlaywrightBrowserController` that lazily initializes `StagehandContext` and creates new pages with AI capabilities on demand. {BrowserClasses} ## Create a crawler Now you can create a `PlaywrightCrawler` that uses Stagehand's AI capabilities to interact with web pages using natural language commands: {StagehandRun} The integration works through several key components: - `CrawleeStagehand` prevents Stagehand from launching its own Playwright instance, allowing Crawlee to manage the browser lifecycle - `StagehandPlugin` extends the Playwright browser plugin to create Stagehand-enabled browser instances - `StagehandBrowserController` uses `StagehandContext` to create pages with AI capabilities - `CrawleeStagehandPage` provides interface compatibility between Stagehand pages and Crawlee's expectations In the request handler, you can use natural language commands like `page.extract('Extract title page')` to perform intelligent data extraction without writing complex selectors. 
================================================ FILE: docs/guides/proxy_management.mdx ================================================ --- id: proxy-management title: Proxy management description: Using proxies to get around those annoying IP-blocks --- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import CodeBlock from '@theme/CodeBlock'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import QuickStartExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/quick_start_example.py'; import IntegrationBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/integration_bs_example.py'; import IntegrationPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/integration_pw_example.py'; import TiersBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/tiers_bs_example.py'; import TiersPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/tiers_pw_example.py'; import InspectionBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/inspecting_bs_example.py'; import InspectionPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/inspecting_pw_example.py'; import SessionBsExample from '!!raw-loader!./code_examples/proxy_management/session_bs_example.py'; import SessionPwExample from '!!raw-loader!./code_examples/proxy_management/session_pw_example.py'; [IP address blocking](https://en.wikipedia.org/wiki/IP_address_blocking) is one of the oldest and most effective ways of preventing access to a website. It is therefore paramount for a good web scraping library to provide easy to use but powerful tools which can work around IP blocking. The most powerful weapon in our anti IP blocking arsenal is a [proxy server](https://en.wikipedia.org/wiki/Proxy_server). 
With Crawlee we can use our own proxy servers or proxy servers acquired from third-party providers. [//]: # (Check out the [avoid blocking guide](./avoid-blocking) for more information about blocking.) ## Quick start If you already have proxy URLs of your own, you can start using them immediately in only a few lines of code. {QuickStartExample} Examples of how to use our proxy URLs with crawlers are shown below in [Crawler integration](#crawler-integration) section. ## Proxy configuration All our proxy needs are managed by the `ProxyConfiguration` class. We create an instance using the `ProxyConfiguration` constructor function based on the provided options. ### Crawler integration `ProxyConfiguration` integrates seamlessly into `BeautifulSoupCrawler` and `PlaywrightCrawler`. {IntegrationBsExample} {IntegrationPwExample} Our crawlers will now use the selected proxies for all connections. ### IP Rotation and session management The `proxy_configuration.new_url()` method allows us to pass a `session_id` parameter. This creates a `session_id`-`proxy_url` pair, ensuring that subsequent `new_url()` calls with the same `session_id` return the same `proxy_url`. This is extremely useful in scraping, because we want to create the impression of a real user. See the `SessionPool` class for more information on how maintaining a real session helps avoid blocking. For more details on session management, check out the [Session management](./session-management) guide. When no `session_id` is provided, our proxy URLs are rotated round-robin. {SessionBsExample} {SessionPwExample} ### Tiered proxies When you use HTTP proxies in real world crawling scenarios, you have to decide which type of proxy to use to reach the sweet spot between cost efficiency and reliably avoiding blocking. Some websites may allow crawling with no proxy, on some you may get away with using datacenter proxies, which are cheap but easily detected, and sometimes you need to use expensive residential proxies. 
To take the guesswork out of this process, Crawlee allows you to configure multiple tiers of proxy URLs. When crawling, it will automatically pick the lowest tier (smallest index) where it doesn't encounter blocking. If you organize your proxy server URLs in tiers so that the lowest tier contains the cheapest, least reliable ones and each higher tier contains more expensive, more reliable ones, you will get an optimal anti-blocking performance. In an active tier, Crawlee will alternate between proxies in a round-robin fashion, just like it would with `proxy_urls`. {TiersBsExample} {TiersPwExample} ## Inspecting current proxy in crawlers The `BeautifulSoupCrawler` and `PlaywrightCrawler` provide access to information about the currently used proxy via the request handler using a `proxy_info` object. This object allows easy access to the proxy URL. {InspectionBsExample} {InspectionPwExample} ================================================ FILE: docs/guides/request_loaders.mdx ================================================ --- id: request-loaders title: Request loaders description: How to manage the requests your crawler will go through. 
--- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py'; import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_basic_example.py'; import RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example.py'; import RlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py'; import SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py'; import SitemapExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example_explicit.py'; import RlBasicPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example_with_persist.py'; import SitemapPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example_with_persist.py'; The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package extends the functionality of the `RequestQueue`, providing additional tools for managing URLs and requests. If you are new to Crawlee and unfamiliar with the `RequestQueue`, consider starting with the [Storages](https://crawlee.dev/python/docs/guides/storages) guide first. Request loaders define how requests are fetched and stored, enabling various use cases such as reading URLs from files, external APIs, or combining multiple sources together. ## Overview The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package introduces the following abstract classes: - `RequestLoader`: The base interface for reading requests in a crawl. 
- `RequestManager`: Extends `RequestLoader` with write capabilities. - `RequestManagerTandem`: Combines a read-only `RequestLoader` with a writable `RequestManager`. And specific request loader implementations: - `RequestList`: A lightweight implementation for managing a static list of URLs. - `SitemapRequestLoader`: A specialized loader that reads URLs from XML and plain-text sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html) with filtering capabilities. Below is a class diagram that illustrates the relationships between these components and the `RequestQueue`: ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Abstract classes %% ======================== class Storage { <<abstract>> + id + name + open() + drop() } class RequestLoader { <<abstract>> + handled_count + total_count + fetch_next_request() + mark_request_as_handled() + is_empty() + is_finished() + to_tandem() } class RequestManager { <<abstract>> + add_request() + add_requests_batched() + reclaim_request() + drop() } %% ======================== %% Specific classes %% ======================== class RequestQueue class RequestList class SitemapRequestLoader class RequestManagerTandem %% ======================== %% Inheritance arrows %% ======================== Storage --|> RequestQueue RequestManager --|> RequestQueue RequestLoader --|> RequestManager RequestLoader --|> RequestList RequestLoader --|> SitemapRequestLoader RequestManager --|> RequestManagerTandem ``` ## Request loaders The `RequestLoader` interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, and checking the status of requests. Concrete implementations, such as `RequestList`, build on this interface to handle specific scenarios. You can create your own custom loader that reads from an external file, web endpoint, database, or any other specific data source.
For more details, refer to the `RequestLoader` API reference. :::info NOTE To learn how to use request loaders in your crawlers, see the [Request manager tandem](#request-manager-tandem) section below. ::: ### Request list The `RequestList` can accept an asynchronous generator as input, allowing requests to be streamed rather than loading them all into memory at once. This can significantly reduce memory usage, especially when working with large sets of URLs. Here is a basic example of working with the `RequestList`: {RlBasicExample} ### Request list with persistence The `RequestList` supports state persistence, allowing it to resume from where it left off after interruption. This is particularly useful for long-running crawls or when you need to pause and resume crawling later. To enable persistence, provide `persist_state_key` and optionally `persist_requests_key` parameters, and disable automatic cleanup by setting `purge_on_start = False` in the configuration. The `persist_state_key` saves the loader's progress, while `persist_requests_key` ensures that the request data doesn't change between runs. For more details on resuming interrupted crawls, see the [Resuming a paused crawl](../examples/resuming-paused-crawl) example. {RlBasicPersistExample} ### Sitemap request loader The `SitemapRequestLoader` is a specialized request loader that reads URLs from sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats. It's particularly useful when you want to crawl a website systematically by following its sitemap structure. :::note The `SitemapRequestLoader` is designed specifically for sitemaps that follow the standard Sitemaps protocol. HTML pages containing links are not supported by this loader - those should be handled by regular crawlers using the `enqueue_links` functionality. 
::: The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The `SitemapRequestLoader` provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory. {SitemapExample} ### Sitemap request loader with persistence Similarly, the `SitemapRequestLoader` supports state persistence to resume processing from where it left off. This is especially valuable when processing large sitemaps that may take considerable time to complete. {SitemapPersistExample} When using persistence with `SitemapRequestLoader`, make sure to use the context manager (`async with`) to properly save the state when the work is completed. ## Request managers The `RequestManager` extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add and reclaim them. This is essential for dynamic crawling projects where new URLs may emerge during the crawl process, or when certain requests fail and need to be retried. For more details, refer to the `RequestManager` API reference. ## Request manager tandem The `RequestManagerTandem` class allows you to combine the read-only capabilities of a `RequestLoader` (like `RequestList`) with the read-write capabilities of a `RequestManager` (like `RequestQueue`). This is useful for scenarios where you need to load initial requests from a static source (such as a file or database) and dynamically add or retry requests during the crawl. Additionally, it provides deduplication capabilities, ensuring that requests are not processed multiple times. Under the hood, `RequestManagerTandem` checks whether the read-only loader still has pending requests. If so, each new request from the loader is transferred to the manager. Any newly added or reclaimed requests go directly to the manager side. 
### Request list with request queue This section describes the combination of the `RequestList` and `RequestQueue` classes. This setup is particularly useful when you have a static list of URLs that you want to crawl, but also need to handle dynamic requests discovered during the crawl process. The `RequestManagerTandem` class facilitates this combination, with the `RequestLoader.to_tandem` method available as a convenient shortcut. Requests from the `RequestList` are processed first by being enqueued into the default `RequestQueue`, which handles persistence and retries for failed requests. {RlExplicitTandemExample} {RlTandemExample} ### Sitemap request loader with request queue Similar to the `RequestList` example above, you can combine a `SitemapRequestLoader` with a `RequestQueue` using the `RequestManagerTandem` class. This setup is particularly useful when you want to crawl URLs from a sitemap while also handling dynamic requests discovered during the crawl process. URLs from the sitemap are processed first by being enqueued into the default `RequestQueue`, which handles persistence and retries for failed requests. {SitemapExplicitTandemExample} {SitemapTandemExample} ## Conclusion This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs and requests. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` and `SitemapRequestLoader` implementations. You also saw practical examples of how to work with these classes to handle various crawling scenarios. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
================================================ FILE: docs/guides/request_router.mdx ================================================ --- id: request-router title: Request router description: Learn how to use the Router class to organize request handlers, error handlers, and pre-navigation hooks in Crawlee. --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BasicRequestHandlers from '!!raw-loader!roa-loader!./code_examples/request_router/basic_request_handlers.py'; import SimpleDefaultHandler from '!!raw-loader!roa-loader!./code_examples/request_router/simple_default_handler.py'; import CustomRouterDefaultOnly from '!!raw-loader!roa-loader!./code_examples/request_router/custom_router_default_only.py'; import HttpPreNavigation from '!!raw-loader!roa-loader!./code_examples/request_router/http_pre_navigation.py'; import ErrorHandler from '!!raw-loader!roa-loader!./code_examples/request_router/error_handler.py'; import FailedRequestHandler from '!!raw-loader!roa-loader!./code_examples/request_router/failed_request_handler.py'; import PlaywrightPreNavigation from '!!raw-loader!roa-loader!./code_examples/request_router/playwright_pre_navigation.py'; import AdaptiveCrawlerHandlers from '!!raw-loader!roa-loader!./code_examples/request_router/adaptive_crawler_handlers.py'; The `Router` class manages request flow and coordinates the execution of user-defined logic in Crawlee projects. It routes incoming requests to appropriate user-defined handlers based on labels, manages error scenarios, and provides hooks for pre-navigation execution. The `Router` serves as the orchestrator for all crawling operations, ensuring that each request is processed by the correct handler according to its type and label. ## Request handlers Request handlers are user-defined functions that process individual requests and their corresponding responses. 
Each handler receives a crawling context as its primary argument, which provides access to the current request, response data, and utility methods for data extraction, link enqueuing, and storage operations. Handlers determine how different types of pages are processed and how data is extracted and stored. :::note The code examples in this guide use `ParselCrawler` for demonstration, but the `Router` works with all crawler types. ::: ### Built-in router Every crawler instance includes a built-in `Router` accessible through the `crawler.router` property. This approach simplifies initial setup and covers basic use cases where request routing requirements are straightforward. {SimpleDefaultHandler} The default handler processes all requests that either lack a label or have a label for which no specific handler has been registered. ### Custom router Applications requiring explicit control over router configuration or router reuse across multiple crawler instances can create custom `Router` instances. Custom routers provide complete control over request routing configuration and enable modular application architecture. Router instances can be configured independently and attached to your crawler instances as needed. You can also implement a custom request router class from scratch or by inheriting from `Router`. This allows you to define custom routing logic or manage request handlers in a different way. {CustomRouterDefaultOnly} ### Advanced routing by labels More complex crawling projects often require different processing logic for various page types. The router supports label-based routing, which allows registration of specialized handlers for specific content categories. This pattern enables clean separation of concerns and targeted processing logic for different URL patterns or content types. {BasicRequestHandlers} ## Error handlers Crawlee provides error handling mechanisms to manage request processing failures. 
It distinguishes between recoverable errors that may succeed on retry and permanent failures that require alternative handling strategies. ### Error handler The error handler executes when exceptions occur during request processing, before any retry attempts. This handler receives the error context and can implement custom recovery logic, modify request parameters, or determine whether the request should be retried. Error handlers enable control over failure scenarios and allow applications to implement error recovery strategies. {ErrorHandler} ### Failed request handler The failed request handler executes when a request has exhausted all retry attempts and is considered permanently failed. This handler serves as the final opportunity to log failures, store failed requests for later analysis, create alternative requests, or implement fallback processing strategies. {FailedRequestHandler} ## Pre-navigation hooks Pre-navigation hooks execute before each request is processed, providing opportunities to configure request parameters, modify browser settings, or implement request-specific optimizations. You can use pre-navigation hooks for example for viewport configuration, resource blocking, timeout management, header customization, custom proxy rotation, and request interception. ### HTTP crawler HTTP crawlers support pre-navigation hooks that execute before making HTTP requests. These hooks enable request modification, header configuration, and other HTTP-specific optimizations. {HttpPreNavigation} ### Playwright crawler Playwright crawlers provide extensive pre-navigation capabilities that allow browser page configuration before navigation. These hooks can modify browser behavior and configure page settings. {PlaywrightPreNavigation} ### Adaptive Playwright crawler The `AdaptivePlaywrightCrawler` implements a dual-hook system with common hooks that execute for all requests and Playwright-specific hooks that execute only when browser automation is required. 
This is perfect for projects that need both static and dynamic content handling. {AdaptiveCrawlerHandlers} ## Conclusion This guide introduced you to the `Router` class and how to organize your crawling logic. You learned how to use built-in and custom routers, implement request handlers with label-based routing, handle errors with error and failed request handlers, and configure pre-navigation hooks for different crawler types. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! ================================================ FILE: docs/guides/running_in_web_server.mdx ================================================ --- id: running-in-web-server title: Running in web server description: Running in web server --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import Crawler from '!!raw-loader!./code_examples/running_in_web_server/crawler.py'; import Server from '!!raw-loader!./code_examples/running_in_web_server/server.py'; Most of the time, Crawlee jobs are run as batch jobs. You have a list of URLs you want to scrape every week or you might want to scrape a whole website once per day. After the scrape, you send the data to your warehouse for analytics. Batch jobs are efficient because they can use Crawlee's built-in autoscaling to fully utilize the resources you have available. But sometimes you have a use-case where you need to return scrape data as soon as possible. There might be a user waiting on the other end so every millisecond counts. This is where running Crawlee in a web server comes in. We will build a simple HTTP server that receives a page URL and returns the page title in the response. 
## Set up a web server There are many popular web server frameworks for Python, such as [Flask](https://flask.palletsprojects.com/en/stable/), [Django](https://www.djangoproject.com/), [Pyramid](https://trypyramid.com/), and others. In this guide, we will use [FastAPI](https://fastapi.tiangolo.com/) to keep things simple. This will be our core server setup: {Server} The server has two endpoints. - `/` - The index gives a short description of the server with an example link to the second endpoint. - `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL. To run the example server, make sure that you have installed [fastapi[standard]](https://fastapi.tiangolo.com/#installation). Then, from the directory where the example code is located, run the following command: ``` fastapi dev server.py ``` ## Create a crawler We will create a standard `ParselCrawler` and use the `keep_alive=True` option to keep the crawler running even if there are no requests currently in the `RequestQueue`. This way it will always be waiting for new requests to come in. {Crawler} The crawler is defined inside of [Lifespan](https://fastapi.tiangolo.com/advanced/events/#lifespan), which is the FastAPI way to run startup/teardown code for the app. There are two objects that we want to save to the app state so that they can be accessed in any endpoint through `request.state`: - `crawler` holds an instance of our crawler and allows the app to interact with it. - `requests_to_results` is a dictionary that is used to temporarily register expected results for each request and populate them when they are made available by the crawler. ================================================ FILE: docs/guides/scaling_crawlers.mdx ================================================ --- id: scaling-crawlers title: Scaling crawlers description: Learn how to scale your crawlers by controlling concurrency and limiting requests per minute.
--- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import MaxTasksPerMinuteExample from '!!raw-loader!roa-loader!./code_examples/scaling_crawlers/max_tasks_per_minute_example.py'; import MinAndMaxConcurrencyExample from '!!raw-loader!roa-loader!./code_examples/scaling_crawlers/min_and_max_concurrency_example.py'; As we build our crawler, we may want to control how many tasks it performs at any given time. In other words, how many requests it makes to the website we are trying to scrape. Crawlee offers several options to fine-tune the number of parallel tasks, limit the number of requests per minute, and optimize scaling based on available system resources. :::tip All of these options are available across all crawlers provided by Crawlee. In this guide, we are using the `BeautifulSoupCrawler` as an example. You should also explore the `ConcurrencySettings`. ::: ## Max tasks per minute The `max_tasks_per_minute` setting in `ConcurrencySettings` controls how many total tasks the crawler can process per minute. It ensures that tasks are spread evenly throughout the minute, preventing a sudden burst at the `max_concurrency` limit followed by idle time. By default, this is set to `Infinity`, meaning the crawler can run at full speed, limited only by `max_concurrency`. Use this option if you want to throttle your crawler to avoid overwhelming the target website with continuous requests. {MaxTasksPerMinuteExample} ## Minimum and maximum concurrency The `min_concurrency` and `max_concurrency` options in the `ConcurrencySettings` define the minimum and maximum number of parallel tasks that can run at any given time. By default, crawlers start with a single parallel task and gradually scale up to the `max_concurrency` limit.
:::caution Avoid setting minimum concurrency too high If you set `min_concurrency` too high compared to the available system resources, the crawler may run very slowly or even crash. It is recommended to stick with the default value and let the crawler automatically adjust concurrency based on the system's available resources. ::: ## Desired concurrency The `desired_concurrency` option in the `ConcurrencySettings` specifies the initial number of parallel tasks to start with, assuming sufficient resources are available. It defaults to the same value as `min_concurrency`. {MinAndMaxConcurrencyExample} ## Autoscaled pool The `AutoscaledPool` manages a pool of asynchronous, resource-intensive tasks that run in parallel. It automatically starts new tasks only when there is enough free CPU and memory. To monitor system resources, it leverages the `Snapshotter` and `SystemStatus` classes. If any task raises an exception, the error is propagated, and the pool is stopped. Every crawler uses an `AutoscaledPool` under the hood. ================================================ FILE: docs/guides/service_locator.mdx ================================================ --- id: service-locator title: Service locator description: Crawlee's service locator is a central registry for global services, managing and providing access to them throughout the whole framework. 
--- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ServiceLocatorConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_configuration.py'; import ServiceLocatorStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_storage_client.py'; import ServiceLocatorEventManager from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_event_manager.py'; import ServiceCrawlerConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_configuration.py'; import ServiceCrawlerStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_storage_client.py'; import ServiceCrawlerEventManager from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_event_manager.py'; import ServiceStorageConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_storage_configuration.py'; import ServiceStorageStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_storage_storage_client.py'; import ServiceConflicts from '!!raw-loader!roa-loader!./code_examples/service_locator/service_conflicts.py'; The `ServiceLocator` is a central registry for global services. It manages and provides access to these services throughout the framework, ensuring their consistent configuration across all components. The service locator manages three core services: `Configuration`, `EventManager`, and `StorageClient`. All services are initialized lazily with defaults when first accessed. ## Services There are three core services that are managed by the service locator: ### Configuration `Configuration` is a class that provides access to application-wide settings and parameters.
It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistence intervals, and various other settings. The configuration can be set directly in the code or via environment variables. ### StorageClient `StorageClient` is the backend implementation for storages in Crawlee. It provides a unified interface for `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying storage implementation. Storage clients were already explained in the storage clients section. Refer to the [Storage clients guide](./storage-clients) for more information about storage clients and how to use them. ### EventManager `EventManager` is responsible for coordinating internal events in Crawlee. It allows you to register event listeners and emit events throughout the framework. Examples of such events include aborting, migrating, system info, or browser-specific events like page created, page closed, and more. It provides a way to listen to events and execute custom logic when certain events occur. ## Service registration There are several ways to register services in Crawlee, depending on your use case and preferences. ### Via service locator Services can be registered globally through the `ServiceLocator` before they are first accessed. There is a singleton `service_locator` instance that is used throughout the framework, making the services available to all components throughout the whole framework. {ServiceLocatorStorageClient} {ServiceLocatorConfiguration} {ServiceLocatorEventManager} ### Via crawler constructors Alternatively, services can be passed to the crawler constructors. They will be registered globally to the `ServiceLocator` under the hood, making them available to all components and ensuring consistent configuration.
{ServiceCrawlerStorageClient} {ServiceCrawlerConfiguration} {ServiceCrawlerEventManager} ### Via storage constructors Alternatively, services can be provided when opening specific storage instances, which uses them only for that particular instance without affecting global configuration. {ServiceStorageStorageClient} {ServiceStorageConfiguration} ## Conflict prevention Once a service has been retrieved from the service locator, attempting to set a different instance will raise a `ServiceConflictError` to prevent accidental configuration conflicts. {ServiceConflicts} ## Conclusion The `ServiceLocator` is a tool for managing global services in Crawlee. It provides a consistent way to configure and access services throughout the framework, ensuring that all components have access to the same configuration and services. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! ================================================ FILE: docs/guides/session_management.mdx ================================================ --- id: session-management title: Session management description: How to manage your cookies, proxy IP rotations and more. 
--- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BasicSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_basic.py'; import HttpSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_http.py'; import BeautifulSoupSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_beautifulsoup.py'; import ParselSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_parsel.py'; import PlaywrightSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_playwright.py'; import StandaloneSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_standalone.py'; import OneSession from '!!raw-loader!roa-loader!./code_examples/session_management/one_session_http.py'; import MultiSessions from '!!raw-loader!roa-loader!./code_examples/session_management/multi_sessions_http.py'; The `SessionPool` class provides a robust way to manage the rotation of proxy IP addresses, cookies, and other custom settings in Crawlee. Its primary advantage is the ability to filter out blocked or non-functional proxies, ensuring that your scraper avoids retrying requests through known problematic proxies. Additionally, it enables storing information tied to specific IP addresses, such as cookies, authentication tokens, and custom headers. This association reduces the probability of detection and blocking by ensuring cookies and other identifiers are used consistently with the same IP address. Finally, it ensures even IP address rotation by randomly selecting sessions. This helps prevent overuse of a limited pool of available IPs, reducing the risk of IP bans and enhancing the efficiency of your scraper. For more details on configuring proxies, refer to the [Proxy management](./proxy-management) guide. 
Now, let's explore examples of how to use the `SessionPool` in different scenarios: - with `BasicCrawler`; - with `HttpCrawler`; - with `BeautifulSoupCrawler`; - with `ParselCrawler`; - with `PlaywrightCrawler`; - without a crawler (standalone usage to manage sessions manually). {BasicSource} {HttpSource} {BeautifulSoupSource} {ParselSource} {PlaywrightSource} {StandaloneSource} These examples demonstrate the basics of configuring and using the `SessionPool`. Please, bear in mind that `SessionPool` requires some time to establish a stable pool of working IPs. During the initial setup, you may encounter errors as the pool identifies and filters out blocked or non-functional IPs. This stabilization period is expected and will improve over time. ## Configuring a single session In some cases, you need full control over session usage. For example, when working with websites requiring authentication or initialization of certain parameters like cookies. When working with a site that requires authentication, we typically don't want multiple sessions with different browser fingerprints or client parameters accessing the site. In this case, we need to configure the `SessionPool` appropriately: {OneSession} ## Binding requests to specific sessions In the previous example, there's one obvious limitation - you're restricted to only one session. In some cases, we need to achieve the same behavior but using multiple sessions in parallel, such as authenticating with different profiles or using different proxies. To do this, use the `session_id` parameter for the `Request` object to bind a request to a specific session: {MultiSessions} ================================================ FILE: docs/guides/storage_clients.mdx ================================================ --- id: storage-clients title: Storage clients description: How to work with storage clients in Crawlee, including the built-in clients and how to create your own. 
--- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import CodeBlock from '@theme/CodeBlock'; import MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/memory_storage_client_basic_example.py'; import FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py'; import FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py'; import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py'; import RegisteringStorageClientsExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_clients_example.py'; import SQLStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/sql_storage_client_basic_example.py'; import SQLStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/sql_storage_client_configuration_example.py'; import RedisStorageClientBasicExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_basic_example.py'; import RedisStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_configuration_example.py'; Storage clients provide a unified interface for interacting with `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. This abstraction makes it easy to switch between different environments, such as local development and cloud production setups. 
## Built-in storage clients Crawlee provides the following storage client implementations: - `FileSystemStorageClient` - Provides persistent file system storage with in-memory caching. - `MemoryStorageClient` - Stores data in memory with no persistence. - `SqlStorageClient` - Provides persistent storage using a SQL database ([SQLite](https://sqlite.org/), [PostgreSQL](https://www.postgresql.org/), [MySQL](https://www.mysql.com/) or [MariaDB](https://mariadb.org/)). Requires installing the extra dependency: `crawlee[sql_sqlite]` for SQLite, `crawlee[sql_postgres]` for PostgreSQL or `crawlee[sql_mysql]` for MySQL and MariaDB. - `RedisStorageClient` - Provides persistent storage using a [Redis](https://redis.io/) database v8.0+. Requires installing the extra dependency `crawlee[redis]`. - [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Abstract classes %% ======================== class StorageClient { <<abstract>> } %% ======================== %% Specific classes %% ======================== class FileSystemStorageClient class MemoryStorageClient class SqlStorageClient class RedisStorageClient class ApifyStorageClient %% ======================== %% Inheritance arrows %% ======================== StorageClient --|> FileSystemStorageClient StorageClient --|> MemoryStorageClient StorageClient --|> SqlStorageClient StorageClient --|> RedisStorageClient StorageClient --|> ApifyStorageClient ``` ### File system storage client The `FileSystemStorageClient` provides persistent storage by writing data directly to the file system. It uses intelligent caching and batch processing for better performance while storing data in human-readable JSON format.
This is the default storage client used by Crawlee when no other storage client is specified, making it ideal for large datasets and long-running operations where data persistence is required. :::warning Concurrency limitation The `FileSystemStorageClient` is not safe for concurrent access from multiple crawler processes. Use it only when running a single crawler process at a time. ::: This storage client is ideal for large datasets, and long-running operations where data persistence is required. Data can be easily inspected and shared with other tools. {FileSystemStorageClientBasicExample} Configuration options for the `FileSystemStorageClient` can be set through environment variables or the `Configuration` class: - **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory for all storage data. - **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start. Data is stored using the following directory structure: ```text {CRAWLEE_STORAGE_DIR}/ ├── datasets/ │ └── {DATASET_NAME}/ │ ├── __metadata__.json │ ├── 000000001.json │ └── 000000002.json ├── key_value_stores/ │ └── {KVS_NAME}/ │ ├── __metadata__.json │ ├── key1.json │ ├── key2.txt │ └── key3.json └── request_queues/ └── {RQ_NAME}/ ├── __metadata__.json ├── {REQUEST_ID_1}.json └── {REQUEST_ID_2}.json ``` Where: - `{CRAWLEE_STORAGE_DIR}` - The root directory for local storage. - `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}` - The unique names for each storage instance (defaults to `"default"`). - Files are stored directly without additional metadata files for simpler structure. Here is an example of how to configure the `FileSystemStorageClient`: {FileSystemStorageClientConfigurationExample} ### Memory storage client The `MemoryStorageClient` stores all data in memory using Python data structures. It provides fast access but does not persist data between runs, meaning all data is lost when the program terminates. 
This storage client is primarily suitable for testing and development, and is usually not a good fit for production use. However, in some cases where speed is prioritized over persistence, it can make sense. :::warning Persistence limitation The `MemoryStorageClient` does not persist data between runs. All data is lost when the program terminates. ::: {MemoryStorageClientBasicExample} ### SQL storage client :::warning Experimental feature The `SqlStorageClient` is experimental. Its API and behavior may change in future releases. ::: The `SqlStorageClient` provides persistent storage using a SQL database (SQLite by default, or PostgreSQL, MySQL, MariaDB). It supports all Crawlee storage types and enables concurrent access from multiple independent clients or processes. :::note dependencies The `SqlStorageClient` is not included in the core Crawlee package. To use it, you need to install Crawlee with the appropriate extra dependency: - For SQLite support, run: pip install 'crawlee[sql_sqlite]' - For PostgreSQL support, run: pip install 'crawlee[sql_postgres]' - For MySQL or MariaDB support, run: pip install 'crawlee[sql_mysql]' ::: By default, SqlStorageClient uses SQLite. To use a different database, just provide the appropriate connection string via the `connection_string` parameter. No other code changes are needed—the same client works for all supported databases. {SQLStorageClientBasicExample} Data is organized in relational tables. Below are the main tables and columns used for each storage type: ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Storage Clients %% ======================== class SqlDatasetClient { <> } class SqlKeyValueStoreClient { <> } %% ======================== %% Dataset Tables %% ======================== class datasets { <> + dataset_id (PK) + internal_name + name + accessed_at + created_at + modified_at + item_count + buffer_locked_until } class dataset_records { <
> + item_id (PK) + dataset_id (FK) + data } class dataset_metadata_buffer { <
> + id (PK) + accessed_at + modified_at + delta_item_count } %% ======================== %% Key-Value Store Tables %% ======================== class key_value_stores { <
> + key_value_store_id (PK) + internal_name + name + accessed_at + created_at + modified_at + buffer_locked_until } class key_value_store_records { <
> + key_value_store_id (FK, PK) + key (PK) + value + content_type + size } class key_value_store_metadata_buffer { <
> + id (PK) + accessed_at + modified_at } %% ======================== %% Client to Table arrows %% ======================== SqlDatasetClient --> datasets SqlDatasetClient --> dataset_records SqlDatasetClient --> dataset_metadata_buffer SqlKeyValueStoreClient --> key_value_stores SqlKeyValueStoreClient --> key_value_store_records SqlKeyValueStoreClient --> key_value_store_metadata_buffer ``` ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Storage Clients %% ======================== class SqlRequestQueueClient { <> } %% ======================== %% Request Queue Tables %% ======================== class request_queues { <
> + request_queue_id (PK) + internal_name + name + accessed_at + created_at + modified_at + had_multiple_clients + handled_request_count + pending_request_count + total_request_count + buffer_locked_until } class request_queue_records { <
> + request_id (PK) + request_queue_id (FK, PK) + data + sequence_number + is_handled + time_blocked_until + client_key } class request_queue_state { <
> + request_queue_id (FK, PK) + sequence_counter + forefront_sequence_counter } class request_queue_metadata_buffer { <
> + id (PK) + accessed_at + modified_at + client_id + delta_handled_count + delta_pending_count + delta_total_count + need_recalc } %% ======================== %% Client to Table arrows %% ======================== SqlRequestQueueClient --> request_queues SqlRequestQueueClient --> request_queue_records SqlRequestQueueClient --> request_queue_state SqlRequestQueueClient --> request_queue_metadata_buffer ``` Configuration options for the `SqlStorageClient` can be set through environment variables or the `Configuration` class: - **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory where the default SQLite database will be created if no connection string is provided. - **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start. Configuration options for the `SqlStorageClient` can be set via constructor arguments: - **`connection_string`** (default: SQLite in `Configuration` storage dir) - SQLAlchemy connection string, e.g. `sqlite+aiosqlite:///my.db`, `postgresql+asyncpg://user:pass@host/db`, `mysql+aiomysql://user:pass@host/db` or `mariadb+aiomysql://user:pass@host/db`. - **`engine`** - Pre-configured SQLAlchemy AsyncEngine (optional). For advanced scenarios, you can configure `SqlStorageClient` with a custom SQLAlchemy engine and additional options via the `Configuration` class. This is useful, for example, when connecting to an external PostgreSQL database or customizing connection pooling. :::warning If you use MySQL or MariaDB, pass the `isolation_level='READ COMMITTED'` argument to `create_async_engine`. MySQL/MariaDB default to the `REPEATABLE READ` isolation level, which can cause unnecessary locking, deadlocks, or stale reads when multiple Crawlee workers access the same tables concurrently. Using `READ COMMITTED` ensures more predictable row-level locking and visibility semantics for `SqlStorageClient`. 
::: {SQLStorageClientConfigurationExample} ### Redis storage client :::warning Experimental feature The `RedisStorageClient` is experimental. Its API and behavior may change in future releases. ::: The `RedisStorageClient` provides persistent storage using [Redis](https://redis.io/) database. It supports concurrent access from multiple independent clients or processes and uses Redis native data structures for efficient operations. :::note dependencies The `RedisStorageClient` is not included in the core Crawlee package. To use it, you need to install Crawlee with the Redis extra dependency: pip install 'crawlee[redis]' Additionally, Redis version 8.0 or higher is required. ::: :::note Redis persistence Data persistence in Redis depends on your [database configuration](https://redis.io/docs/latest/operate/oss_and_stack/management/persistence/). ::: The client requires either a Redis connection string or a pre-configured Redis client instance. Use a pre-configured client when you need custom Redis settings such as connection pooling, timeouts, or SSL/TLS encryption. {RedisStorageClientBasicExample} Data is organized using Redis key patterns. 
Below are the main data structures used for each storage type: ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Storage Client %% ======================== class RedisDatasetClient { <> } %% ======================== %% Dataset Keys %% ======================== class DatasetKeys { datasets:[name]:items - JSON Array datasets:[name]:metadata - JSON Object } class DatasetsIndexes { datasets:id_to_name - Hash datasets:name_to_id - Hash } %% ======================== %% Client to Keys arrows %% ======================== RedisDatasetClient --> DatasetKeys RedisDatasetClient --> DatasetsIndexes ``` ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Storage Clients %% ======================== class RedisKeyValueStoreClient { <> } %% ======================== %% Key-Value Store Keys %% ======================== class KeyValueStoreKeys { key_value_stores:[name]:items - Hash key_value_stores:[name]:metadata_items - Hash key_value_stores:[name]:metadata - JSON Object } class KeyValueStoresIndexes { key_value_stores:id_to_name - Hash key_value_stores:name_to_id - Hash } %% ======================== %% Client to Keys arrows %% ======================== RedisKeyValueStoreClient --> KeyValueStoreKeys RedisKeyValueStoreClient --> KeyValueStoresIndexes ``` ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Storage Clients %% ======================== class RedisRequestQueueClient { <> } %% ======================== %% Request Queue Keys %% ======================== class RequestQueueKeys{ request_queues:[name]:queue - List request_queues:[name]:data - Hash request_queues:[name]:in_progress - Hash request_queues:[name]:added_bloom_filter - Bloom Filter | bloom queue_dedup_strategy request_queues:[name]:handled_bloom_filter - Bloom Filter | bloom queue_dedup_strategy request_queues:[name]:pending_set - Set | default queue_dedup_strategy 
request_queues:[name]:handled_set - Set | default queue_dedup_strategy request_queues:[name]:metadata - JSON Object } class RequestQueuesIndexes { request_queues:id_to_name - Hash request_queues:name_to_id - Hash } %% ======================== %% Client to Keys arrows %% ======================== RedisRequestQueueClient --> RequestQueueKeys RedisRequestQueueClient --> RequestQueuesIndexes ``` Configuration options for the `RedisStorageClient` can be set through environment variables or the `Configuration` class: - **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start. Configuration options for the `RedisStorageClient` can be set via constructor arguments: - **`connection_string`** - Redis connection string, e.g. `redis://localhost:6379/0`. - **`redis`** - Pre-configured Redis client instance (optional). {RedisStorageClientConfigurationExample} ## Creating a custom storage client A storage client consists of two parts: the storage client factory and individual storage type clients. The `StorageClient` acts as a factory that creates specific clients (`DatasetClient`, `KeyValueStoreClient`, `RequestQueueClient`) where the actual storage logic is implemented. Here is an example of a custom storage client that implements the `StorageClient` interface: {CustomStorageClientExample} Custom storage clients can implement any storage logic, such as connecting to a database, using a cloud storage service, or integrating with other systems. They must implement the required methods for creating, reading, updating, and deleting data in the respective storages. ## Registering storage clients Storage clients can be registered in multiple ways: - **Globally** - Using the `ServiceLocator` or passing directly to the crawler. - **Per storage** - When opening a specific storage instance like `Dataset`, `KeyValueStore`, or `RequestQueue`. 
{RegisteringStorageClientsExample} You can also register different storage clients for each storage instance, allowing you to use different backends for different storages. This is useful when you want to use a fast in-memory storage for `RequestQueue` while persisting scraping results in `Dataset` or `KeyValueStore`. ## Conclusion Storage clients in Crawlee provide different backends for data storage. Use `MemoryStorageClient` for testing and fast operations without persistence, or `FileSystemStorageClient` for environments where data needs to persist. You can also create custom storage clients for specialized backends by implementing the `StorageClient` interface. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! ================================================ FILE: docs/guides/storages.mdx ================================================ --- id: storages title: Storages description: How to work with storages in Crawlee, how to manage requests and how to store and retrieve scraping results. 
--- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import OpeningExample from '!!raw-loader!roa-loader!./code_examples/storages/opening.py'; import RqBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_basic_example.py'; import RqWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_with_crawler_example.py'; import RqWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_with_crawler_explicit_example.py'; import RqHelperAddRequestsExample from '!!raw-loader!roa-loader!./code_examples/storages/helper_add_requests_example.py'; import RqHelperEnqueueLinksExample from '!!raw-loader!roa-loader!./code_examples/storages/helper_enqueue_links_example.py'; import DatasetBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_basic_example.py'; import DatasetWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_example.py'; import DatasetWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_explicit_example.py'; import KvsBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_basic_example.py'; import KvsWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_example.py'; import KvsWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_explicit_example.py'; import CleaningDoNotPurgeExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_do_not_purge_example.py'; import CleaningPurgeExplicitlyExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_purge_explicitly_example.py'; Crawlee offers several storage types for managing and persisting your crawling data. 
Request-oriented storages, such as the `RequestQueue`, help you store and deduplicate URLs, while result-oriented storages, like `Dataset` and `KeyValueStore`, focus on storing and retrieving scraping results. This guide explains when to use each type, how to interact with them, and how to control their lifecycle. ## Overview Crawlee's storage system consists of two main layers: - **Storages** (`Dataset`, `KeyValueStore`, `RequestQueue`): High-level interfaces for interacting with different storage types. - **Storage clients** (`MemoryStorageClient`, `FileSystemStorageClient`, etc.): Backend implementations that handle the actual data persistence and management. For more information about storage clients and their configuration, see the [Storage clients guide](./storage-clients). ```mermaid --- config: class: hideEmptyMembersBox: true --- classDiagram %% ======================== %% Abstract classes %% ======================== class Storage { <> } %% ======================== %% Specific classes %% ======================== class Dataset class KeyValueStore class RequestQueue %% ======================== %% Inheritance arrows %% ======================== Storage --|> Dataset Storage --|> KeyValueStore Storage --|> RequestQueue ``` ### Named and unnamed storages Crawlee supports two types of storages: - **Named storages**: Persistent storages with a specific name that persist across runs. These are useful when you want to share data between different crawler runs or access the same storage from multiple places. - **Unnamed storages**: Temporary storages identified by an alias that are scoped to a single run. These are automatically purged at the start of each run (when `purge_on_start` is enabled, which is the default). ### Default storage Each storage type (`Dataset`, `KeyValueStore`, `RequestQueue`) has a default instance that can be accessed without specifying `id`, `name` or `alias`. 
The default unnamed storage is accessed by calling the storage's `open` method without parameters. This is the most common way to use storages in simple crawlers. {OpeningExample} ## Request queue The `RequestQueue` is the primary storage for URLs in Crawlee, especially useful for deep crawling. It supports dynamic addition of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run. The following code demonstrates the usage of the `RequestQueue`: {RqBasicExample} {RqWithCrawlerExample} {RqWithCrawlerExplicitExample} ### Request-related helpers Crawlee provides helper functions to simplify interactions with the `RequestQueue`: - The `add_requests` function allows you to manually add specific URLs to the configured request storage. In this case, you must explicitly provide the URLs you want to be added to the request storage. If you need to specify further details of the request, such as a `label` or `user_data`, you have to pass instances of the `Request` class to the helper. - The `enqueue_links` function is designed to discover new URLs in the current page and add them to the request storage. It can be used with default settings, requiring no arguments, or you can customize its behavior by specifying link element selectors, choosing different enqueue strategies, or applying include/exclude filters to control which URLs are added. See the [Crawl website with relative links](../examples/crawl-website-with-relative-links) example for more details. {RqHelperAddRequestsExample} {RqHelperEnqueueLinksExample} ### Request manager The `RequestQueue` implements the `RequestManager` interface, which provides a unified API for interacting with different request storage types.
If you need custom functionality, you can create your own request storage by subclassing the `RequestManager` class and implementing its required methods. For a detailed explanation of the `RequestManager` and other related components, refer to the [Request loaders guide](https://crawlee.dev/python/docs/guides/request-loaders). ## Dataset The `Dataset` is designed for storing structured data, where each entry has a consistent set of attributes, such as products in an online store or real estate listings. Think of a `Dataset` as a table: each entry corresponds to a row, with attributes represented as columns. Datasets are append-only, allowing you to add new records but not modify or delete existing ones. Every Crawlee project run is associated with a default dataset, typically used to store results specific to that crawler execution. However, using this dataset is optional. The following code demonstrates basic operations of the dataset: {DatasetBasicExample} {DatasetWithCrawlerExample} {DatasetWithCrawlerExplicitExample} ### Dataset-related helpers Crawlee provides the following helper function to simplify interactions with the `Dataset`: - The `push_data` function allows you to manually add data to the dataset. You can optionally specify the dataset ID or its name. ## Key-value store The `KeyValueStore` is designed to save and retrieve data records or files efficiently. Each record is uniquely identified by a key and is associated with a specific MIME type, making the `KeyValueStore` ideal for tasks like saving web page screenshots, PDFs, or tracking the state of crawlers. The following code demonstrates the usage of the `KeyValueStore`: {KvsBasicExample} {KvsWithCrawlerExample} {KvsWithCrawlerExplicitExample} To see a real-world example of how to get the input from the key-value store, see the [Screenshots](https://crawlee.dev/python/docs/examples/capture-screenshots-using-playwright) example. 
### Key-value store-related helpers Crawlee provides the following helper function to simplify interactions with the `KeyValueStore`: - The `get_key_value_store` function retrieves the key-value store for the current crawler run. If the KVS does not exist, it will be created. You can also specify the KVS's ID or its name. ## Cleaning up the storages By default, Crawlee cleans up all unnamed storages (including the default one) at the start of each run, so every crawl begins with a clean state. This behavior is controlled by `Configuration.purge_on_start` (default: `True`). In contrast, named storages are never purged automatically and persist across runs. The exact behavior may vary depending on the storage client implementation. ### When purging happens The cleanup occurs as soon as a storage is accessed: - When opening a storage explicitly (e.g., `RequestQueue.open`, `Dataset.open`, `KeyValueStore.open`). - When using helper functions that implicitly open storages (e.g., `push_data`). - Automatically when `BasicCrawler.run` is invoked. ### Disabling automatic purging To disable automatic purging, set `purge_on_start=False` in your configuration: {CleaningDoNotPurgeExample} ### Manual purging The purge-on-start behavior simply triggers the storage's `purge` method, which removes all data from the storage. If you want to purge the storage manually, you can do so by calling the `purge` method on the storage instance. If you instead want to delete the storage completely, you can call the `drop` method on the storage instance, which will remove the storage, including metadata and all its data. {CleaningPurgeExplicitlyExample} Note that purging behavior may vary between storage client implementations. For more details on storage configuration and client implementations, see the [Storage clients guide](./storage-clients). ## Conclusion This guide introduced you to the different storage types available in Crawlee and how to interact with them.
You learned about the distinction between named storages (persistent across runs) and unnamed storages with aliases (temporary and purged on start). You discovered how to manage requests using the `RequestQueue` and store and retrieve scraping results using the `Dataset` and `KeyValueStore`. You also learned how to use helper functions to simplify interactions with these storages and how to control storage cleanup behavior. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! ================================================ FILE: docs/guides/trace_and_monitor_crawlers.mdx ================================================ --- id: trace-and-monitor-crawlers title: Trace and monitor crawlers description: Learn how to instrument your crawlers with OpenTelemetry to trace request handling, identify bottlenecks, monitor performance, and visualize telemetry data using Jaeger for performance optimization. --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import InstrumentCrawler from '!!raw-loader!./code_examples/trace_and_monitor_crawlers/instrument_crawler.py'; [OpenTelemetry](https://opentelemetry.io/) is a collection of APIs, SDKs, and tools to instrument, generate, collect, and export telemetry data (metrics, logs, and traces) to help you analyze your software’s performance and behavior. In the context of crawler development, it can be used to better understand how the crawler works internally, identify bottlenecks, debug, log metrics, and more. The topic described in this guide requires at least a basic understanding of OpenTelemetry. A good place to start is [What is OpenTelemetry?](https://opentelemetry.io/docs/what-is-opentelemetry/).
This guide shows how to set up OpenTelemetry and instrument a specific crawler to see traces of individual requests that are being processed by the crawler. OpenTelemetry on its own does not provide an out-of-the-box tool for convenient visualisation of the exported data (apart from printing to the console), but there are several good tools available for that. In this guide, we will use [Jaeger](https://www.jaegertracing.io/) to visualise the telemetry data. To better understand concepts such as exporter, collector, and visualisation backend, please refer to the [OpenTelemetry documentation](https://opentelemetry.io/docs/collector/). ## Set up the Jaeger This guide will show how to set up the environment locally to run the example code and visualize the telemetry data in Jaeger, which will be running locally in a [Docker](https://www.docker.com/) container. To start the preconfigured Docker container, you can use the following command: ```bash docker run -d --name jaeger -e COLLECTOR_OTLP_ENABLED=true -p 16686:16686 -p 4317:4317 -p 4318:4318 jaegertracing/all-in-one:latest ``` For more details about the Jaeger setup, see the [getting started](https://www.jaegertracing.io/docs/2.7/getting-started/) section in their documentation. You can see the Jaeger UI in your browser by navigating to http://localhost:16686 ## Instrument the Crawler Now you can proceed with instrumenting the crawler to send the telemetry data to Jaeger and running it. To have the Python environment ready, you should install either **crawlee[all]** or **crawlee[otel]**. This will ensure that the OpenTelemetry dependencies are installed and that you can run the example code snippet. In the following example, you can see the function `instrument_crawler` that contains the instrumentation setup and is called before the crawler is started. If you have already set up Jaeger, you can just run the following code snippet.
{InstrumentCrawler} ## Analyze the results In the Jaeger UI, you can search for different traces, apply filtering, compare traces, view their detailed attributes, view timing details, and more. For a detailed description of the tool's capabilities, please refer to the [Jaeger documentation](https://www.jaegertracing.io/docs/1.47/deployment/frontend-ui/#trace-page). ![Jaeger search view](/img/guides/jaeger_otel_search_view_example.png 'Example visualisation of search view in Jaeger') ![Jaeger trace view](/img/guides/jaeger_otel_trace_example.png 'Example visualisation of crawler request trace in Jaeger') You can use different tools to consume the OpenTelemetry data that might better suit your needs. Please see the list of known vendors in the [OpenTelemetry documentation](https://opentelemetry.io/ecosystem/vendors/). ## Customize the instrumentation You can customize the `CrawlerInstrumentor`. Depending on the arguments used during its initialization, the instrumentation will be applied to different parts of the Crawlee code. By default, it instruments some functions that can give quite a good picture of how each individual request is handled. To turn this default instrumentation off, you can pass `request_handling_instrumentation=False` during initialization. You can also extend instrumentation by passing the `instrument_classes=[...]` initialization argument, which contains the classes you want to be auto-instrumented. All their public methods will be automatically instrumented. Bear in mind that instrumentation has some runtime costs as well. The more instrumentation is used, the more overhead it will add to the crawler execution. You can also create your own instrumentation by selecting only the methods you want to instrument. For more details, see the `CrawlerInstrumentor` source code and the [Python documentation for OpenTelemetry](https://opentelemetry.io/docs/languages/python/).
If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). ================================================ FILE: docs/introduction/01_setting_up.mdx ================================================ --- id: setting-up title: Setting up --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; This guide will help you get started with Crawlee by setting it up on your computer. Follow the steps below to ensure a smooth installation process. ## Prerequisites Before installing Crawlee itself, make sure that your system meets the following requirements: - **Python 3.10 or higher**: Crawlee requires Python 3.10 or a newer version. You can download Python from the [official website](https://python.org/downloads/). - **Python package manager**: While this guide uses [pip](https://pip.pypa.io/) (the most common package manager), you can also use any package manager you want. You can download pip from the [official website](https://pip.pypa.io/en/stable/installation/). ### Verifying prerequisites To check if Python and pip are installed, run the following commands: ```sh python --version ``` ```sh python -m pip --version ``` If these commands return the respective versions, you're ready to continue. ## Installing Crawlee Crawlee is available as [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal. 
### Basic installation To install the core package, run: ```sh python -m pip install crawlee ``` After installation, verify that Crawlee is installed correctly by checking its version: ```sh python -c 'import crawlee; print(crawlee.__version__)' ``` ### Full installation If you do not mind the package size, you can run the following command to install Crawlee with all optional features: ```sh python -m pip install 'crawlee[all]' ``` ### Installing specific extras Depending on your use case, you may want to install specific extras to enable additional functionality: For using the `BeautifulSoupCrawler`, install the `beautifulsoup` extra: ```sh python -m pip install 'crawlee[beautifulsoup]' ``` For using the `ParselCrawler`, install the `parsel` extra: ```sh python -m pip install 'crawlee[parsel]' ``` For using the `CurlImpersonateHttpClient`, install the `curl-impersonate` extra: ```sh python -m pip install 'crawlee[curl-impersonate]' ``` If you plan to use a (headless) browser with `PlaywrightCrawler`, install Crawlee with the `playwright` extra: ```sh python -m pip install 'crawlee[playwright]' ``` After installing the playwright extra, install the necessary Playwright dependencies: ```sh playwright install ``` ### Installing multiple extras You can install multiple extras at once by using a comma as a separator: ```sh python -m pip install 'crawlee[beautifulsoup,curl-impersonate]' ``` ## Start a new project The quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared templates. The CLI helps you set up a new project in seconds. ### Using Crawlee CLI with uv First, ensure you have [uv](https://pypi.org/project/uv/) installed. You can check if it is installed by running: ```sh uv --version ``` If [uv](https://pypi.org/project/uv/) is not installed, follow the official [installation guide](https://docs.astral.sh/uv/getting-started/installation/). 
Then, run the Crawlee CLI using `uvx` and choose from the available templates: ```sh uvx 'crawlee[cli]' create my-crawler ``` ### Using Crawlee CLI directly If you already have `crawlee` installed, you can spin it up by running: ```sh crawlee create my_crawler ``` Follow the interactive prompts in the CLI to choose a crawler type and set up your new project. ### Running your project To run your newly created project, navigate to the project directory, activate the virtual environment, and execute the Python interpreter with the project module: <Tabs groupId="operating-systems"> <TabItem value="linux-macos" label="Linux/macOS" default> <CodeBlock language="sh"> cd my_crawler/ source .venv/bin/activate python -m my_crawler </CodeBlock> </TabItem> <TabItem value="windows" label="Windows"> <CodeBlock language="sh"> cd my_crawler/ venv\Scripts\activate python -m my_crawler </CodeBlock> </TabItem> </Tabs> Congratulations! You have successfully set up and executed your first Crawlee project. ## Next steps Next, you will learn how to create a very simple crawler and get to know the Crawlee components while building it. ================================================ FILE: docs/introduction/02_first_crawler.mdx ================================================ --- id: first-crawler title: First crawler --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import RequestQueueExample from '!!raw-loader!roa-loader!./code_examples/02_request_queue.py'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/02_bs.py'; import BeautifulSoupBetterExample from '!!raw-loader!roa-loader!./code_examples/02_bs_better.py'; Now, you will build your first crawler. But before you do, let's briefly introduce the Crawlee classes involved in the process. ## How Crawlee works There are 3 main crawler classes available for use in Crawlee. - `BeautifulSoupCrawler` - `ParselCrawler` - `PlaywrightCrawler` We'll talk about their differences later. Now, let's talk about what they have in common. 
The general idea of each crawler is to go to a web page, open it, do some stuff there, save some results, continue to the next page, and repeat this process until the crawler's done its job. So the crawler always needs to find answers to two questions: _Where should I go?_ and _What should I do there?_ Answering those two questions is the only required setup. The crawlers have reasonable defaults for everything else. ### The where - `Request` and `RequestQueue` All crawlers use instances of the `Request` class to determine where they need to go. Each request may hold a lot of information, but at the very least, it must hold a URL - a web page to open. But having only one URL would not make sense for crawling. Sometimes you have a pre-existing list of your own URLs that you wish to visit, perhaps a thousand. Other times you need to build this list dynamically as you crawl, adding more and more URLs to the list as you progress. Most of the time, you will use both options. The requests are stored in a `RequestQueue`, a dynamic queue of `Request` instances. You can seed it with start URLs and also add more requests while the crawler is running. This allows the crawler to open one page, extract interesting data, such as links to other pages on the same domain, add them to the queue (called _enqueuing_) and repeat this process to build a queue of virtually unlimited number of URLs. ### The what - request handler In the request handler you tell the crawler what to do at each and every page it visits. You can use it to handle extraction of data from the page, processing the data, saving it, calling APIs, doing calculations and so on. The request handler is a user-defined function, invoked automatically by the crawler for each `Request` from the `RequestQueue`. It always receives a single argument - `BasicCrawlingContext` (or its descendants). 
Its properties change depending on the crawler class used, but it always includes the `request` property, which represents the currently crawled URL and related metadata. ## Building a crawler Let's put the theory into practice and start with something easy. Visit a page and get its HTML title. In this tutorial, you'll scrape the Crawlee website [https://crawlee.dev](https://crawlee.dev), but the same code will work for any website. ### Adding requests to the crawling queue Earlier you learned that the crawler uses a queue of requests as its source of URLs to crawl. Let's create it and add the first request. {RequestQueueExample} The `RequestQueue.add_request` method automatically converts the object with URL string to a `Request` instance. So now you have a `RequestQueue` that holds one request which points to `https://crawlee.dev`. :::tip Bulk add requests The code above is for illustration of the request queue concept. Soon you'll learn about the `BasicCrawler.add_requests` method which allows you to skip this initialization code, and it also supports adding a large number of requests without blocking. ::: ### Building a BeautifulSoupCrawler Crawlee comes with three main crawler classes: `BeautifulSoupCrawler`, `ParselCrawler`, and `PlaywrightCrawler`. You can read their short descriptions in the [Quick start](../quick-start) lesson. Unless you have a good reason to start with a different one, you should try building a `BeautifulSoupCrawler` first. It is an HTTP crawler with HTTP2 support, anti-blocking features and integrated HTML parser - [BeautifulSoup](https://pypi.org/project/beautifulsoup4/). It's fast, simple, cheap to run and does not require complicated dependencies. The only downside is that it won't work out of the box for websites which require JavaScript rendering. But you might not need JavaScript rendering at all, because many modern websites use server-side rendering. Let's continue with the earlier `RequestQueue` example. 
{BeautifulSoupExample} When you run the example, you will see the title of https://crawlee.dev printed to the log. What really happens is that `BeautifulSoupCrawler` first makes an HTTP request to `https://crawlee.dev`, then parses the received HTML with BeautifulSoup and makes it available as the `context` argument of the request handler. ```log [__main__] INFO The title of "https://crawlee.dev" is "Crawlee · Build reliable crawlers. Fast. | Crawlee". ``` ### Add requests faster Earlier we mentioned that you'll learn how to use the `BasicCrawler.add_requests` method to skip the request queue initialization. It's simple. Every crawler has an implicit `RequestQueue` instance, and you can add requests to it with the `BasicCrawler.add_requests` method. In fact, you can go even further and just use the first parameter of `crawler.run()`! {BeautifulSoupBetterExample} When you run this code, you'll see exactly the same output as with the earlier, longer example. The `RequestQueue` is still there, it's just managed by the crawler automatically. :::info This method not only makes the code shorter, it will help with performance too! Internally it calls `RequestQueue.add_requests_batched` method. It will wait only for the initial batch of 1000 requests to be added to the queue before resolving, which means the processing will start almost instantly. After that, it will continue adding the rest of the requests in the background (again, in batches of 1000 items, once every second). ::: ## Next steps Next, you'll learn about crawling links. That means finding new URLs on the pages you crawl and adding them to the `RequestQueue` for the crawler to visit. 
================================================ FILE: docs/introduction/03_adding_more_urls.mdx ================================================ --- id: adding-more-urls title: Adding more URLs --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import OriginalCodeExample from '!!raw-loader!roa-loader!./code_examples/03_original_code.py'; import FindingNewLinksExample from '!!raw-loader!roa-loader!./code_examples/03_finding_new_links.py'; import EnqueueStrategyExample from '!!raw-loader!roa-loader!./code_examples/03_enqueue_strategy.py'; import GlobsExample from '!!raw-loader!roa-loader!./code_examples/03_globs.py'; import TransformExample from '!!raw-loader!roa-loader!./code_examples/03_transform_request.py'; Previously you've built a very simple crawler that downloads HTML of a single page, reads its title and prints it to the console. This is the original source code: {OriginalCodeExample} Now you'll use the example from the previous section and improve on it. You'll add more URLs to the queue and thanks to that the crawler will keep going, finding new links, enqueuing them into the `RequestQueue` and then scraping them. ## How crawling works The process is simple: 1. Find new links on the page. 2. Filter only those pointing to the same domain, in this case [crawlee.dev](https://crawlee.dev/). 3. Enqueue (add) them to the `RequestQueue`. 4. Visit the newly enqueued links. 5. Repeat the process. In the following paragraphs you will learn about the `enqueue_links` function which simplifies crawling to a single function call. :::tip context awareness The `enqueue_links` function is context aware. It means that it will read the information about the currently crawled page from the context, and you don't need to explicitly provide any arguments. However, you can specify filtering criteria or an enqueuing strategy if desired. 
It will find the links and automatically add the links to the running crawler's `RequestQueue`. ::: ## Limit your crawls When you're just testing your code or when your crawler could potentially find millions of links, it's very useful to set a maximum limit of crawled pages. The option is called `max_requests_per_crawl`, is available in all crawlers, and you can set it like this: ```python crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) ``` This means that no new requests will be started after the 10th request is finished. The actual number of processed requests might be a little higher thanks to parallelization, because the running requests won't be forcefully aborted. It's not even possible in most cases. ## Finding new links There are numerous approaches to finding links to follow when crawling the web. For our purposes, we will be looking for `<a>` elements that contain the `href` attribute because that's what you need in most cases. For example: ```html <a href="https://crawlee.dev/docs/introduction">This is a link to Crawlee introduction</a> ``` Since this is the most common case, it is also the `enqueue_links` default. {FindingNewLinksExample} If you need to override the default selection of elements in `enqueue_links`, you can use the `selector` argument. ```python await context.enqueue_links(selector='a.article-link') ``` ## Filtering links to same domain Websites typically contain a lot of links that lead away from the original page. This is normal, but when crawling a website, we usually want to crawl that one site and not let our crawler wander away to Google, Facebook and Twitter. Therefore, we need to filter out the off-domain links and only keep the ones that lead to the same domain. ```python # The default behavior of enqueue_links is to stay on the same hostname, so it does not require # any parameters. This will ensure the subdomain stays the same. await context.enqueue_links() ``` The default behavior of `enqueue_links` is to stay on the same hostname. This **does not include subdomains**. 
To include subdomains in your crawl, use the `strategy` argument. The `strategy` argument is an instance of the `EnqueueStrategy` type alias. {EnqueueStrategyExample} When you run the code, you will see the crawler log the **title** of the first page, then the **enqueueing** message showing number of URLs, followed by the **title** of the first enqueued page and so on and so on. ## Skipping duplicate URLs Skipping of duplicate URLs is critical, because visiting the same page multiple times would lead to duplicate results. This is automatically handled by the `RequestQueue` which deduplicates requests using their `unique_key`. This `unique_key` is automatically generated from the request's URL by lowercasing the URL, lexically ordering query parameters, removing fragments and a few other tweaks that ensure the queue only includes unique URLs. ## Advanced filtering arguments While the defaults for `enqueue_links` can be often exactly what you need, it also gives you fine-grained control over which URLs should be enqueued. One way we already mentioned above. It is using the `EnqueueStrategy` type alias. You can use the `all` strategy if you want to follow every single link, regardless of its domain, or you can enqueue links that target the same domain name with the `same-domain` strategy. ```python # Wanders the internet. await context.enqueue_links(strategy='all') ``` ### Filter URLs with patterns For even more control, you can use the `include` or `exclude` parameters, either as glob patterns or regular expressions, to filter the URLs. Refer to the API documentation for `enqueue_links` for detailed information on these and other available options. {GlobsExample} ### Transform requests before enqueuing For cases where you need to modify or filter requests before they are enqueued, you can use the `transform_request_function` parameter. 
This function receives a `RequestOptions` object and should return either a modified `RequestOptions` object, or a string of type `RequestTransformAction`, which only allows the values `skip` and `unchanged`. Returning `skip` means the request will be skipped, while `unchanged` will add it without any changes. {TransformExample} ## Next steps Next, you will start your project of scraping a production website and learn some more Crawlee tricks in the process. ================================================ FILE: docs/introduction/04_real_world_project.mdx ================================================ --- id: real-world-project title: Real-world project --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import SanityCheckExample from '!!raw-loader!roa-loader!./code_examples/04_sanity_check.py'; > _Hey, guys, you know, it's cool that we can scrape the `<title>` elements of web pages, but that's not very useful. Can we finally scrape some real data and save it somewhere in a machine-readable format? Because that's why I started reading this tutorial in the first place!_ We hear you, young padawan! First, learn how to crawl, you must. Only then, walk through data, you can! ## Making a production-grade crawler Making a production-grade crawler is not difficult, but there are many pitfalls of scraping that can catch you off guard. So for the real world project you'll learn how to scrape a [Warehouse store example](https://warehouse-theme-metal.myshopify.com/collections) instead of the Crawlee website. It contains a list of products of different categories, and each product has its own detail page. The website requires JavaScript rendering, which allows us to showcase more features of Crawlee. We've also added some helpful tips that prepare you for the real-world issues that you will surely encounter when scraping at scale. :::tip Not interested in theory? 
If you're not interested in crawling theory, feel free to [skip to the next chapter](./crawling) and get right back to coding. ::: ## Drawing a plan Sometimes scraping is really straightforward, but most of the time, it really pays off to do a bit of research first and try to answer some of these questions: - How is the website structured? - Can I scrape it only with HTTP requests (read "with some <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>, e.g. <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>")? - Do I need a headless browser for something? - Are there any anti-scraping protections in place? - Do I need to parse the HTML or can I get the data otherwise, such as directly from the website's API? For the purposes of this tutorial, let's assume that the website cannot be scraped with <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>. It actually can, but we would have to dive a bit deeper than this introductory guide allows. So for now we will make things easier for you, scrape it with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, and you'll learn about headless browsers in the process. ## Choosing the data you need A good first step is to figure out what data you want to scrape and where to find it. For the time being, let's just agree that we want to scrape all products from all categories available on the [all collections page of the store](https://warehouse-theme-metal.myshopify.com/collections) and for each product we want to get its: - URL - Manufacturer - SKU - Title - Current price - Stock available You will notice that some information is available directly on the list page, but for details such as "SKU" we'll also need to open the product's detail page. ![data to scrape](/img/getting-started/scraping-practice.jpg 'Overview of data to be scraped.') ### The start URL(s) This is where you start your crawl. It's convenient to start as close to the data as possible. 
For example, it wouldn't make much sense to start at https://warehouse-theme-metal.myshopify.com and look for a `collections` link there, when we already know that everything we want to extract can be found at the https://warehouse-theme-metal.myshopify.com/collections page. ## Exploring the page Let's take a look at the https://warehouse-theme-metal.myshopify.com/collections page more carefully. There are some **categories** on the page, and each category has a list of **items**. On some category pages, at the bottom you will notice there are links to the next pages of results. This is usually called **the pagination**. ### Categories and sorting When you click the categories, you'll see that they load a page of products filtered by that category. By going through a few categories and observing the behavior, we can also observe that we can sort by different conditions (such as `Best selling`, or `Price, low to high`), but for this example, we will not be looking into those. :::caution Limited pagination Be careful, because on some websites, like [amazon.com](https://amazon.com), this is not true and the sum of products in categories is actually larger than what's available without filters. Learn more in our [tutorial on scraping websites with limited pagination](https://docs.apify.com/tutorials/scrape-paginated-sites). ::: ### Pagination The pagination of the demo Warehouse Store is simple enough. When switching between pages, you will see that the URL changes to: ```text https://warehouse-theme-metal.myshopify.com/collections/headphones?page=2 ``` Try clicking on the link to page 4. You'll see that the pagination links update and show more pages. But can you trust that this will include all pages and won't stop at some point? :::caution Test your assumptions Similarly to the issue with filters explained above, the existence of pagination does not guarantee that you can simply paginate through all the results. Always test your assumptions about pagination. 
Otherwise, you might miss a chunk of results, and not even know about it. ::: At the time of writing the `Headphones` collection results counter showed 75 results - products. Quick count of products on one page of results makes 24. 6 rows times 4 products. This means that there are 4 pages of results. If you're not convinced, you can visit a page somewhere in the middle, like `https://warehouse-theme-metal.myshopify.com/collections/headphones?page=2` and see how the pagination looks there. ## The crawling strategy Now that you know where to start and how to find all the collection details, let's look at the crawling process. 1. Visit the store page containing the list of categories (our start URL). 2. Enqueue all links to all categories. 3. Enqueue all product pages from the current page. 4. Enqueue links to next pages of results. 5. Open the next page in queue. - When it's a results list page, go to 2. - When it's a product page, scrape the data. 6. Repeat until all results pages and all products have been processed. `PlaywrightCrawler` will make sure to visit the pages for you, if you provide the correct requests, and you already know how to enqueue pages, so this should be fairly easy. Nevertheless, there are a few more tricks that we'd like to showcase. ## Sanity check Let's check that everything is set up correctly before writing the scraping logic itself. You might realize that something in your previous analysis doesn't quite add up, or the website might not behave exactly as you expected. The example below creates a new crawler that visits the start URL and prints the text content of all the categories on that page. When you run the code, you will see the _very badly formatted_ content of the individual category card. <RunnableCodeBlock className="language-python" language="python"> {SanityCheckExample} </RunnableCodeBlock> If you're wondering how to get that `.collection-block-item` selector, we'll explain it in the next section on DevTools. 
## DevTools - the scraper's toolbox :::info DevTool choice We'll use Chrome DevTools here, since it's the most common browser, but feel free to use any other, they're all very similar. ::: Let's open DevTools by going to https://warehouse-theme-metal.myshopify.com/collections in Chrome and then right-clicking anywhere in the page and selecting **Inspect**, or by pressing **F12** or whatever your system prefers. With DevTools, you can inspect or manipulate any aspect of the currently open web page. You can learn more about DevTools in their [official documentation](https://developer.chrome.com/docs/devtools/). ## Selecting elements In the DevTools, choose the **Select an element** tool and try hovering over one of the collection cards. ![select an element](/img/getting-started/select-an-element.jpg 'Finding the select an element tool.') You'll see that you can select different elements inside the card. Instead, select the whole card, not just some of its contents, such as its title or description. ![selected element](/img/getting-started/selected-element.jpg 'Selecting an element by hovering over it.') Selecting an element will highlight it in the DevTools HTML inspector. When you carefully look at the elements, you'll see that there are some **classes** attached to the different HTML elements. Those are called **CSS classes**, and we can make use of them in scraping. Conversely, by hovering over elements in the HTML inspector, you will see them highlight on the page. Inspect the page's structure around the collection card. You'll see that all the card's data is displayed in an `<a>` element with a `class` attribute that includes **collection-block-item**. It should now make sense how we got that `.collection-block-item` selector. It's just a way to find all elements that are annotated with the `collection-block-item` class. It's always a good idea to double-check that you're not getting any unwanted elements with this class. 
To do that, go into the **Console** tab of DevTools and run: ```ts document.querySelectorAll('.collection-block-item'); ``` You will see that only the 31 collection cards will be returned, and nothing else. :::tip Learn more about CSS selectors and DevTools CSS selectors and DevTools are quite a big topic. If you want to learn more, visit the [Web scraping for beginners course](https://developers.apify.com/academy/web-scraping-for-beginners) in the Apify Academy. **It's free and open-source** ❤️. ::: ## Next steps Next, you will crawl the whole store, including all the listing pages and all the product detail pages. ================================================ FILE: docs/introduction/05_crawling.mdx ================================================ --- id: crawling title: Crawling --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import CrawlingListingExample from '!!raw-loader!roa-loader!./code_examples/05_crawling_listing.py'; import CrawlingDetailExample from '!!raw-loader!roa-loader!./code_examples/05_crawling_detail.py'; To crawl the whole [Warehouse store example](https://warehouse-theme-metal.myshopify.com/collections) and find all the data, you first need to visit all the pages with products - going through all categories available and also all the product detail pages. ## Crawling the listing pages In previous lessons, you used the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function like this: ```python await enqueue_links() ``` While useful in that scenario, you need something different now. Instead of finding all the `<a href="..">` elements with links to the same hostname, you need to find only the specific ones that will take your crawler to the next page of results. Otherwise, the crawler will visit a lot of other pages that you're not interested in. 
Using the power of DevTools and yet another <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> parameter, this becomes fairly easy. <RunnableCodeBlock className="language-python" language="python"> {CrawlingListingExample} </RunnableCodeBlock> The code should look pretty familiar to you. It's a very simple request handler where we log the currently processed URL to the console and enqueue more links. But there are also a few new, interesting additions. Let's break it down. ### The `selector` parameter of `enqueue_links` When you previously used <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>, you were not providing any `selector` parameter, and it was fine, because you wanted to use the default value, which is `a` - finds all `<a>` elements. But now, you need to be more specific. There are multiple `<a>` links on the `Categories` page, and you're only interested in those that will take your crawler to the available list of results. Using the DevTools, you'll find that you can select the links you need using the `.collection-block-item` selector, which selects all the elements that have the `class=collection-block-item` attribute. ### The `label` of `enqueue_links` You will see `label` used often throughout Crawlee, as it's a convenient way of labelling a <ApiLink to="class/Request">`Request`</ApiLink> instance for quick identification later. You can access it with `request.label` and it's a `string`. You can name your requests any way you want. Here, we used the label `CATEGORY` to note that we're enqueueing pages that represent a category of products. The <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function will add this label to all requests before enqueueing them to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. Why this is useful will become obvious in a minute. 
## Crawling the detail pages In a similar fashion, you need to collect all the URLs to the product detail pages, because only from there you can scrape all the data you need. The following code only repeats the concepts you already know for another set of links. <RunnableCodeBlock className="language-python" language="python"> {CrawlingDetailExample} </RunnableCodeBlock> The crawling code is now complete. When you run the code, you'll see the crawler visit all the listing URLs and all the detail URLs. ## Next steps This concludes the Crawling lesson, because you have taught the crawler to visit all the pages it needs. Let's continue with scraping data. ================================================ FILE: docs/introduction/06_scraping.mdx ================================================ --- id: scraping title: Scraping --- import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ScrapingExample from '!!raw-loader!roa-loader!./code_examples/06_scraping.py'; In the [Real-world project](./real-world-project#choosing-the-data-you-need) chapter, you've created a list of the information you wanted to collect about the products in the example Warehouse store. Let's review that and figure out ways to access the data. - URL - Manufacturer - SKU - Title - Current price - Stock available ![data to scrape](/img/getting-started/scraping-practice.jpg 'Overview of data to be scraped.') ## Scraping the URL and manufacturer Some information is lying right there in front of us without even having to touch the product detail pages. The `URL` we already have - the `context.request.url`. And by looking at it carefully, we realize that we can also extract the manufacturer from the URL (as all product urls start with `/products/<manufacturer>`). We can just split the `string` and be on our way then! :::info url vs loaded url You can use `request.loaded_url` as well. 
Remember the difference: `request.url` is what you enqueue, `request.loaded_url` is what gets processed (after possible redirects). ::: By splitting the `request.url`, we can extract the manufacturer name directly from the URL. This is done by first splitting the URL to get the product identifier and then splitting that identifier to get the manufacturer name. ```python # context.request.url: # https://warehouse-theme-metal.myshopify.com/products/sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440 # Split the URL and get the last part. url_part = context.request.url.split('/').pop() # url_part: sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440 # Split the last part by '-' and get the first element. manufacturer = url_part.split('-')[0] # manufacturer: 'sennheiser' ``` :::tip Storing information It's a matter of preference, whether to store this information separately in the resulting dataset, or not. Whoever uses the dataset can easily parse the `manufacturer` from the `URL`, so should you duplicate the data unnecessarily? Our opinion is that unless the increased data consumption would be too large to bear, it's better to make the dataset as rich as possible. For example, someone might want to filter by `manufacturer`. ::: :::caution Adapt and extract One thing you may notice is that the `manufacturer` might have a `-` in its name. If that's the case, your best bet is extracting it from the details page instead, but it's not mandatory. At the end of the day, you should always adjust and pick the best solution for your use case, and website you are crawling. ::: Now it's time to add more data to the results. Let's open one of the product detail pages, for example the [Sony XBR-950G](https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv) page and use our DevTools-Fu 🥋 to figure out how to get the title of the product. 
## Scraping title To scrape the product title from a webpage, you need to identify its location in the HTML structure. By using the element selector tool in your browser's DevTools, you can see that the title is within an `<h1>` tag, which is a common practice for important headers. This `<h1>` tag is enclosed in a `<div>` with the class `product-meta`. We can leverage this structure to create a combined selector `.product-meta h1`. This selector targets any `<h1>` element that is a descendant of an element with the class `product-meta`. ![product title](/img/getting-started/title.jpg 'Finding product title in DevTools.') :::tip Verifying selectors with DevTools Remember that you can press CTRL+F (or CMD+F on Mac) in the **Elements** tab of DevTools to open the search bar where you can quickly search for elements using their selectors. Always verify your scraping process and assumptions using the DevTools. It's faster than changing the crawler code all the time. ::: To get the title, you need to locate it using Playwright with the `.product-meta h1` selector. This selector specifically targets the `<h1>` element you need. If multiple elements match, it will throw an error, which is beneficial as it prevents returning incorrect data silently. Ensuring the accuracy of your selectors is crucial for reliable data extraction. ```python title = await context.page.locator('.product-meta h1').text_content() ``` ## Scraping SKU Using the DevTools, you can find that the product SKU is inside a `<span>` tag with the class `product-meta__sku-number`. Since there is no other `<span>` with that class on the page, you can safely use this selector to extract the SKU. ![product sku selector](/img/getting-started/sku.jpg 'Finding product SKU in DevTools.') ```python # Find the SKU element using the selector and get its text content. 
sku = await context.page.locator('span.product-meta__sku-number').text_content() ``` ## Scraping current price Using DevTools, you can find that the current price is within a `<span>` element tagged with the `price` class. However, it is nested alongside another `<span>` element with the `visually-hidden` class. To avoid extracting the wrong text, you can filter the elements to get the correct one using the `has_text` helper. ![product current price selector](/img/getting-started/current-price.jpg 'Finding product current price in DevTools.') ```python # Locate the price element and filter out the visually hidden elements. price_element = context.page.locator('span.price', has_text='$').first # Extract the text content of the price element. current_price_string = await price_element.text_content() or '' # current_price_string: 'Sale price$1,398.00' # Split the string by the '$' sign to get the numeric part. raw_price = current_price_string.split('$')[1] # raw_price: '1,398.00' # Convert the raw price string to a float after removing commas. price = float(raw_price.replace(',', '')) # price: 1398.00 ``` It might look a little complex at first glance, but let's walk through what you did. First, you locate the correct part of the `price` span by filtering for elements containing the `$` sign. This ensures that you get the actual price element. Once you have the right element, you extract its text content, which gives you a string similar to `Sale price$1,398.00`. To get the numeric value, you split this string by the `$` sign. Next, you remove any commas from the resulting numeric string and convert it to a float, allowing you to work with the price as a number. This process ensures that you accurately extract and convert the current price from the product page. ## Scraping stock availability The final step is to scrape the stock availability information. 
There is a `<span>` with the class `product-form__inventory`, which contains the text `In stock` if the product is available. You can use the `has_text` helper to filter out the correct element. ```python # Locate the element that contains the text 'In stock' and filter out other elements. in_stock_element = context.page.locator( selector='span.product-form__inventory', has_text='In stock', ).first # Check if the element exists by counting the matching elements. in_stock = await in_stock_element.count() > 0 ``` For this, all that matters is whether the element exists or not. You can use the `count()` method to check if any elements match the selector. If there are, it means the product is in stock. ## Trying it out You have everything that is needed, so grab your newly created scraping logic, dump it into your original request handler and see the magic happen! <RunnableCodeBlock className="language-python" language="python"> {ScrapingExample} </RunnableCodeBlock> When you run the crawler, you will see the crawled URLs and their scraped data printed to the console. The output will look something like this: ```json { "url": "https://warehouse-theme-metal.myshopify.com/products/sony-str-za810es-7-2-channel-hi-res-wi-fi-network-av-receiver", "manufacturer": "sony", "title": "Sony STR-ZA810ES 7.2-Ch Hi-Res Wi-Fi Network A/V Receiver", "sku": "SON-692802-STR-DE", "price": 698, "in_stock": true } ``` ## Next steps Next, you'll see how to save the data you scraped to the disk for further processing. 
================================================ FILE: docs/introduction/07_saving_data.mdx ================================================ --- id: saving-data title: Saving data --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import FirstCodeExample from '!!raw-loader!./code_examples/07_first_code.py'; import FinalCodeExample from '!!raw-loader!roa-loader!./code_examples/07_final_code.py'; A data extraction job would not be complete without saving the data for later use and processing. You've come to the final and most difficult part of this tutorial so make sure to pay attention very carefully! ## Save data to the dataset Crawlee provides a <ApiLink to="class/Dataset">`Dataset`</ApiLink> class, which acts as an abstraction over tabular storage, making it useful for storing scraping results. To get started: - Add the necessary imports: Include the <ApiLink to="class/Dataset">`Dataset`</ApiLink> and any required crawler classes at the top of your file. - Create a Dataset instance: Use the asynchronous <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink> constructor to initialize the dataset instance within your crawler's setup. Here's an example: <CodeBlock language="python"> {FirstCodeExample} </CodeBlock> Finally, instead of logging the extracted data to stdout, we can export them to the dataset: ```python # ... @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: # ... data = { 'manufacturer': manufacturer, 'title': title, 'sku': sku, 'price': price, 'in_stock': in_stock, } # Push the data to the dataset. await dataset.push_data(data) # ... ``` ### Using a context helper Instead of importing a new class and manually creating an instance of the dataset, you can use the context helper <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink>. 
Remove the dataset import and instantiation, and replace `dataset.push_data` with the following: ```python # ... @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: # ... data = { 'manufacturer': manufacturer, 'title': title, 'sku': sku, 'price': price, 'in_stock': in_stock, } # Push the data to the dataset. await context.push_data(data) # ... ``` ### Final code And that's it. Unlike earlier, we are being serious now. That's it, you're done. The final code looks like this: <RunnableCodeBlock className="language-python" language="python"> {FinalCodeExample} </RunnableCodeBlock> ## What does `push_data` do? A helper <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink> saves data to the default dataset. You can provide additional arguments there like `id` or `name` to open a different dataset. Dataset is a storage designed to hold data in a format similar to a table. Each time you call <ApiLink to="class/PushDataFunction">`context.push_data`</ApiLink> or <ApiLink to="class/Dataset#push_data">`Dataset.push_data`</ApiLink> directly, a new row in the table is created, with the property names serving as column titles. In the default configuration, the rows are represented as JSON files saved on your file system, but other backend storage systems can be plugged into Crawlee as well. More on that later. :::info Automatic dataset initialization Each time you start Crawlee a default <ApiLink to="class/Dataset">`Dataset`</ApiLink> is automatically created, so there's no need to initialize it or create an instance first. You can create as many datasets as you want and even give them names. For more details see the <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink> function.
::: {/* TODO: mention result storage guide once it's done :::info Automatic dataset initialization Each time you start Crawlee a default <ApiLink to="class/Dataset">`Dataset`</ApiLink> is automatically created, so there's no need to initialize it or create an instance first. You can create as many datasets as you want and even give them names. For more details see the [Result storage guide](../guides/result-storage#dataset) and the `Dataset.open()` function. ::: */} ## Finding saved data Unless you changed the configuration that Crawlee uses locally, which would suggest that you knew what you were doing, and you didn't need this tutorial anyway, you'll find your data in the storage directory that Crawlee creates in the working directory of the running script: ```text {PROJECT_FOLDER}/storage/datasets/default/ ``` The above folder will hold all your saved data in numbered files, as they were pushed into the dataset. Each file represents one invocation of <ApiLink to="class/Dataset#push_data">`Dataset.push_data`</ApiLink> or one table row. {/* TODO: add mention of "Result storage guide" once it's ready: :::tip Single file data storage options If you would like to store your data in a single big file, instead of many small ones, see the [Result storage guide](../guides/result-storage#key-value-store) for Key-value stores. ::: */} ## Next steps Next, you'll see some improvements that you can add to your crawler code that will make it more readable and maintainable in the long run. 
================================================ FILE: docs/introduction/08_refactoring.mdx ================================================ --- id: refactoring title: Refactoring --- import ApiLink from '@site/src/components/ApiLink'; import CodeBlock from '@theme/CodeBlock'; import MainExample from '!!raw-loader!./code_examples/08_main.py'; import RoutesExample from '!!raw-loader!./code_examples/08_routes.py'; It may seem that the data is extracted and the crawler is done, but honestly, this is just the beginning. For the sake of brevity, we've completely omitted error handling, proxies, logging, architecture, tests, documentation and other stuff that reliable software should have. The good thing is, error handling is mostly done by Crawlee itself, so no worries on that front, unless you need some custom magic. :::info Navigating automatic bot-protection avoidance You might be wondering about the **anti-blocking, bot-protection avoiding stealthy features** and why we haven't highlighted them yet. The reason is straightforward: these features are **automatically used** within the default configuration, providing a smooth start without manual adjustments. ::: {/* TODO: add this to the info once the relevant guide is ready However, the default configuration, while powerful, may not cover every scenario. If you want to learn more, browse the [Avoid getting blocked](../guides/avoid-blocking), [Proxy management](../guides/proxy-management) and [Session management](../guides/session-management) guides. */} To promote good coding practices, let's look at how you can use a <ApiLink to="class/Router">`Router`</ApiLink> class to better structure your crawler code. ## Request routing In the following code, we've made several changes: - Split the code into multiple files. - Added a custom instance of <ApiLink to="class/Router">`Router`</ApiLink> to make our routing cleaner, without if clauses. - Moved route definitions to a separate `routes.py` file.
- Simplified the `main.py` file to focus on the general structure of the crawler. ### Routes file First, let's define our routes in a separate file: <CodeBlock className="language-python" title="src/routes.py"> {RoutesExample} </CodeBlock> ### Main file Next, our main file becomes much simpler and cleaner: <CodeBlock className="language-python" title="src/main.py"> {MainExample} </CodeBlock> By structuring your code this way, you achieve better separation of concerns, making the code easier to read, manage and extend. The <ApiLink to="class/Router">`Router`</ApiLink> class keeps your routing logic clean and modular, replacing if clauses with function decorators. ## Summary Refactoring your crawler code with these practices enhances readability, maintainability, and scalability. ### Splitting your code into multiple files There's no reason not to split your code into multiple files and keep your logic separate. Less code in a single file means less complexity to handle at any time, which improves overall readability and maintainability. Consider further splitting the routes into separate files for even better organization. ### Using a router to structure your crawling Initially, using a simple `if` / `else` statement for selecting different logic based on the crawled pages might appear more readable. However, this approach can become cumbersome with more than two types of pages, especially when the logic for each page extends over dozens or even hundreds of lines of code. It's good practice in any programming language to split your logic into bite-sized chunks that are easy to read and reason about. Scrolling through a thousand line long `request_handler()` where everything interacts with everything and variables can be used everywhere is not a beautiful thing to do and a pain to debug. That's why we prefer the separation of routes into their own files. ## Next steps In the next and final step, you'll see how to deploy your Crawlee project to the cloud. 
If you used the CLI to bootstrap your project, you already have a `Dockerfile` ready, and the next section will show you how to deploy it to the [Apify platform](../deployment/apify-platform) with ease. ================================================ FILE: docs/introduction/09_running_in_cloud.mdx ================================================ --- id: deployment title: Running your crawler in the Cloud sidebar_label: Running in the Cloud description: Deploying Crawlee-python projects to the Apify platform --- import CodeBlock from '@theme/CodeBlock'; import MainExample from '!!raw-loader!./code_examples/09_apify_sdk.py'; ## Apify platform Crawlee is developed by [**Apify**](https://apify.com), the web scraping and automation platform. You could say it is the **home of Crawlee projects**. In this section you'll see how to deploy the crawler there with just a few simple steps. You can deploy a **Crawlee** project wherever you want, but using the [**Apify platform**](https://console.apify.com) will give you the best experience. {/*In case you want to deploy your Crawlee project to other platforms, check out the [**Deployment**](../deployment) section.*/} With a few simple steps, you can convert your Crawlee project into a so-called **Actor**. Actors are serverless micro-apps that are easy to develop, run, share, and integrate. The infra, proxies, and storages are ready to go. [Learn more about Actors](https://apify.com/actors). {/*:::info Choosing between Crawlee CLI and Apify CLI for project setup We started this guide by using the Crawlee CLI to bootstrap the project - it offers the basic Crawlee templates, including a ready-made `Dockerfile`. If you know you will be deploying your project to the Apify platform, you might want to start with the Apify CLI instead. It also offers several project templates, and those are all set up to be used on the Apify platform right ahead. 
:::*/} ## Dependencies Before we get started, you'll need to install two new dependencies: - [**Apify SDK**](https://pypi.org/project/apify/), a toolkit for working with the Apify platform. This will allow us to wire the storages (e.g. [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue) and [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)) to the Apify cloud products. The Apify SDK, like Crawlee itself, is available as a PyPI package and can be installed with any Python package manager. To install it using [pip](https://pip.pypa.io/), run: ```sh pip install apify ``` - [**Apify CLI**](https://docs.apify.com/cli/), a command-line tool that will help us with authentication and deployment. It is a [Node.js](https://nodejs.org/) package, and can be installed using any Node.js package manager. In this guide, we will use [npm](https://npmjs.com/). We will install it globally, so you can use it across all your Crawlee and Apify projects. To install it using npm, run: ```sh npm install -g apify-cli ``` ## Logging in to the Apify platform The next step will be [creating your Apify account](https://console.apify.com/sign-up). Don't worry, we have a **free tier**, so you can try things out before you buy in! Once you have that, it's time to log in with the just-installed [Apify CLI](https://docs.apify.com/cli/). You will need your personal access token, which you can find at https://console.apify.com/account#/integrations. ```sh apify login ``` ## Adjusting the code Now that you have your account set up, you will need to adjust the code a tiny bit. We will use the [Apify SDK](https://docs.apify.com/sdk/python/), which will help us to wire the Crawlee storages (like the [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue)) to their Apify platform counterparts - otherwise Crawlee would keep things only in memory. 
Open your `src/main.py` file, and wrap everything in your `main` function with the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager. Your code should look like this: <CodeBlock className="language-python" title="src/main.py"> {MainExample} </CodeBlock> The context manager will configure Crawlee to use the Apify API instead of its default memory storage interface. It also sets up a few other things, like listening to the platform events via websockets. After the body is finished, it handles graceful shutdown. :::info Understanding `async with Actor` behavior with environment variables The [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager works conditionally based on the environment variables, namely based on the `APIFY_IS_AT_HOME` env var, which is set to `true` on the Apify platform. This means that your project will remain working the same locally, but will use the Apify API when deployed to the Apify platform. ::: ## Initializing the project You will also need to initialize the project for Apify. To do that, use the Apify CLI again: ```sh apify init ``` The CLI will check the project structure and guide you through the setup process. If prompted, follow the instructions and answer the questions to configure the project correctly. For more information follow the [Apify CLI documentation](https://docs.apify.com/cli/docs). This will create a folder called `.actor`, and an `actor.json` file inside it - this file contains the configuration relevant to the Apify platform, namely the Actor name, version, build tag, and a few other things. Check out the [relevant documentation](https://docs.apify.com/platform/actors/development/actor-definition/actor-json) to see all the different things you can set up there. ## Ship it! And that's all, your project is now ready to be published on the Apify platform.
You can use the Apify CLI once more to do that: ```sh apify push ``` This command will create an archive from your project, upload it to the Apify platform and initiate a Docker build. Once finished, you will get a link to your new Actor on the platform. ## Learning more about web scraping :::tip Explore Apify Academy Resources If you want to learn more about web scraping and browser automation, check out the [Apify Academy](https://developers.apify.com/academy). It's full of courses and tutorials on the topic. From beginner to advanced. And the best thing: **It's free and open source** ❤️ {/*If you want to do one more project, checkout our tutorial on building a [HackerNews scraper using Crawlee](https://blog.apify.com/crawlee-web-scraping-tutorial/).*/} ::: ## Thank you! 🎉 That's it! Thanks for reading the whole introduction and if there's anything wrong, please 🙏 let us know on [GitHub](https://github.com/apify/crawlee-python) or in our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 👋 ================================================ FILE: docs/introduction/code_examples/02_bs.py ================================================ import asyncio # Add import of crawler and crawling context. from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from crawlee.storages import RequestQueue async def main() -> None: # First you create the request queue instance. rq = await RequestQueue.open() # And then you add one or more requests to it. await rq.add_request('https://crawlee.dev') crawler = BeautifulSoupCrawler(request_manager=rq) # Define a request handler and attach it to the crawler using the decorator. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: # Extract <title> text with BeautifulSoup. # See BeautifulSoup documentation for API docs. 
url = context.request.url title = context.soup.title.string if context.soup.title else '' context.log.info(f'The title of {url} is: {title}.') await crawler.run() if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/02_bs_better.py ================================================ import asyncio # You don't need to import RequestQueue anymore. from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: url = context.request.url title = context.soup.title.string if context.soup.title else '' context.log.info(f'The title of {url} is: {title}.') # Start the crawler with the provided URLs. await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/02_request_queue.py ================================================ import asyncio from crawlee.storages import RequestQueue async def main() -> None: # First you create the request queue instance. rq = await RequestQueue.open() # And then you add one or more requests to it. await rq.add_request('https://crawlee.dev') if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/03_enqueue_strategy.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}.') # See the `EnqueueStrategy` type alias for more strategy options. 
# highlight-next-line await context.enqueue_links( # highlight-next-line strategy='same-domain', # highlight-next-line ) await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/03_finding_new_links.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: # Let's limit our crawls to make our tests shorter and safer. crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: url = context.request.url title = context.soup.title.string if context.soup.title else '' context.log.info(f'The title of {url} is: {title}.') # The enqueue_links function is available as one of the fields of the context. # It is also context aware, so it does not require any parameters. await context.enqueue_links() await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/03_globs.py ================================================ import asyncio from crawlee import Glob from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}.') # Enqueue links that match the 'include' glob pattern and # do not match the 'exclude' glob pattern. 
# highlight-next-line await context.enqueue_links( # highlight-next-line include=[Glob('https://someplace.com/**/cats')], # highlight-next-line exclude=[Glob('https://**/archive/**')], # highlight-next-line ) await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/03_original_code.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: crawler = BeautifulSoupCrawler() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: url = context.request.url title = context.soup.title.string if context.soup.title else '' context.log.info(f'The title of {url} is: {title}.') await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/03_transform_request.py ================================================ from __future__ import annotations import asyncio from crawlee import HttpHeaders, RequestOptions, RequestTransformAction from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext def transform_request( request_options: RequestOptions, ) -> RequestOptions | RequestTransformAction: # Skip requests to PDF files if request_options['url'].endswith('.pdf'): return 'skip' if '/docs' in request_options['url']: # Add custom headers to requests to specific URLs request_options['headers'] = HttpHeaders({'Custom-Header': 'value'}) elif '/blog' in request_options['url']: # Add label for certain URLs request_options['label'] = 'BLOG' else: # Signal that the request should proceed without any transformation return 'unchanged' return request_options async def main() -> None: crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) @crawler.router.default_handler async 
def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}.') # Transform request before enqueueing await context.enqueue_links(transform_request_function=transform_request) @crawler.router.handler('BLOG') async def blog_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Blog Processing {context.request.url}.') await crawler.run(['https://crawlee.dev/']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/04_sanity_check.py ================================================ import asyncio # Instead of BeautifulSoupCrawler let's use Playwright to be able to render JavaScript. from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: # Wait for the collection cards to render on the page. This ensures that # the elements we want to interact with are present in the DOM. await context.page.wait_for_selector('.collection-block-item') # Execute a function within the browser context to target the collection # card elements and extract their text content, trimming any leading or # trailing whitespace. category_texts = await context.page.eval_on_selector_all( '.collection-block-item', '(els) => els.map(el => el.textContent.trim())', ) # Log the extracted texts. 
for i, text in enumerate(category_texts): context.log.info(f'CATEGORY_{i + 1}: {text}') await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/05_crawling_detail.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # We're not processing detail pages yet, so we just pass. if context.request.label == 'DETAIL': pass # We are now on a category page. We can use this to paginate through and # enqueue all products, as well as any subsequent pages we find. elif context.request.label == 'CATEGORY': # Wait for the product items to render. await context.page.wait_for_selector('.product-item > a') # Enqueue links found within elements matching the provided selector. # These links will be added to the crawling queue with the label DETAIL. await context.enqueue_links( selector='.product-item > a', label='DETAIL', ) # Find the "Next" button to paginate through the category pages. next_button = await context.page.query_selector('a.pagination__next') # If a "Next" button is found, enqueue the next page of results. if next_button: await context.enqueue_links( selector='a.pagination__next', label='CATEGORY', ) # This indicates we're on the start page with no specific label. # On the start page, we want to enqueue all the category pages. else: # Wait for the collection cards to render. await context.page.wait_for_selector('.collection-block-item') # Enqueue links found within elements matching the provided selector. # These links will be added to the crawling queue with the label CATEGORY. 
await context.enqueue_links( selector='.collection-block-item', label='CATEGORY', ) await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/05_crawling_listing.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # Wait for the category cards to render on the page. This ensures that # the elements we want to interact with are present in the DOM. await context.page.wait_for_selector('.collection-block-item') # Enqueue links found within elements that match the specified selector. # These links will be added to the crawling queue with the label CATEGORY. await context.enqueue_links( selector='.collection-block-item', label='CATEGORY', ) await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/06_scraping.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler( # Let's limit our crawls to make our tests shorter and safer. max_requests_per_crawl=10, ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # We are now on a product detail page, so we extract the product data. if context.request.label == 'DETAIL': # Split the URL and get the last part to extract the manufacturer.
url_part = context.request.url.split('/').pop() manufacturer = url_part.split('-')[0] # Extract the title using the combined selector. title = await context.page.locator('.product-meta h1').text_content() # Extract the SKU using its selector. sku = await context.page.locator( 'span.product-meta__sku-number' ).text_content() # Locate the price element that contains the '$' sign and filter out # the visually hidden elements. price_element = context.page.locator('span.price', has_text='$').first current_price_string = await price_element.text_content() or '' raw_price = current_price_string.split('$')[1] price = float(raw_price.replace(',', '')) # Locate the element that contains the text 'In stock' # and filter out other elements. in_stock_element = context.page.locator( selector='span.product-form__inventory', has_text='In stock', ).first in_stock = await in_stock_element.count() > 0 # Put it all together in a dictionary. data = { 'manufacturer': manufacturer, 'title': title, 'sku': sku, 'price': price, 'in_stock': in_stock, } # Print the extracted data. context.log.info(data) # We are now on a category page. We can use this to paginate through and # enqueue all products, as well as any subsequent pages we find. elif context.request.label == 'CATEGORY': # Wait for the product items to render. await context.page.wait_for_selector('.product-item > a') # Enqueue links found within elements matching the provided selector. # These links will be added to the crawling queue with the label DETAIL. await context.enqueue_links( selector='.product-item > a', label='DETAIL', ) # Find the "Next" button to paginate through the category pages. next_button = await context.page.query_selector('a.pagination__next') # If a "Next" button is found, enqueue the next page of results. if next_button: await context.enqueue_links( selector='a.pagination__next', label='CATEGORY', ) # This indicates we're on the start page with no specific label. 
# On the start page, we want to enqueue all the category pages. else: # Wait for the collection cards to render. await context.page.wait_for_selector('.collection-block-item') # Enqueue links found within elements matching the provided selector. # These links will be added to the crawling queue with the label CATEGORY. await context.enqueue_links( selector='.collection-block-item', label='CATEGORY', ) await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/07_final_code.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler( # Let's limit our crawls to make our tests shorter and safer. max_requests_per_crawl=10, ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url}') # We are now on a product detail page, so we extract the product data. if context.request.label == 'DETAIL': # Split the URL and get the last part to extract the manufacturer. url_part = context.request.url.split('/').pop() manufacturer = url_part.split('-')[0] # Extract the title using the combined selector. title = await context.page.locator('.product-meta h1').text_content() # Extract the SKU using its selector. sku = await context.page.locator( 'span.product-meta__sku-number' ).text_content() # Locate the price element that contains the '$' sign and filter out # the visually hidden elements. price_element = context.page.locator('span.price', has_text='$').first current_price_string = await price_element.text_content() or '' raw_price = current_price_string.split('$')[1] price = float(raw_price.replace(',', '')) # Locate the element that contains the text 'In stock' and filter out # other elements.
in_stock_element = context.page.locator( selector='span.product-form__inventory', has_text='In stock', ).first in_stock = await in_stock_element.count() > 0 # Put it all together in a dictionary. data = { 'manufacturer': manufacturer, 'title': title, 'sku': sku, 'price': price, 'in_stock': in_stock, } # Push the data to the dataset. await context.push_data(data) # We are now on a category page. We can use this to paginate through and # enqueue all products, as well as any subsequent pages we find. elif context.request.label == 'CATEGORY': # Wait for the product items to render. await context.page.wait_for_selector('.product-item > a') # Enqueue links found within elements matching the provided selector. # These links will be added to the crawling queue with the label DETAIL. await context.enqueue_links( selector='.product-item > a', label='DETAIL', ) # Find the "Next" button to paginate through the category pages. next_button = await context.page.query_selector('a.pagination__next') # If a "Next" button is found, enqueue the next page of results. if next_button: await context.enqueue_links( selector='a.pagination__next', label='CATEGORY', ) # This indicates we're on the start page with no specific label. # On the start page, we want to enqueue all the category pages. else: # Wait for the collection cards to render. await context.page.wait_for_selector('.collection-block-item') # Enqueue links found within elements matching the provided selector. # These links will be added to the crawling queue with the label CATEGORY. 
await context.enqueue_links( selector='.collection-block-item', label='CATEGORY', ) await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/07_first_code.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from crawlee.storages import Dataset # ... async def main() -> None: crawler = PlaywrightCrawler() dataset = await Dataset.open() # ... @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: ... # ... if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/08_main.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler from .routes import router async def main() -> None: crawler = PlaywrightCrawler( # Let's limit our crawls to make our tests shorter and safer. max_requests_per_crawl=10, # Provide our router instance to the crawler. request_handler=router, ) await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/08_routes.py ================================================ from crawlee.crawlers import PlaywrightCrawlingContext from crawlee.router import Router router = Router[PlaywrightCrawlingContext]() @router.default_handler async def default_handler(context: PlaywrightCrawlingContext) -> None: # This is a fallback route which will handle the start URL. 
context.log.info(f'default_handler is processing {context.request.url}') await context.page.wait_for_selector('.collection-block-item') await context.enqueue_links( selector='.collection-block-item', label='CATEGORY', ) @router.handler('CATEGORY') async def category_handler(context: PlaywrightCrawlingContext) -> None: # This replaces the context.request.label == CATEGORY branch of the if clause. context.log.info(f'category_handler is processing {context.request.url}') await context.page.wait_for_selector('.product-item > a') await context.enqueue_links( selector='.product-item > a', label='DETAIL', ) next_button = await context.page.query_selector('a.pagination__next') if next_button: await context.enqueue_links( selector='a.pagination__next', label='CATEGORY', ) @router.handler('DETAIL') async def detail_handler(context: PlaywrightCrawlingContext) -> None: # This replaces the context.request.label == DETAIL branch of the if clause. context.log.info(f'detail_handler is processing {context.request.url}') url_part = context.request.url.split('/').pop() manufacturer = url_part.split('-')[0] title = await context.page.locator('.product-meta h1').text_content() sku = await context.page.locator('span.product-meta__sku-number').text_content() price_element = context.page.locator('span.price', has_text='$').first current_price_string = await price_element.text_content() or '' raw_price = current_price_string.split('$')[1] price = float(raw_price.replace(',', '')) in_stock_element = context.page.locator( selector='span.product-form__inventory', has_text='In stock', ).first in_stock = await in_stock_element.count() > 0 data = { 'manufacturer': manufacturer, 'title': title, 'sku': sku, 'price': price, 'in_stock': in_stock, } await context.push_data(data) ================================================ FILE: docs/introduction/code_examples/09_apify_sdk.py ================================================ import asyncio # highlight-next-line from apify import Actor from 
crawlee.crawlers import PlaywrightCrawler from .routes import router async def main() -> None: # highlight-next-line async with Actor: crawler = PlaywrightCrawler( # Let's limit our crawls to make our tests shorter and safer. max_requests_per_crawl=10, # Provide our router instance to the crawler. request_handler=router, ) await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/introduction/code_examples/__init__.py ================================================ ================================================ FILE: docs/introduction/code_examples/routes.py ================================================ from crawlee.crawlers import PlaywrightCrawlingContext from crawlee.router import Router router = Router[PlaywrightCrawlingContext]() ================================================ FILE: docs/introduction/index.mdx ================================================ --- id: introduction title: Introduction --- import ApiLink from '@site/src/components/ApiLink'; Crawlee covers your crawling and scraping end-to-end and helps you **build reliable scrapers. Fast.** Your crawlers will appear human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it. ## What you will learn The goal of the introduction is to provide a step-by-step guide to the most important features of Crawlee. It will walk you through creating the simplest of crawlers that only prints text to console, all the way up to a full-featured scraper that collects links from a website and extracts data. 
## 🛠 Features Why is Crawlee the preferred choice for web scraping and crawling? ### Why use Crawlee instead of just a random HTTP library with an HTML parser? - Unified interface for **HTTP & headless browser** crawling. - Automatic **parallel crawling** based on available system resources. - Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking). - Automatic **retries** on errors or when you are getting blocked. - Integrated **proxy rotation** and session management. - Configurable **request routing** - direct URLs to the appropriate handlers. - Persistent **queue for URLs** to crawl. - Pluggable **storage** of both tabular data and files. - Robust **error handling**. ### Why to use Crawlee rather than Scrapy? - Crawlee has out-of-the-box support for **headless browser** crawling (Playwright). - Crawlee has a **minimalistic & elegant interface** - Set up your scraper with fewer than 10 lines of code. - Complete **type hint** coverage. - Based on standard **Asyncio**. {/* TODO: ### 👾 HTTP crawling - ... */} {/* TODO: ### 💻 Real browser crawling - ... */} ## Next steps Next, you will install Crawlee and learn how to bootstrap projects with the prepared Crawlee templates. ================================================ FILE: docs/pyproject.toml ================================================ # Line length different from the rest of the code to make sure that the example codes visualised on the generated # documentation webpages are shown without vertical slider to make them more readable. [tool.ruff] # Inherit all from project top configuration file. extend = "../pyproject.toml" # Override just line length line-length = 90 # Maximum possible fit to the doc webpage. Longer lines need slider.
================================================ FILE: docs/quick-start/code_examples/beautifulsoup_crawler_example.py ================================================ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext async def main() -> None: # BeautifulSoupCrawler crawls the web using HTTP requests # and parses HTML using the BeautifulSoup library. crawler = BeautifulSoupCrawler(max_requests_per_crawl=10) # Define a request handler to process each crawled page # and attach it to the crawler using a decorator. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract relevant data from the page context. data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, } # Store the extracted data. await context.push_data(data) # Extract links from the current page and add them to the crawling queue. await context.enqueue_links() # Add first URL to the queue and start the crawl. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/quick-start/code_examples/parsel_crawler_example.py ================================================ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext async def main() -> None: # ParselCrawler crawls the web using HTTP requests # and parses HTML using the Parsel library. crawler = ParselCrawler(max_requests_per_crawl=10) # Define a request handler to process each crawled page # and attach it to the crawler using a decorator. @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract relevant data from the page context. 
data = { 'url': context.request.url, 'title': context.selector.xpath('//title/text()').get(), } # Store the extracted data. await context.push_data(data) # Extract links from the current page and add them to the crawling queue. await context.enqueue_links() # Add first URL to the queue and start the crawl. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/quick-start/code_examples/playwright_crawler_example.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: # PlaywrightCrawler crawls the web using a headless browser # controlled by the Playwright library. crawler = PlaywrightCrawler() # Define a request handler to process each crawled page # and attach it to the crawler using a decorator. @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract relevant data from the page context. data = { 'url': context.request.url, 'title': await context.page.title(), } # Store the extracted data. await context.push_data(data) # Extract links from the current page and add them to the crawling queue. await context.enqueue_links() # Add first URL to the queue and start the crawl. await crawler.run(['https://crawlee.dev']) if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/quick-start/code_examples/playwright_crawler_headful_example.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler async def main() -> None: crawler = PlaywrightCrawler( # Run with a visible browser window. # highlight-next-line headless=False, # Switch to the Firefox browser. browser_type='firefox', ) # ... 
if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: docs/quick-start/index.mdx ================================================ --- id: quick-start title: Quick start --- import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import CodeBlock from '@theme/CodeBlock'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BeautifulsoupCrawlerExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_example.py'; import ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_example.py'; import PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_example.py'; import PlaywrightCrawlerHeadfulExample from '!!raw-loader!./code_examples/playwright_crawler_headful_example.py'; This short tutorial will help you start scraping with Crawlee in just a minute or two. For an in-depth understanding of how Crawlee works, check out the [Introduction](../introduction/index.mdx) section, which provides a comprehensive step-by-step guide to creating your first scraper. ## Choose your crawler Crawlee offers the following main crawler classes: <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>, and <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>. All crawlers share the same interface, providing maximum flexibility when switching between them. :::caution Minimum Python version Crawlee requires Python 3.10 or higher. ::: ### BeautifulSoupCrawler The <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> is a plain HTTP crawler that parses HTML using the well-known [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library. It crawls the web using an HTTP client that mimics a browser. 
This crawler is very fast and efficient but cannot handle JavaScript rendering. ### ParselCrawler The <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> is similar to the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> but uses the [Parsel](https://pypi.org/project/parsel/) library for HTML parsing. Parsel is a lightweight library that provides a CSS selector-based API for extracting data from HTML documents. If you are familiar with the [Scrapy](https://scrapy.org/) framework, you will feel right at home with Parsel. As with the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, the <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> cannot handle JavaScript rendering. ### PlaywrightCrawler The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> uses a headless browser controlled by the [Playwright](https://playwright.dev/) library. It can manage Chromium, Firefox, WebKit, and other browsers. Playwright is the successor to the [Puppeteer](https://pptr.dev/) library and is becoming the de facto standard in headless browser automation. If you need a headless browser, choose Playwright. ## Installation Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal. You can install Crawlee with all features or choose only the ones you need. To install it using the [pip](https://pip.pypa.io/en/stable/) package manager, run the following command: ```sh python -m pip install 'crawlee[all]' ``` Verify that Crawlee is successfully installed: ```sh python -c 'import crawlee; print(crawlee.__version__)' ``` If you plan to use the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, you'll need to install Playwright dependencies, including the browser binaries.
To do this, run the following command: ```sh playwright install ``` For detailed installation instructions, see the [Setting up](../introduction/01_setting_up.mdx) documentation page. ## Crawling Run the following example to perform a recursive crawl of the Crawlee website using the selected crawler. <Tabs groupId="quickStart"> <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler" default> <RunnableCodeBlock className="language-python" language="python"> {BeautifulsoupCrawlerExample} </RunnableCodeBlock> </TabItem> <TabItem value="ParselCrawler" label="ParselCrawler"> <RunnableCodeBlock className="language-python" language="python"> {ParselCrawlerExample} </RunnableCodeBlock> </TabItem> <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler"> <RunnableCodeBlock className="language-python" language="python"> {PlaywrightCrawlerExample} </RunnableCodeBlock> </TabItem> </Tabs> When you run the example, you will see Crawlee automating the data extraction process in your terminal. {/* TODO: improve the logging and add here a sample */} ## Running headful browser By default, browsers controlled by Playwright run in headless mode (without a visible window). However, you can configure the crawler to run in a headful mode, which is useful during the development phase to observe the browser's actions. You can also switch from the default Chromium browser to Firefox or WebKit. <CodeBlock language="python"> {PlaywrightCrawlerHeadfulExample} </CodeBlock> When you run the example code, you'll see an automated browser navigating through the Crawlee website. {/* TODO: add video example */} ## Results By default, Crawlee stores data in the `./storage` directory within your current working directory. The results of your crawl will be saved as JSON files under `./storage/datasets/default/`. 
To view the results, you can use the `cat` command: ```sh cat ./storage/datasets/default/000000001.json ``` The JSON file will contain data similar to the following: ```json { "url": "https://crawlee.dev/", "title": "Crawlee · Build reliable crawlers. Fast. | Crawlee" } ``` :::tip If you want to change the storage directory, you can set the `CRAWLEE_STORAGE_DIR` environment variable to your preferred path. ::: ## Examples and further reading For more examples showcasing various features of Crawlee, visit the [Examples](/docs/examples) section of the documentation. To get a deeper understanding of Crawlee and its components, read the step-by-step [Introduction](../introduction/index.mdx) guide. [//]: # (TODO: add related links once they are ready) ================================================ FILE: docs/upgrading/upgrading_to_v0x.md ================================================ --- id: upgrading-to-v0x title: Upgrading to v0.x --- This page summarizes the breaking changes between Crawlee for Python zero-based versions. ## Upgrading to v0.6 This section summarizes the breaking changes between v0.5.x and v0.6.0. ### HttpCrawlerOptions - Removed `HttpCrawlerOptions` - which contained options from `BasicCrawlerOptions` and unique options `additional_http_error_status_codes` and `ignore_http_error_status_codes`. Both of the unique options were added to `BasicCrawlerOptions` instead. ### HttpClient - The signature of the `HttpClient` class has been updated. The constructor parameters `additional_http_error_status_codes` and `ignore_http_error_status_codes` have been removed and are now only available in `BasicCrawlerOptions`. - The method `_raise_for_error_status_code` has been removed from `HttpClient`. Its logic has been moved to the `BasicCrawler` class. ### SessionCookies - Replaces the `dict` used for cookie storage in `Session.cookies` with a new `SessionCookies` class. `SessionCookies` uses `CookieJar`, which enables support for multiple domains. 
### PlaywrightCrawler and PlaywrightBrowserPlugin - `PlaywrightCrawler` now uses a persistent browser context instead of the standard browser context. - Added `user_data_dir` parameter for `PlaywrightCrawler` and `PlaywrightBrowserPlugin` to specify the directory for the persistent context. If not provided, a temporary directory will be created automatically. ### Configuration The `Configuration` fields `chrome_executable_path`, `xvfb`, and `verbose_log` have been removed. The `chrome_executable_path` and `xvfb` fields were unused, while `verbose_log` can be replaced by setting `log_level` to `DEBUG`. ### CLI dependencies CLI dependencies have been moved to optional dependencies. If you need the CLI, install `crawlee[cli]`. ### Abstract base classes We decided to move away from [Hungarian notation](https://en.wikipedia.org/wiki/Hungarian_notation) and remove all the `Base` prefixes from the abstract classes. This includes the following public classes: - `BaseStorageClient` -> `StorageClient` - `BaseBrowserController` -> `BrowserController` - `BaseBrowserPlugin` -> `BrowserPlugin` ### EnqueueStrategy The `EnqueueStrategy` has been changed from an enum to a string literal type. All its values and their meaning remain unchanged. ## Upgrading to v0.5 This section summarizes the breaking changes between v0.4.x and v0.5.0. ### Crawlers & CrawlingContexts - All crawler and crawling context classes have been consolidated into a single sub-package called `crawlers`. - The affected classes include: `AbstractHttpCrawler`, `AbstractHttpParser`, `BasicCrawler`, `BasicCrawlerOptions`, `BasicCrawlingContext`, `BeautifulSoupCrawler`, `BeautifulSoupCrawlingContext`, `BeautifulSoupParserType`, `ContextPipeline`, `HttpCrawler`, `HttpCrawlerOptions`, `HttpCrawlingContext`, `HttpCrawlingResult`, `ParsedHttpCrawlingContext`, `ParselCrawler`, `ParselCrawlingContext`, `PlaywrightCrawler`, `PlaywrightCrawlingContext`, `PlaywrightPreNavCrawlingContext`.
Example update: ```diff - from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext ``` ### Storage clients - All storage client classes have been moved into a single sub-package called `storage_clients`. - The affected classes include: `MemoryStorageClient`, `BaseStorageClient`. Example update: ```diff - from crawlee.memory_storage_client import MemoryStorageClient + from crawlee.storage_clients import MemoryStorageClient ``` ### CurlImpersonateHttpClient - The `CurlImpersonateHttpClient` changed its import location. Example update: ```diff - from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient + from crawlee.http_clients import CurlImpersonateHttpClient ``` ### BeautifulSoupParser - Renamed `BeautifulSoupParser` to `BeautifulSoupParserType`. Probably used only in type hints. Please replace previous usages of `BeautifulSoupParser` by `BeautifulSoupParserType`. - `BeautifulSoupParser` is now a new class that is used in refactored class `BeautifulSoupCrawler`. ### Service locator - The `crawlee.service_container` was completely refactored and renamed to `crawlee.service_locator`. - You can use it to set the configuration, event manager or storage client globally. Or you can pass them to your crawler instance directly and it will use the service locator under the hood. ### Statistics - The `crawlee.statistics.Statistics` class does not accept an event manager as an input argument anymore. It uses the default, global one. - If you want to set your custom event manager, do it either via the service locator or pass it to the crawler. ### Request - The properties `json_` and `order_no` were removed. They were there only for the internal purpose of the memory storage client, you should not need them.
### Request storages and loaders - The `request_provider` parameter of `BasicCrawler.__init__` has been renamed to `request_manager` - The `BasicCrawler.get_request_provider` method has been renamed to `BasicCrawler.get_request_manager` and it does not accept the `id` and `name` arguments anymore - If using a specific request queue is desired, pass it as the `request_manager` on `BasicCrawler` creation - The `RequestProvider` interface has been renamed to `RequestManager` and moved to the `crawlee.request_loaders` package - `RequestList` has been moved to the `crawlee.request_loaders` package - `RequestList` does not support `.drop()`, `.reclaim_request()`, `.add_request()` and `add_requests_batched()` anymore - It implements the new `RequestLoader` interface instead of `RequestManager` - `RequestManagerTandem` with a `RequestQueue` should be used to enable passing a `RequestList` (or any other `RequestLoader` implementation) as a `request_manager`, `await list.to_tandem()` can be used as a shortcut ### PlaywrightCrawler - The `PlaywrightPreNavigationContext` was renamed to `PlaywrightPreNavCrawlingContext`. - The input arguments in `PlaywrightCrawler.__init__` have been renamed: - `browser_options` is now `browser_launch_options`, - `page_options` is now `browser_new_context_options`. - These argument renaming changes have also been applied to `BrowserPool`, `PlaywrightBrowserPlugin`, and `PlaywrightBrowserController`. ## Upgrading to v0.4 This section summarizes the breaking changes between v0.3.x and v0.4.0. ### Request model - The `Request.query_params` field has been removed. Please add query parameters directly to the URL, which was possible before as well, and is now the only supported approach. - The `Request.payload` and `Request.data` fields have been consolidated. Now, only `Request.payload` remains, and it should be used for all payload data in requests. 
### Extended unique key computation - The computation of `extended_unique_key` now includes HTTP headers. While this change impacts the behavior, the interface remains the same. ## Upgrading to v0.3 This section summarizes the breaking changes between v0.2.x and v0.3.0. ### Public and private interface declaration In previous versions, the majority of the package was fully public, including many elements intended for internal use only. With the release of v0.3, we have clearly defined the public and private interface of the package. As a result, some imports have been updated (see below). If you are importing something now designated as private, we recommend reconsidering its use or discussing your use case with us in the discussions/issues. Here is a list of the updated public imports: ```diff - from crawlee.enqueue_strategy import EnqueueStrategy + from crawlee import EnqueueStrategy ``` ```diff - from crawlee.models import Request + from crawlee import Request ``` ```diff - from crawlee.basic_crawler import Router + from crawlee.router import Router ``` ### Request queue There were internal changes that should not affect the intended usage: - The unused `BaseRequestQueueClient.list_requests()` method was removed - `RequestQueue` internals were updated to match the "Request Queue V2" implementation in Crawlee for JS ### Service container A new module, `crawlee.service_container`, was added to allow management of "global instances" - currently it contains `Configuration`, `EventManager` and `BaseStorageClient`. The module also replaces the `StorageClientManager` static class. It is likely that its interface will change in the future. If your use case requires working with it, please get in touch - we'll be glad to hear any feedback. 
================================================ FILE: docs/upgrading/upgrading_to_v1.md ================================================ --- id: upgrading-to-v1 title: Upgrading to v1 --- This page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0. ## Terminology change: "browser" in different contexts The word "browser" is now used distinctly in two contexts: - **Playwright context** - Refers to Playwright-supported browsers (`chromium`, `firefox`, `webkit`, `edge`). - **Fingerprinting context** - Refers to browsers supported by fingerprint generation (`chrome`, `firefox`, `safari`, `edge`). The type of `HeaderGeneratorOptions.browsers` has changed accordingly: **Before (v0.6):** ```python from crawlee.fingerprint_suite import HeaderGeneratorOptions HeaderGeneratorOptions(browsers=['chromium']) HeaderGeneratorOptions(browsers=['webkit']) ``` **Now (v1.0):** ```python from crawlee.fingerprint_suite import HeaderGeneratorOptions HeaderGeneratorOptions(browsers=['chrome']) HeaderGeneratorOptions(browsers=['safari']) ``` ## New default HTTP client Crawlee v1.0 now uses `ImpitHttpClient` (based on [impit](https://apify.github.io/impit/) library) as the **default HTTP client**, replacing `HttpxHttpClient` (based on [httpx](https://www.python-httpx.org/) library). If you want to keep using `HttpxHttpClient`, install Crawlee with `httpx` extra, e.g. using pip: ```bash pip install 'crawlee[httpx]' ``` And then provide the HTTP client explicitly to the crawler: ```python from crawlee.crawlers import HttpCrawler from crawlee.http_clients import HttpxHttpClient client = HttpxHttpClient() crawler = HttpCrawler(http_client=client) ``` See the [HTTP clients guide](https://crawlee.dev/python/docs/guides/http-clients) for all options. ## Changes in storages In Crawlee v1.0, the `Dataset`, `KeyValueStore`, and `RequestQueue` storage APIs have been updated for consistency and simplicity. 
Below is a detailed overview of what's new, what's changed, and what's been removed. See the [Storages guide](https://crawlee.dev/python/docs/guides/storages) for more details. ### Dataset The `Dataset` API now includes several new methods, such as: - `get_metadata` - retrieves metadata information for the dataset. - `purge` - completely clears the dataset, including all items (keeps the metadata only). - `list_items` - returns the dataset's items in a list format. Some older methods have been removed or replaced: - `from_storage_object` constructor has been removed. You should now use the `open` method with either a `name` or `id` parameter. - `get_info` method and the `storage_object` property have been replaced by the new `get_metadata` method. - `set_metadata` method has been removed. - `write_to_json` and `write_to_csv` methods have been removed; instead, use the `export_to` method for exporting data in different formats. ### Key-value store The `KeyValueStore` API now includes several new methods, such as: - `get_metadata` - retrieves metadata information for the key-value store. - `purge` - completely clears the key-value store, removing all keys and values (keeps the metadata only). - `delete_value` - deletes a specific key and its associated value. - `list_keys` - lists all keys in the key-value store. Some older methods have been removed or replaced: - `from_storage_object` - removed; use the `open` method with either a `name` or `id` instead. - `get_info` and `storage_object` - replaced by the new `get_metadata` method. - `set_metadata` method has been removed. ### Request queue The `RequestQueue` API now includes several new methods, such as: - `get_metadata` - retrieves metadata information for the request queue. - `purge` - completely clears the request queue, including all pending and processed requests (keeps the metadata only). 
- `add_requests` - replaces the previous `add_requests_batched` method, offering the same functionality under a simpler name. Some older methods have been removed or replaced: - `from_storage_object` - removed; use the `open` method with either a `name` or `id` instead. - `get_info` and `storage_object` - replaced by the new `get_metadata` method. - `get_request` has argument `unique_key` instead of `request_id` as the `id` field was removed from the `Request`. - `set_metadata` method has been removed. Some changes in the related model classes: - `resource_directory` in `RequestQueueMetadata` - removed; use the corresponding `path_to_*` property instead. - `stats` field in `RequestQueueMetadata` - removed as it was unused. - `RequestQueueHead` - replaced by `RequestQueueHeadWithLocks`. ## New architecture of storage clients In v1.0, the storage client system has been completely reworked to simplify implementation and make custom storage clients easier to write. See the [Storage clients guide](https://crawlee.dev/python/docs/guides/storage-clients) for more details. ### New dedicated storage clients Previously, `MemoryStorageClient` handled both in-memory storage and optional file system persistence. This has now been split into two distinct storage clients: - **`MemoryStorageClient`** - Stores all data in memory only. - **`FileSystemStorageClient`** - Persists data on the file system, with in-memory caching for better performance. 
**Before (v0.6):** ```python from crawlee.configuration import Configuration from crawlee.storage_clients import MemoryStorageClient # In-memory only configuration = Configuration(persist_storage=False) storage_client = MemoryStorageClient.from_config(configuration) # File-system persistence configuration = Configuration(persist_storage=True) storage_client = MemoryStorageClient.from_config(configuration) ``` **Now (v1.0):** ```python from crawlee.storage_clients import MemoryStorageClient, FileSystemStorageClient # In-memory only storage_client = MemoryStorageClient() # File-system persistence storage_client = FileSystemStorageClient() ``` ### Registering a storage client The way you register a storage client remains unchanged: ```python from crawlee import service_locator from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import MemoryStorageClient from crawlee.storages import Dataset # Create custom storage client storage_client = MemoryStorageClient() # Then register it globally service_locator.set_storage_client(storage_client) # Or use it for a single crawler only crawler = ParselCrawler(storage_client=storage_client) # Or use it for a single storage only dataset = await Dataset.open( name='my-dataset', storage_client=storage_client, ) ``` ### Instance caching `Dataset.open`, `KeyValueStore.open`, and `RequestQueue.open` now cache instances and return the same instance for the same arguments. Direct calls to `StorageClient.open_*` always return new instances. ### Writing custom storage clients The interface for custom storage clients has been simplified: - One storage client per storage type (`RequestQueue`, `KeyValueStore`, `Dataset`). - Collection storage clients have been removed. - The number of methods that have to be implemented has been reduced. ## ServiceLocator changes ### ServiceLocator is stricter with registering services You can register the services just once, and you can no longer override already registered services.
**Before (v0.6):** ```python from crawlee import service_locator from crawlee.storage_clients import MemoryStorageClient service_locator.set_storage_client(MemoryStorageClient()) service_locator.set_storage_client(MemoryStorageClient()) ``` **Now (v1.0):** ```python from crawlee import service_locator from crawlee.storage_clients import MemoryStorageClient service_locator.set_storage_client(MemoryStorageClient()) service_locator.set_storage_client(MemoryStorageClient()) # Raises an error ``` ### BasicCrawler has its own instance of ServiceLocator to track its own services Services explicitly passed to the crawler can be different from the global ones accessible in `crawlee.service_locator`. `BasicCrawler` no longer causes the global services in `service_locator` to be set to the crawler's explicitly passed services. **Before (v0.6):** ```python from crawlee import service_locator from crawlee.crawlers import BasicCrawler from crawlee.storage_clients import MemoryStorageClient from crawlee.storages import Dataset async def main() -> None: custom_storage_client = MemoryStorageClient() crawler = BasicCrawler(storage_client=custom_storage_client) assert service_locator.get_storage_client() is custom_storage_client assert await crawler.get_dataset() is await Dataset.open() ``` **Now (v1.0):** ```python from crawlee import service_locator from crawlee.crawlers import BasicCrawler from crawlee.storage_clients import MemoryStorageClient from crawlee.storages import Dataset async def main() -> None: custom_storage_client = MemoryStorageClient() crawler = BasicCrawler(storage_client=custom_storage_client) assert service_locator.get_storage_client() is not custom_storage_client assert await crawler.get_dataset() is not await Dataset.open() ``` This allows two crawlers with different services to run at the same time.
**Now (v1.0):** ```python from crawlee.crawlers import BasicCrawler from crawlee.storage_clients import MemoryStorageClient, FileSystemStorageClient from crawlee.configuration import Configuration from crawlee.events import LocalEventManager custom_configuration_1 = Configuration() custom_event_manager_1 = LocalEventManager.from_config(custom_configuration_1) custom_storage_client_1 = MemoryStorageClient() custom_configuration_2 = Configuration() custom_event_manager_2 = LocalEventManager.from_config(custom_configuration_2) custom_storage_client_2 = FileSystemStorageClient() crawler_1 = BasicCrawler( configuration=custom_configuration_1, event_manager=custom_event_manager_1, storage_client=custom_storage_client_1, ) crawler_2 = BasicCrawler( configuration=custom_configuration_2, event_manager=custom_event_manager_2, storage_client=custom_storage_client_2, ) # use crawlers without runtime crash... ``` ## Other smaller updates There are more smaller updates. ### Python version support We drop support for Python 3.9. The minimum supported version is now Python 3.10. ### Changes in Configuration The fields `persist_storage` and `persist_metadata` have been removed from the `Configuration`. Persistence is now determined only by which storage client class you use. ### Changes in Request `Request` objects no longer have `id` field and all its usages have been transferred to `unique_key` field. ### Changes in HttpResponse The method `HttpResponse.read` is now asynchronous. This affects all HTTP-based crawlers. **Before (v0.6):** ```python from crawlee.crawlers import ParselCrawler, ParselCrawlingContext async def main() -> None: crawler = ParselCrawler() @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: # highlight-next-line content = context.http_response.read() # ... 
await crawler.run(['https://crawlee.dev/']) ``` **Now (v1.0):** ```python from crawlee.crawlers import ParselCrawler, ParselCrawlingContext async def main() -> None: crawler = ParselCrawler() @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: # highlight-next-line content = await context.http_response.read() # ... await crawler.run(['https://crawlee.dev/']) ``` ### New storage naming restrictions We've introduced naming restrictions for storages to ensure compatibility with Apify Platform requirements and prevent potential conflicts. Storage names may include only letters (a–z, A–Z), digits (0–9), and hyphens (-), with hyphens allowed only in the middle of the name (for example, my-storage-1). ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["hatchling"] build-backend = "hatchling.build" [project] name = "crawlee" version = "1.6.0" description = "Crawlee for Python" authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }] license = { file = "LICENSE" } readme = "README.md" requires-python = ">=3.10" classifiers = [ "Development Status :: 5 - Production/Stable", "Environment :: Console", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", "Topic :: Software Development :: Libraries", ] keywords = [ "apify", "automation", "chrome", "crawlee", "crawler", "headless", "scraper", "scraping", ] dependencies = [ "async-timeout>=5.0.1", "cachetools>=5.5.0", "colorama>=0.4.0", "impit>=0.8.0", "more-itertools>=10.2.0", "protego>=0.5.0", "psutil>=6.0.0", "pydantic-settings>=2.12.0", "pydantic>=2.11.0", "pyee>=9.0.0", 
"tldextract>=5.1.0", "typing-extensions>=4.1.0", "yarl>=1.18.0", ] [project.optional-dependencies] all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,redis]"] adaptive-crawler = [ "jaro-winkler>=2.0.3", "playwright>=1.27.0", "scikit-learn>=1.6.0", "apify_fingerprint_datapoints>=0.0.3", "browserforge>=1.2.4" ] beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"] cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"] curl-impersonate = ["curl-cffi>=0.9.0"] httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"] parsel = ["parsel>=1.10.0"] playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"] otel = [ "opentelemetry-api>=1.34.1", "opentelemetry-distro[otlp]>=0.54", "opentelemetry-instrumentation>=0.54", "opentelemetry-instrumentation-httpx>=0.54", "opentelemetry-sdk>=1.34.1", "opentelemetry-semantic-conventions>=0.54", "wrapt>=1.17.0", ] sql_postgres = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", "asyncpg>=0.24.0" ] sql_sqlite = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", "aiosqlite>=0.21.0", ] sql_mysql = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", "aiomysql>=0.3.2", "cryptography>=46.0.5", ] redis = ["redis[hiredis] >= 7.0.0"] [project.scripts] crawlee = "crawlee._cli:cli" [project.urls] "Apify Homepage" = "https://apify.com" "Changelog" = "https://crawlee.dev/python/docs/changelog" "Discord" = "https://discord.com/invite/jyEM2PRvMU" "Documentation" = "https://crawlee.dev/python/docs/quick-start" "Homepage" = "https://crawlee.dev/python" "Issue Tracker" = "https://github.com/apify/crawlee-python/issues" "Release Notes" = "https://crawlee.dev/python/docs/upgrading" "Source Code" = "https://github.com/apify/crawlee-python" [dependency-groups] dev = [ # TODO: Remove this constraint once pydoc-markdown updates its dependencies. 
# Package pydoc-markdown is unmaintained and pins old docspec-python with vulnerable black. # See https://github.com/apify/apify-client-python/pull/582/ for more details. # We explicitly constrain black>=24.3.0 to override the transitive dependency. "black>=24.3.0", "anyio<5.0.0", "apify_client", # For e2e tests. "build<2.0.0", # For e2e tests. "dycw-pytest-only<3.0.0", "fakeredis[probabilistic,json,lua]<3.0.0", "poethepoet<1.0.0", "pre-commit<5.0.0", "proxy-py<3.0.0", "pydoc-markdown<5.0.0", "pytest-asyncio<2.0.0", "pytest-cov<8.0.0", "pytest-rerunfailures<17.0.0", "pytest-timeout<3.0.0", "pytest-xdist<4.0.0", "pytest<10.0.0", "ruff~=0.15.0", "setuptools", # setuptools are used by pytest, but not explicitly required "ty~=0.0.0", "types-beautifulsoup4<5.0.0", "types-cachetools<7.0.0", "types-colorama<1.0.0", "types-psutil<8.0.0", "types-python-dateutil<3.0.0", "uvicorn[standard]<1.0.0", ] [tool.hatch.build.targets.wheel] packages = ["src/crawlee"] [tool.ruff] line-length = 120 include = ["src/**/*.py", "tests/**/*.py", "docs/**/*.py", "website/**/*.py"] extend-exclude = ["src/crawlee/project_template"] [tool.ruff.lint] select = ["ALL"] ignore = [ "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in {filename} "ASYNC109", # Async function definition with a `timeout` parameter "BLE001", # Do not catch blind exception "C901", # `{name}` is too complex "COM812", # This rule may cause conflicts when used with the formatter "D100", # Missing docstring in public module "D104", # Missing docstring in public package "D107", # Missing docstring in `__init__` "D203", # One blank line required before class docstring "D213", # Multi-line docstring summary should start at the second line "D413", # Missing blank line after last section "EM", # flake8-errmsg "G004", # Logging statement uses f-string "ISC001", # This rule may cause conflicts when used with the formatter "FIX", # flake8-fixme "PLR0911", # Too many return statements "PLR0912", # Too many branches 
"PLR0913", # Too many arguments in function definition "PLR0915", # Too many statements "PYI034", # `__aenter__` methods in classes like `{name}` usually return `self` at runtime "PYI036", # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None` "S102", # Use of `exec` detected "S105", # Possible hardcoded password assigned to "S106", # Possible hardcoded password assigned to argument: "{name}" "S301", # `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue "S303", # Use of insecure MD2, MD4, MD5, or SHA1 hash function "S311", # Standard pseudo-random generators are not suitable for cryptographic purposes "TD002", # Missing author in TODO; try: `# TODO(<author_name>): ...` or `# TODO @<author_name>: ... "TRY003", # Avoid specifying long messages outside the exception class ] [tool.ruff.format] quote-style = "single" indent-style = "space" [tool.ruff.lint.per-file-ignores] "**/__init__.py" = [ "F401", # Unused imports ] "**/{tests}/*" = [ "ASYNC230", # Async functions should not open files with blocking methods like `open` "D", # Everything from the pydocstyle "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py "PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable "S101", # Use of assert detected "SLF001", # Private member accessed: `{name}` "T20", # flake8-print "TRY301", # Abstract `raise` to an inner function ] "**/{docs,website}/**" = [ "D", # Everything from the pydocstyle "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py "F841", # Local variable {variable} is assigned to but never used "N999", # Invalid module name "T201", # `print` found ] "**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [ "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code. 
] "**/docs/guides/code_examples/running_in_web_server/server.py" = [ "TC002", # ruff false positive. Import actually needed during runtime. ] "**/docs/guides/code_examples/creating_web_archive/*.*" = [ "ASYNC230", # Ignore for simplicity of the example. ] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" inline-quotes = "single" [tool.ruff.lint.flake8-type-checking] runtime-evaluated-base-classes = [ "pydantic.BaseModel", "pydantic_settings.BaseSettings", ] [tool.ruff.lint.flake8-builtins] builtins-ignorelist = ["id"] [tool.ruff.lint.isort] known-first-party = ["crawlee"] [tool.pytest.ini_options] addopts = "-r a --verbose" asyncio_default_fixture_loop_scope = "function" asyncio_mode = "auto" timeout = 1800 markers = [ "run_alone: marks tests that must run in isolation", ] # Ignore DeprecationWarnings coming from Uvicorn's internal imports. Uvicorn relies on deprecated # modules from `websockets`, which triggers warnings during tests. These are safe to ignore until # Uvicorn updates its internals. 
filterwarnings = [ "ignore:websockets.legacy is deprecated:DeprecationWarning", "ignore:websockets.server.WebSocketServerProtocol is deprecated:DeprecationWarning", ] [tool.ty.environment] python-version = "3.10" [tool.ty.src] include = ["src", "tests", "scripts", "docs", "website"] exclude = [ "src/crawlee/project_template", "docs/guides/code_examples/storage_clients/custom_storage_client_example.py", ] [[tool.ty.overrides]] include = [ "docs/**/*.py", "website/**/*.py", ] [tool.ty.overrides.rules] unresolved-import = "ignore" [tool.coverage.report] exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:", "assert_never()"] [tool.ipdb] context = 7 # Run tasks with: uv run poe <task> [tool.poe.tasks] clean = "rm -rf .coverage .pytest_cache .ruff_cache .ty_cache .uv-cache build coverage-unit.xml dist htmlcov website/.docusaurus website/.yarn website/module_shortcuts.json website/node_modules " install-sync = "uv sync --all-extras" build = "uv build --verbose" publish-to-pypi = "uv publish --verbose --token ${APIFY_PYPI_TOKEN_CRAWLEE}" type-check = "uv run ty check" check-code = ["lint", "type-check", "unit-tests"] [tool.poe.tasks.install-dev] shell = "uv sync --all-extras && uv run pre-commit install && uv run playwright install" [tool.poe.tasks.lint] shell = "uv run ruff format --check && uv run ruff check" [tool.poe.tasks.format] shell = "uv run ruff check --fix && uv run ruff format" [tool.poe.tasks.unit-tests] shell = """ uv run pytest \ --numprocesses=1 \ -m "run_alone" \ tests/unit && \ uv run pytest \ --numprocesses=${TESTS_CONCURRENCY:-auto} \ -m "not run_alone" \ tests/unit """ [tool.poe.tasks.unit-tests-cov] shell = """ uv run pytest \ --numprocesses=1 \ -m "run_alone" \ --cov=src/crawlee \ --cov-report=xml:coverage-unit.xml \ tests/unit && \ uv run pytest \ --numprocesses=${TESTS_CONCURRENCY:-auto} \ -m "not run_alone" \ --cov=src/crawlee \ --cov-report=xml:coverage-unit.xml \ --cov-append \ tests/unit """ [tool.poe.tasks.e2e-templates-tests] cmd = """ uv 
run pytest \ --numprocesses=${TESTS_CONCURRENCY:-auto} \ tests/e2e/project_template """ [tool.poe.tasks.build-docs] shell = "./build_api_reference.sh && corepack enable && yarn && yarn build" cwd = "website" [tool.poe.tasks.run-docs] shell = "./build_api_reference.sh && corepack enable && yarn && yarn start" cwd = "website" ================================================ FILE: renovate.json ================================================ { "extends": ["config:base", ":semanticCommitTypeAll(chore)"], "ignorePaths": ["docs/**", "src/crawlee/project_template/**"], "pinVersions": false, "separateMajorMinor": false, "dependencyDashboard": false, "semanticCommits": "enabled", "lockFileMaintenance": { "enabled": true, "automerge": true, "automergeType": "branch" }, "packageRules": [ { "matchPaths": ["pyproject.toml"], "matchDepTypes": ["devDependencies"], "matchUpdateTypes": ["major", "minor"], "groupName": "major/minor dev dependencies", "groupSlug": "dev-dependencies", "automerge": true, "automergeType": "branch" } ], "schedule": ["before 7am every weekday"], "ignoreDeps": ["crawlee", "docusaurus-plugin-typedoc-api"] } ================================================ FILE: src/crawlee/__init__.py ================================================ from importlib import metadata from ._request import Request, RequestOptions, RequestState from ._service_locator import service_locator from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason from ._utils.globs import Glob __version__ = metadata.version('crawlee') __all__ = [ 'ConcurrencySettings', 'EnqueueStrategy', 'Glob', 'HttpHeaders', 'Request', 'RequestOptions', 'RequestState', 'RequestTransformAction', 'SkippedReason', 'service_locator', ] ================================================ FILE: src/crawlee/_autoscaling/__init__.py ================================================ from .autoscaled_pool import AutoscaledPool from .snapshotter import Snapshotter from 
.system_status import SystemStatus __all__ = ['AutoscaledPool', 'Snapshotter', 'SystemStatus'] ================================================ FILE: src/crawlee/_autoscaling/_types.py ================================================ from __future__ import annotations from dataclasses import dataclass, field from datetime import datetime, timedelta, timezone from typing import TYPE_CHECKING, Annotated from pydantic import Field from pydantic.dataclasses import dataclass as pydantic_dataclass if TYPE_CHECKING: from crawlee._utils.byte_size import ByteSize SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD = 0.97 @dataclass class LoadRatioInfo: """Represent the load ratio of a resource.""" limit_ratio: float """The maximum ratio of overloaded and non-overloaded samples. If the actual ratio exceeds this value, the resource is considered as overloaded.""" actual_ratio: float """The actual ratio of overloaded and non-overloaded samples.""" @property def is_overloaded(self) -> bool: """Indicate whether the resource is currently overloaded, i.e. `actual_ratio` is strictly greater than `limit_ratio`.""" return self.actual_ratio > self.limit_ratio @dataclass class SystemInfo: """Represent the current status of the system.""" cpu_info: LoadRatioInfo """The CPU load ratio.""" memory_info: LoadRatioInfo """The memory load ratio.""" event_loop_info: LoadRatioInfo """The event loop load ratio.""" client_info: LoadRatioInfo """The client load ratio.""" created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) """The time at which the system load information was measured.""" @property def is_system_idle(self) -> bool: """Indicate whether the system is currently idle, i.e. none of the CPU, memory, event loop and client resources is overloaded.""" return ( not self.cpu_info.is_overloaded and not self.memory_info.is_overloaded and not self.event_loop_info.is_overloaded and not self.client_info.is_overloaded ) def __str__(self) -> str: """Return the actual load ratios of all four resources as `name = ratio` pairs joined by `'; '`.""" stats = { 'cpu': self.cpu_info.actual_ratio, 'mem': self.memory_info.actual_ratio, 'event_loop':
self.event_loop_info.actual_ratio, 'client_info': self.client_info.actual_ratio, } return '; '.join(f'{name} = {ratio}' for name, ratio in stats.items()) @dataclass class CpuSnapshot: """A snapshot of CPU usage.""" used_ratio: float """The ratio of CPU currently in use.""" max_used_ratio: float """The maximum ratio of CPU that is considered acceptable.""" created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) """The time at which the system load information was measured.""" @property def is_overloaded(self) -> bool: """Indicate whether the CPU is considered as overloaded, i.e. `used_ratio` is strictly greater than `max_used_ratio`.""" return self.used_ratio > self.max_used_ratio @dataclass class MemorySnapshot: """A snapshot of memory usage.""" current_size: ByteSize """Memory usage of the current Python process and its children.""" system_wide_used_size: ByteSize | None """Memory usage of all processes, system-wide.""" max_memory_size: ByteSize """The maximum memory that can be used by `AutoscaledPool`.""" system_wide_memory_size: ByteSize | None """Total memory available in the whole system.""" max_used_memory_ratio: float """The maximum acceptable ratio of `current_size` to `max_memory_size`.""" created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) """The time at which the system load information was measured.""" @property def is_overloaded(self) -> bool: """Indicate whether the memory is considered as overloaded. When both system-wide sizes are available and their ratio is above `SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD`, the memory is overloaded regardless of the process-level figures. Otherwise the result is whether `current_size / max_memory_size` is strictly greater than `max_used_memory_ratio`.""" if self.system_wide_memory_size is not None and self.system_wide_used_size is not None: system_wide_utilization = self.system_wide_used_size / self.system_wide_memory_size if system_wide_utilization > SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD: return True return (self.current_size / self.max_memory_size) > self.max_used_memory_ratio @dataclass class EventLoopSnapshot: """Snapshot of the state of the event loop.""" delay: timedelta """The current delay of the event loop.""" max_delay: timedelta """The maximum delay that is considered acceptable.""" created_at:
datetime = field(default_factory=lambda: datetime.now(timezone.utc)) """The time at which the system load information was measured.""" @property def max_delay_exceeded(self) -> timedelta: """The amount of time by which the delay exceeds the maximum delay, or a zero `timedelta` when it does not exceed it.""" return max(self.delay - self.max_delay, timedelta(seconds=0)) @property def is_overloaded(self) -> bool: """Indicate whether the event loop is considered as overloaded, i.e. `delay` is strictly greater than `max_delay`.""" return self.delay > self.max_delay @dataclass class ClientSnapshot: """Snapshot of the state of the client.""" error_count: int """The number of errors (HTTP 429) that occurred.""" new_error_count: int """The number of new errors (HTTP 429) that occurred since the last snapshot.""" max_error_count: int """The maximum number of errors that is considered acceptable.""" created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) """The time at which the system load information was measured.""" @property def is_overloaded(self) -> bool: """Indicate whether the client is considered as overloaded, i.e. `new_error_count` is strictly greater than `max_error_count`.""" return self.new_error_count > self.max_error_count Snapshot = MemorySnapshot | CpuSnapshot | EventLoopSnapshot | ClientSnapshot @pydantic_dataclass class Ratio: """Represent a ratio as a float that is validated to be greater than 0.0 and at most 1.0.""" value: Annotated[float, Field(gt=0.0, le=1.0)] ================================================ FILE: src/crawlee/_autoscaling/autoscaled_pool.py ================================================ # Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/autoscaling/autoscaled_pool.ts from __future__ import annotations import asyncio import math from contextlib import suppress from datetime import timedelta from logging import getLogger from typing import TYPE_CHECKING from crawlee._types import ConcurrencySettings from crawlee._utils.docs import docs_group from crawlee._utils.recurring_task import RecurringTask if TYPE_CHECKING: from collections.abc import Awaitable, Callable from crawlee._autoscaling import SystemStatus
logger = getLogger(__name__) class AbortError(Exception): """Raised when an AutoscaledPool run is aborted. Not for direct use.""" class _AutoscaledPoolRun: def __init__(self) -> None: self.worker_tasks = list[asyncio.Task]() """A list of worker tasks currently in progress""" self.worker_tasks_updated = asyncio.Event() self.cleanup_done = asyncio.Event() self.result: asyncio.Future = asyncio.Future() @docs_group('Autoscaling') class AutoscaledPool: """Manages a pool of asynchronous resource-intensive tasks that are executed in parallel. The pool only starts new tasks if there is enough free CPU and memory available. If an exception is thrown in any of the tasks, it is propagated and the pool is stopped. """ _AUTOSCALE_INTERVAL = timedelta(seconds=10) """Interval at which the autoscaled pool adjusts the desired concurrency based on the latest system status.""" _LOGGING_INTERVAL = timedelta(minutes=1) """Interval at which the autoscaled pool logs its current state.""" _DESIRED_CONCURRENCY_RATIO = 0.9 """Minimum ratio of desired concurrency that must be reached before allowing further scale-up.""" _SCALE_UP_STEP_RATIO = 0.05 """Fraction of desired concurrency to add during each scale-up operation.""" _SCALE_DOWN_STEP_RATIO = 0.05 """Fraction of desired concurrency to remove during each scale-down operation.""" _TASK_TIMEOUT: timedelta | None = None """Timeout within which the `run_task_function` must complete.""" def __init__( self, *, system_status: SystemStatus, concurrency_settings: ConcurrencySettings | None = None, run_task_function: Callable[[], Awaitable], is_task_ready_function: Callable[[], Awaitable[bool]], is_finished_function: Callable[[], Awaitable[bool]], ) -> None: """Initialize a new instance. Args: system_status: Provides data about system utilization (load). concurrency_settings: Settings of concurrency levels. run_task_function: A function that performs an asynchronous resource-intensive task. 
is_task_ready_function: A function that indicates whether `run_task_function` should be called. This function is called every time there is free capacity for a new task and it should indicate whether it should start a new task or not by resolving to either `True` or `False`. Besides its obvious use, it is also useful for task throttling to save resources. is_finished_function: A function that is called only when there are no tasks to be processed. If it resolves to `True` then the pool's run finishes. Being called only when there are no tasks being processed means that as long as `is_task_ready_function` keeps resolving to `True`, `is_finished_function` will never be called. To abort a run, use the `abort` method. """ concurrency_settings = concurrency_settings or ConcurrencySettings() self._system_status = system_status self._run_task_function = run_task_function self._is_task_ready_function = is_task_ready_function self._is_finished_function = is_finished_function self._desired_concurrency = concurrency_settings.desired_concurrency self._max_concurrency = concurrency_settings.max_concurrency self._min_concurrency = concurrency_settings.min_concurrency self._max_tasks_per_minute = concurrency_settings.max_tasks_per_minute self._log_system_status_task = RecurringTask(self._log_system_status, self._LOGGING_INTERVAL) self._autoscale_task = RecurringTask(self._autoscale, self._AUTOSCALE_INTERVAL) self._is_paused = False self._current_run: _AutoscaledPoolRun | None = None async def run(self) -> None: """Start the autoscaled pool and return when all tasks are completed and `is_finished_function` returns True. If there is an exception in one of the tasks, it will be re-raised. 
""" if self._current_run is not None: raise RuntimeError('The pool is already running') run = _AutoscaledPoolRun() self._current_run = run logger.debug('Starting the pool') self._autoscale_task.start() self._log_system_status_task.start() orchestrator = asyncio.create_task( self._worker_task_orchestrator(run), name='autoscaled pool worker task orchestrator' ) try: await run.result except AbortError: orchestrator.cancel() for task in run.worker_tasks: if not task.done(): task.cancel() finally: with suppress(asyncio.CancelledError): await self._autoscale_task.stop() with suppress(asyncio.CancelledError): await self._log_system_status_task.stop() if not orchestrator.done(): orchestrator.cancel() elif not orchestrator.cancelled() and orchestrator.exception() is not None: logger.error('Exception in worker task orchestrator', exc_info=orchestrator.exception()) logger.info('Waiting for remaining tasks to finish') for task in run.worker_tasks: if not task.done(): with suppress(BaseException): await task run.cleanup_done.set() self._current_run = None logger.debug('Pool cleanup finished') async def abort(self) -> None: """Interrupt the autoscaled pool and all the tasks in progress.""" if not self._current_run: raise RuntimeError('The pool is not running') self._current_run.result.set_exception(AbortError()) await self._current_run.cleanup_done.wait() def pause(self) -> None: """Pause the autoscaled pool so that it does not start new tasks.""" self._is_paused = True def resume(self) -> None: """Resume a paused autoscaled pool so that it continues starting new tasks.""" self._is_paused = False @property def desired_concurrency(self) -> int: """The current desired concurrency, possibly updated by the pool according to system load.""" return self._desired_concurrency @property def current_concurrency(self) -> int: """The number of concurrent tasks in progress.""" if self._current_run is None: return 0 return len(self._current_run.worker_tasks) def _autoscale(self) -> None: 
"""Inspect system load status and adjust desired concurrency if necessary. Do not call directly.""" status = self._system_status.get_historical_system_info() min_current_concurrency = math.floor(self._DESIRED_CONCURRENCY_RATIO * self.desired_concurrency) should_scale_up = ( status.is_system_idle and self._desired_concurrency < self._max_concurrency and self.current_concurrency >= min_current_concurrency ) should_scale_down = not status.is_system_idle and self._desired_concurrency > self._min_concurrency if should_scale_up: step = math.ceil(self._SCALE_UP_STEP_RATIO * self._desired_concurrency) self._desired_concurrency = min(self._max_concurrency, self._desired_concurrency + step) elif should_scale_down: step = math.ceil(self._SCALE_DOWN_STEP_RATIO * self._desired_concurrency) self._desired_concurrency = max(self._min_concurrency, self._desired_concurrency - step) def _log_system_status(self) -> None: system_status = self._system_status.get_historical_system_info() logger.info( f'current_concurrency = {self.current_concurrency}; ' f'desired_concurrency = {self.desired_concurrency}; ' f'{system_status!s}' ) async def _worker_task_orchestrator(self, run: _AutoscaledPoolRun) -> None: """Launch worker tasks whenever there is free capacity and a task is ready. Exits when `is_finished_function` returns True. 
""" finished = False try: while not (finished := await self._is_finished_function()) and not run.result.done(): run.worker_tasks_updated.clear() current_status = self._system_status.get_current_system_info() if not current_status.is_system_idle: logger.debug('Not scheduling new tasks - system is overloaded') elif self._is_paused: logger.debug('Not scheduling new tasks - the autoscaled pool is paused') elif self.current_concurrency >= self.desired_concurrency: logger.debug('Not scheduling new tasks - already running at desired concurrency') elif not await self._is_task_ready_function(): logger.debug('Not scheduling new task - no task is ready') else: logger.debug('Scheduling a new task') worker_task = asyncio.create_task(self._worker_task(), name='autoscaled pool worker task') worker_task.add_done_callback(lambda task: self._reap_worker_task(task, run)) run.worker_tasks.append(worker_task) if math.isfinite(self._max_tasks_per_minute): await asyncio.sleep(60 / self._max_tasks_per_minute) continue with suppress(asyncio.TimeoutError): await asyncio.wait_for(run.worker_tasks_updated.wait(), timeout=0.5) finally: if finished: logger.debug('`is_finished_function` reports that we are finished') elif run.result.done() and run.result.exception() is not None: logger.debug('Unhandled exception in `run_task_function`') if run.worker_tasks: logger.debug('Terminating - waiting for tasks to complete') await asyncio.wait(run.worker_tasks, return_when=asyncio.ALL_COMPLETED) logger.debug('Worker tasks finished') else: logger.debug('Terminating - no running tasks to wait for') if not run.result.done(): run.result.set_result(object()) def _reap_worker_task(self, task: asyncio.Task, run: _AutoscaledPoolRun) -> None: """Handle cleanup and tracking of a completed worker task. - Interrupt the run if the task encountered an exception. - Update the list of tasks in progress. - Notify the orchestrator about the task completion. 
""" run.worker_tasks_updated.set() run.worker_tasks.remove(task) if not task.cancelled() and (exception := task.exception()) and not run.result.done(): run.result.set_exception(exception) async def _worker_task(self) -> None: try: await asyncio.wait_for( self._run_task_function(), timeout=self._TASK_TIMEOUT.total_seconds() if self._TASK_TIMEOUT is not None else None, ) except asyncio.TimeoutError: timeout_str = self._TASK_TIMEOUT.total_seconds() if self._TASK_TIMEOUT is not None else '*not set*' logger.warning(f'Task timed out after {timeout_str} seconds') finally: logger.debug('Worker task finished') ================================================ FILE: src/crawlee/_autoscaling/py.typed ================================================ ================================================ FILE: src/crawlee/_autoscaling/snapshotter.py ================================================ # Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/autoscaling/snapshotter.ts from __future__ import annotations import functools from bisect import insort from datetime import datetime, timedelta, timezone from logging import getLogger from typing import TYPE_CHECKING, TypeVar, cast from crawlee import service_locator from crawlee._autoscaling._types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Ratio, Snapshot from crawlee._utils.byte_size import ByteSize from crawlee._utils.context import ensure_context from crawlee._utils.docs import docs_group from crawlee._utils.recurring_task import RecurringTask from crawlee._utils.system import MemoryInfo, MemoryUsageInfo, get_memory_info from crawlee.events._types import Event, EventSystemInfoData if TYPE_CHECKING: from types import TracebackType from crawlee.configuration import Configuration logger = getLogger(__name__) T = TypeVar('T', bound=Snapshot) @functools.lru_cache def _warn_once(warning_message: str) -> None: """Log a warning message only once.""" logger.warning(warning_message) class 
SortedSnapshotList(list[T]):
    """A list that maintains sorted order by `created_at` attribute for snapshot objects.

    The order is maintained only for items inserted through `add`; the inherited `list` methods insert items
    without any sorting.
    """

    def add(self, item: T) -> None:
        """Add an item to the list maintaining sorted order by `created_at` using binary search."""
        insort(self, item, key=lambda item: item.created_at)


@docs_group('Autoscaling')
class Snapshotter:
    """Monitors and logs system resource usage at predefined intervals for performance optimization.

    The class monitors and records the state of various system resources (CPU, memory, event loop, and client API)
    at predefined intervals. This continuous monitoring helps in identifying resource overloads and ensuring optimal
    performance of the application. It is utilized in the `AutoscaledPool` module to adjust task allocation
    dynamically based on the current demand and system load.
    """

    _EVENT_LOOP_SNAPSHOT_INTERVAL = timedelta(milliseconds=500)
    """The interval at which the event loop is sampled."""

    _CLIENT_SNAPSHOT_INTERVAL = timedelta(milliseconds=1000)
    """The interval at which the client is sampled."""

    _SNAPSHOT_HISTORY = timedelta(seconds=30)
    """The time interval for which the snapshots are kept."""

    _RESERVE_MEMORY_RATIO = 0.5
    """Fraction of memory kept in reserve. Used to calculate critical memory overload threshold."""

    _MEMORY_WARNING_COOLDOWN_PERIOD = timedelta(milliseconds=10000)
    """Minimum time interval between logging successive critical memory overload warnings."""

    _CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT = 2
    """Number of retries for a client request before considering it a failure due to rate limiting."""

    def __init__(
        self,
        *,
        max_used_cpu_ratio: float,
        max_used_memory_ratio: float,
        max_event_loop_delay: timedelta,
        max_client_errors: int,
        max_memory_size: ByteSize | Ratio,
    ) -> None:
        """Initialize a new instance.

        In most cases, you should use the `from_config` constructor to create a new instance based on
        the provided configuration.

        Args:
            max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than
                the provided ratio, the CPU is considered overloaded.
            max_used_memory_ratio: Sets the ratio, defining the maximum ratio of memory usage. When the memory usage
                is higher than the provided ratio of `max_memory_size`, the memory is considered overloaded.
            max_event_loop_delay: Sets the maximum delay of the event loop. When the delay is higher than the provided
                value, the event loop is considered overloaded.
            max_client_errors: Sets the maximum number of client errors (HTTP 429). When the number of client errors
                is higher than the provided number, the client is considered overloaded.
            max_memory_size: Sets the maximum amount of system memory to be used by the `AutoscaledPool`. When of type
                `ByteSize` then it is used as fixed memory size. When of type `Ratio` then it allows for dynamic memory
                scaling based on the available system memory.
        """
        self._max_used_cpu_ratio = max_used_cpu_ratio
        self._max_used_memory_ratio = max_used_memory_ratio
        self._max_event_loop_delay = max_event_loop_delay
        self._max_client_errors = max_client_errors
        self._max_memory_size = max_memory_size

        # One sorted history per monitored resource.
        self._cpu_snapshots = self._get_sorted_list_by_created_at(list[CpuSnapshot]())
        self._event_loop_snapshots = self._get_sorted_list_by_created_at(list[EventLoopSnapshot]())
        self._memory_snapshots = self._get_sorted_list_by_created_at(list[MemorySnapshot]())
        self._client_snapshots = self._get_sorted_list_by_created_at(list[ClientSnapshot]())

        # The event loop and the client are sampled by recurring tasks owned by this object; CPU and memory snapshots
        # are created from `SYSTEM_INFO` events instead (the listeners are registered in `__aenter__`).
        self._snapshot_event_loop_task = RecurringTask(self._snapshot_event_loop, self._EVENT_LOOP_SNAPSHOT_INTERVAL)
        self._snapshot_client_task = RecurringTask(self._snapshot_client, self._CLIENT_SNAPSHOT_INTERVAL)

        # Start one hour in the past, so that the cooldown check in `_evaluate_memory_load` does not suppress
        # the first critical memory warning.
        self._timestamp_of_last_memory_warning: datetime = datetime.now(timezone.utc) - timedelta(hours=1)

        # Flag to indicate the context state.
        self._active = False

    @classmethod
    def from_config(cls, config: Configuration | None = None) -> Snapshotter:
        """Initialize a new instance based on the provided `Configuration`.

        Args:
            config: The `Configuration` instance. Uses the global (default) one if not provided.
        """
        config = config or service_locator.get_configuration()

        # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided,
        # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's
        # total available memory based on `available_memory_ratio`.
        # Note that the check is a truthiness check, so a `memory_mbytes` of 0 also selects the ratio branch.
        max_memory_size = (
            ByteSize.from_mb(config.memory_mbytes)
            if config.memory_mbytes
            else Ratio(value=config.available_memory_ratio)
        )

        return cls(
            max_used_cpu_ratio=config.max_used_cpu_ratio,
            max_used_memory_ratio=config.max_used_memory_ratio,
            max_event_loop_delay=config.max_event_loop_delay,
            max_client_errors=config.max_client_errors,
            max_memory_size=max_memory_size,
        )

    @staticmethod
    def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedSnapshotList[T]:
        """Create a sorted list from the input list.

        Returns a custom list that maintains sorted order by created_at when items are added.
        The items of `input_list` are copied over in their given order, without sorting them.
        """
        result = SortedSnapshotList[T]()
        result.extend(input_list)
        return result

    @property
    def active(self) -> bool:
        """Indicate whether the context is active."""
        return self._active

    async def __aenter__(self) -> Snapshotter:
        """Start capturing snapshots at configured intervals.

        Registers the CPU and memory listeners for `SYSTEM_INFO` events and starts the recurring tasks that sample
        the event loop and the client.

        Raises:
            RuntimeError: If the context manager is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        self._active = True
        event_manager = service_locator.get_event_manager()
        event_manager.on(event=Event.SYSTEM_INFO, listener=self._snapshot_cpu)
        event_manager.on(event=Event.SYSTEM_INFO, listener=self._snapshot_memory)
        self._snapshot_event_loop_task.start()
        self._snapshot_client_task.start()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Stop all resource capturing.
This method stops capturing snapshots of system resources (CPU, memory, event loop, and client information). It should be called to terminate resource capturing when it is no longer needed. Raises: RuntimeError: If the context manager is not active. """ if not self._active: raise RuntimeError(f'The {self.__class__.__name__} is not active.') event_manager = service_locator.get_event_manager() event_manager.off(event=Event.SYSTEM_INFO, listener=self._snapshot_cpu) event_manager.off(event=Event.SYSTEM_INFO, listener=self._snapshot_memory) await self._snapshot_event_loop_task.stop() await self._snapshot_client_task.stop() self._active = False @ensure_context def get_memory_sample(self, duration: timedelta | None = None) -> list[Snapshot]: """Return a sample of the latest memory snapshots. Args: duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history. Returns: A sample of memory snapshots. """ snapshots = cast('list[Snapshot]', self._memory_snapshots) return self._get_sample(snapshots, duration) @ensure_context def get_event_loop_sample(self, duration: timedelta | None = None) -> list[Snapshot]: """Return a sample of the latest event loop snapshots. Args: duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history. Returns: A sample of event loop snapshots. """ snapshots = cast('list[Snapshot]', self._event_loop_snapshots) return self._get_sample(snapshots, duration) @ensure_context def get_cpu_sample(self, duration: timedelta | None = None) -> list[Snapshot]: """Return a sample of the latest CPU snapshots. Args: duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history. Returns: A sample of CPU snapshots. 
""" snapshots = cast('list[Snapshot]', self._cpu_snapshots) return self._get_sample(snapshots, duration) @ensure_context def get_client_sample(self, duration: timedelta | None = None) -> list[Snapshot]: """Return a sample of the latest client snapshots. Args: duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history. Returns: A sample of client snapshots. """ snapshots = cast('list[Snapshot]', self._client_snapshots) return self._get_sample(snapshots, duration) @staticmethod def _get_sample(snapshots: list[Snapshot], duration: timedelta | None = None) -> list[Snapshot]: """Return a time-limited sample from snapshots or full history if duration is None.""" if not duration: return snapshots if not snapshots: return [] latest_time = snapshots[-1].created_at return [snapshot for snapshot in snapshots if latest_time - snapshot.created_at <= duration] async def _snapshot_cpu(self, event_data: EventSystemInfoData) -> None: """Capture a snapshot of the current CPU usage. This method does not perform CPU usage measurement. Instead, it just reads the data received through the `event_data` parameter, which is expected to be supplied by the event manager. Must be `async` to ensure it is not scheduled to be run in own thread by the event manager, which could cause race conditions in snapshots manipulation(sorting and pruning). Args: event_data: System info data from which CPU usage is read. """ snapshot = CpuSnapshot( used_ratio=event_data.cpu_info.used_ratio, max_used_ratio=self._max_used_cpu_ratio, created_at=event_data.cpu_info.created_at, ) snapshots = cast('list[Snapshot]', self._cpu_snapshots) self._cpu_snapshots.add(snapshot) self._prune_snapshots(snapshots, self._cpu_snapshots[-1].created_at) async def _snapshot_memory(self, event_data: EventSystemInfoData) -> None: """Capture a snapshot of the current memory usage. This method does not perform memory usage measurement. 
Instead, it just reads the data received through
        the `event_data` parameter, which is expected to be supplied by the event manager.

        Must be `async` to ensure it is not scheduled to be run in own thread by the event manager, which could cause
        race conditions in snapshots manipulation(sorting and pruning).

        Args:
            event_data: System info data from which memory usage is read.
        """
        # Resolve the memory limit and the system-wide figures from the combination of the kind of memory info
        # received and the kind of limit configured (a fixed `ByteSize` or a `Ratio` of the total memory).
        match event_data.memory_info, self._max_memory_size:
            case MemoryInfo() as memory_info, Ratio() as ratio:
                max_memory_size = memory_info.total_size * ratio.value
                system_wide_used_size = memory_info.system_wide_used_size
                system_wide_memory_size = memory_info.total_size

            case MemoryUsageInfo(), Ratio() as ratio:
                # This is just hypothetical case, that will most likely not happen in practice.
                # `LocalEventManager` should always provide `MemoryInfo` in the event data.
                # When running on Apify, `self._max_memory_size` is always `ByteSize`, not `Ratio`.
                _warn_once(
                    'It is recommended that a custom implementation of `LocalEventManager` emits `SYSTEM_INFO` events '
                    'with `MemoryInfo` and not just `MemoryUsageInfo`.'
                )
                # The event carries no total memory size here, so it is obtained through `get_memory_info()`.
                max_memory_size = get_memory_info().total_size * ratio.value
                system_wide_used_size = None
                system_wide_memory_size = None

            case MemoryInfo() as memory_info, ByteSize() as byte_size:
                max_memory_size = byte_size
                system_wide_used_size = memory_info.system_wide_used_size
                system_wide_memory_size = memory_info.total_size

            case MemoryUsageInfo(), ByteSize() as byte_size:
                max_memory_size = byte_size
                system_wide_used_size = None
                system_wide_memory_size = None

            case _, _:
                raise NotImplementedError('Unsupported combination of memory info and max memory size types.')

        snapshot = MemorySnapshot(
            current_size=event_data.memory_info.current_size,
            max_memory_size=max_memory_size,
            max_used_memory_ratio=self._max_used_memory_ratio,
            created_at=event_data.memory_info.created_at,
            system_wide_used_size=system_wide_used_size,
            system_wide_memory_size=system_wide_memory_size,
        )

        snapshots = cast('list[Snapshot]', self._memory_snapshots)
        self._memory_snapshots.add(snapshot)
        # Prune relative to the newest stored snapshot, which is not necessarily the one that was just added.
        self._prune_snapshots(snapshots, self._memory_snapshots[-1].created_at)

        # May log a critical memory overload warning, see `_evaluate_memory_load`.
        self._evaluate_memory_load(
            event_data.memory_info.current_size,
            event_data.memory_info.created_at,
            max_memory_size=max_memory_size,
        )

    async def _snapshot_event_loop(self) -> None:
        """Capture a snapshot of the current event loop usage.

        This method evaluates the event loop's latency by comparing the expected time between snapshots to the actual
        time elapsed since the last snapshot. The delay in the snapshot reflects the time deviation due to event loop
        overhead - it's calculated by subtracting the expected interval between snapshots from the actual time elapsed
        since the last snapshot. If there's no previous snapshot, the delay is considered zero.

        Must be `async` to ensure it is not scheduled to be run in own thread by the event manager, which could cause
        race conditions in snapshots manipulation(sorting and pruning).
        """
        # `created_at` is filled in by the snapshot itself when it is constructed without an explicit value.
        snapshot = EventLoopSnapshot(max_delay=self._max_event_loop_delay, delay=timedelta(seconds=0))
        previous_snapshot = self._event_loop_snapshots[-1] if self._event_loop_snapshots else None

        if previous_snapshot:
            # The result is negative when the snapshot was taken sooner than one interval after the previous one.
            event_loop_delay = snapshot.created_at - previous_snapshot.created_at - self._EVENT_LOOP_SNAPSHOT_INTERVAL
            snapshot.delay = event_loop_delay

        snapshots = cast('list[Snapshot]', self._event_loop_snapshots)
        self._event_loop_snapshots.add(snapshot)
        self._prune_snapshots(snapshots, self._event_loop_snapshots[-1].created_at)

    async def _snapshot_client(self) -> None:
        """Capture a snapshot of the current API state by checking for rate limit errors (HTTP 429).

        Only errors produced by a 2nd retry of the API call are considered for snapshotting since earlier errors may
        just be caused by a random spike in the number of requests and do not necessarily signify API overloading.

        Must be `async` to ensure it is not scheduled to be run in own thread by the event manager, which could cause
        race conditions in snapshots manipulation(sorting and pruning).
        """
        client = service_locator.get_storage_client()

        # NOTE(review): the mapping is presumably keyed by the retry number - confirm against the storage client.
        rate_limit_errors: dict[int, int] = client.get_rate_limit_errors()

        error_count = rate_limit_errors.get(self._CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT, 0)
        previous_error_count = self._client_snapshots[-1].error_count if self._client_snapshots else 0
        # `error_count` is handled as a running total: only its increase since the previous snapshot is "new".
        snapshot = ClientSnapshot(
            error_count=error_count,
            new_error_count=error_count - previous_error_count,
            max_error_count=self._max_client_errors,
        )

        snapshots = cast('list[Snapshot]', self._client_snapshots)
        self._client_snapshots.add(snapshot)
        self._prune_snapshots(snapshots, self._client_snapshots[-1].created_at)

    def _prune_snapshots(self, snapshots: list[Snapshot], now: datetime) -> None:
        """Remove snapshots that are older than the `_SNAPSHOT_HISTORY` interval.

        This method modifies the list of snapshots in place, removing all snapshots that are older than the defined
        snapshot history relative to the `now` parameter.
Args:
            snapshots: List of snapshots to be pruned in place.
            now: The current date and time, used as the reference for pruning.
        """
        # Find the index where snapshots start to be within the allowed history window.
        # We'll keep snapshots from this index onwards.
        # Stopping at the first match relies on `snapshots` being ordered by `created_at`.
        keep_from_index = None
        for i, snapshot in enumerate(snapshots):
            if now - snapshot.created_at <= self._SNAPSHOT_HISTORY:
                keep_from_index = i
                break

        # If all snapshots are old, keep_from_index will remain None, so we clear the list.
        # Otherwise, we keep only the recent snapshots.
        if keep_from_index is not None:
            del snapshots[:keep_from_index]
        else:
            snapshots.clear()

    def _evaluate_memory_load(
        self, current_memory_usage_size: ByteSize, snapshot_timestamp: datetime, max_memory_size: ByteSize
    ) -> None:
        """Evaluate and logs critical memory load conditions based on the system information.

        A warning is logged when the current usage exceeds the allowed share of `max_memory_size` plus
        `_RESERVE_MEMORY_RATIO` of the remaining headroom, at most once per `_MEMORY_WARNING_COOLDOWN_PERIOD`.

        Args:
            current_memory_usage_size: The current memory usage.
            snapshot_timestamp: The time at which the memory snapshot was taken.
            max_memory_size: The maximum memory size to be used for evaluation.
        """
        # Check if the warning has been logged recently to avoid spamming
        if snapshot_timestamp < self._timestamp_of_last_memory_warning + self._MEMORY_WARNING_COOLDOWN_PERIOD:
            return

        # Usage above `threshold_memory_size` already counts as overloaded; the critical threshold lies further up,
        # at the given fraction of the space between that threshold and `max_memory_size`.
        threshold_memory_size = self._max_used_memory_ratio * max_memory_size
        buffer_memory_size = max_memory_size * (1 - self._max_used_memory_ratio) * self._RESERVE_MEMORY_RATIO
        overload_memory_threshold_size = threshold_memory_size + buffer_memory_size

        # Log a warning if current memory usage exceeds the critical overload threshold
        if current_memory_usage_size > overload_memory_threshold_size:
            memory_usage_percentage = round((current_memory_usage_size.bytes / max_memory_size.bytes) * 100)
            logger.warning(
                f'Memory is critically overloaded. Using {current_memory_usage_size} of '
                f'{max_memory_size} ({memory_usage_percentage}%). '
                'Consider increasing available memory.'
)
            # Remember when the warning was logged, which starts the cooldown period.
            self._timestamp_of_last_memory_warning = snapshot_timestamp


================================================
FILE: src/crawlee/_autoscaling/system_status.py
================================================
# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/autoscaling/system_status.ts

from __future__ import annotations

from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING

from more_itertools import pairwise

from crawlee._autoscaling._types import LoadRatioInfo, Snapshot, SystemInfo
from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from crawlee._autoscaling import Snapshotter

logger = getLogger(__name__)


@docs_group('Autoscaling')
class SystemStatus:
    """Provides a simple interface for evaluating system resource usage from snapshots collected by `Snapshotter`.

    This class aggregates and interprets snapshots from a Snapshotter instance to evaluate the current and historical
    status of system resources like CPU, memory, event loop, and client API usage. It exposes two methods
    `get_current_system_info` and `get_historical_system_info`. The system information is computed as the
    time-weighted share of overloaded snapshots, with the weights being the time intervals between the snapshots.
    Each resource is computed separately, and the system is considered as overloaded whenever at least one resource
    is overloaded.

    `get_current_system_info` returns a `SystemInfo` data structure that represents the current status
    of the system. The length of the current timeframe is configurable by the `max_snapshot_age` option
    (a `timedelta`) and represents the max age of snapshots to be considered for the computation.

    `SystemStatus.get_historical_system_info` returns a `SystemInfo` that represents the long-term status
    of the system. It considers the full snapshot history available in the `Snapshotter` instance.
    """

    def __init__(
        self,
        snapshotter: Snapshotter,
        *,
        max_snapshot_age: timedelta = timedelta(seconds=5),
        cpu_overload_threshold: float = 0.4,
        memory_overload_threshold: float = 0.2,
        event_loop_overload_threshold: float = 0.6,
        client_overload_threshold: float = 0.3,
    ) -> None:
        """Initialize a new instance.

        Args:
            snapshotter: The `Snapshotter` instance to be queried for `SystemStatus`.
            max_snapshot_age: Defines max age of snapshots used in the `SystemStatus.get_current_system_info`
                measurement.
            cpu_overload_threshold: Sets the threshold of overloaded snapshots in the CPU sample.
                If the sample exceeds this threshold, the system will be considered overloaded.
            memory_overload_threshold: Sets the threshold of overloaded snapshots in the memory sample.
                If the sample exceeds this threshold, the system will be considered overloaded.
            event_loop_overload_threshold: Sets the threshold of overloaded snapshots in the event loop sample.
                If the sample exceeds this threshold, the system will be considered overloaded.
            client_overload_threshold: Sets the threshold of overloaded snapshots in the Client sample.
                If the sample exceeds this threshold, the system will be considered overloaded.
        """
        self._snapshotter = snapshotter
        self._max_snapshot_age = max_snapshot_age
        self._cpu_overload_threshold = cpu_overload_threshold
        self._memory_overload_threshold = memory_overload_threshold
        self._event_loop_overload_threshold = event_loop_overload_threshold
        self._client_overload_threshold = client_overload_threshold

    def get_current_system_info(self) -> SystemInfo:
        """Retrieve and evaluates the current status of system resources.

        Considers snapshots within the `_max_snapshot_age` timeframe and determines if the system is currently
        overloaded based on predefined thresholds for each resource type.

        Returns:
            An object representing the current system status.
        """
        return self._get_system_info(sample_duration=self._max_snapshot_age)

    def get_historical_system_info(self) -> SystemInfo:
        """Retrieve and evaluates the historical status of system resources.

        Considers the entire history of snapshots from the Snapshotter to assess long-term system performance and
        determines if the system has been historically overloaded.

        Returns:
            An object representing the historical system status.
        """
        # No `sample_duration` is passed, which selects the full snapshot history.
        return self._get_system_info()

    def _get_system_info(self, *, sample_duration: timedelta | None = None) -> SystemInfo:
        """Get system information based on the overload state of different resources within a specified duration.

        Args:
            sample_duration: Specific duration for which to evaluate the system status. If None, evaluates across
                the entire history available in the snapshotter.

        Returns:
            Aggregated system status indicating whether the system is idle or overloaded.
        """
        # Every resource is evaluated on its own, from its own sample and with its own threshold.
        mem_info = self._is_memory_overloaded(sample_duration)
        event_loop_info = self._is_event_loop_overloaded(sample_duration)
        cpu_info = self._is_cpu_overloaded(sample_duration)
        client_info = self._is_client_overloaded(sample_duration)

        return SystemInfo(
            memory_info=mem_info,
            event_loop_info=event_loop_info,
            cpu_info=cpu_info,
            client_info=client_info,
        )

    def _is_cpu_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:
        """Determine if the CPU has been overloaded within a specified time duration.

        Args:
            sample_duration: The duration within which to analyze CPU snapshots. If None, evaluates across
                the entire history available in the snapshotter.

        Returns:
            CPU load ratio information.
        """
        sample = self._snapshotter.get_cpu_sample(sample_duration)
        return self._is_sample_overloaded(sample, self._cpu_overload_threshold)

    def _is_memory_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:
        """Determine if memory has been overloaded within a specified time duration.

        Args:
            sample_duration: The duration within which to analyze memory snapshots.
If None, evaluates across the entire history available in the snapshotter. Returns: Memory load ratio information. """ sample = self._snapshotter.get_memory_sample(sample_duration) return self._is_sample_overloaded(sample, self._memory_overload_threshold) def _is_event_loop_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo: """Determine if the event loop has been overloaded within a specified time duration. Args: sample_duration: The duration within which to analyze event loop snapshots. If None, evaluates across the entire history available in the snapshotter. Returns: Event loop load ratio information. """ sample = self._snapshotter.get_event_loop_sample(sample_duration) return self._is_sample_overloaded(sample, self._event_loop_overload_threshold) def _is_client_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo: """Determine if the client has been overloaded within a specified time duration. Args: sample_duration: The duration within which to analyze client snapshots. If None, evaluates across the entire history available in the snapshotter. Returns: Client load ratio information. """ sample = self._snapshotter.get_client_sample(sample_duration) return self._is_sample_overloaded(sample, self._client_overload_threshold) def _is_sample_overloaded(self, sample: list[Snapshot], threshold: float) -> LoadRatioInfo: """Determine if a sample of snapshot data is overloaded based on a specified ratio. Args: sample: A list of snapshot data to analyze. threshold: The threshold ratio to use for determining if the sample is overloaded. Returns: An object with an `is_overloaded` property set to `True` if the sample is considered overloaded based on the specified threshold ratio. Otherwise, `is_overloaded` is set to `False`. 
""" if not sample: return LoadRatioInfo(limit_ratio=threshold, actual_ratio=0) if len(sample) == 1: return LoadRatioInfo(limit_ratio=threshold, actual_ratio=float(sample[0].is_overloaded)) overloaded_time = 0.0 non_overloaded_time = 0.0 for previous, current in pairwise(sample): time = (current.created_at - previous.created_at).total_seconds() if time < 0: raise ValueError('Negative time. Code assumptions are not valid. Expected time sorted samples.') if current.is_overloaded: overloaded_time += time else: non_overloaded_time += time if (total_time := overloaded_time + non_overloaded_time) == 0: overloaded_ratio = 0.0 else: overloaded_ratio = overloaded_time / total_time return LoadRatioInfo(limit_ratio=threshold, actual_ratio=round(overloaded_ratio, 3)) ================================================ FILE: src/crawlee/_cli.py ================================================ # ruff: noqa: FBT002 from __future__ import annotations import importlib.resources import json import sys from pathlib import Path from typing import Annotated, cast from click import Choice try: import inquirer import typer from cookiecutter.main import cookiecutter from inquirer.render.console import ConsoleRender from rich.progress import Progress, SpinnerColumn, TextColumn except ModuleNotFoundError as exc: raise ImportError( "Missing required dependencies for the Crawlee CLI. It looks like you're running 'crawlee' " "without the CLI extra. Try using 'crawlee[cli]' instead." 
) from exc cli = typer.Typer(no_args_is_help=True) template_directory = importlib.resources.files('crawlee') / 'project_template' with (template_directory / 'cookiecutter.json').open() as f: cookiecutter_json = json.load(f) crawler_choices = cookiecutter_json['crawler_type'] http_client_choices = cookiecutter_json['http_client'] package_manager_choices = cookiecutter_json['package_manager'] default_start_url = cookiecutter_json['start_url'] default_enable_apify_integration = cookiecutter_json['enable_apify_integration'] default_install_project = cookiecutter_json['install_project'] @cli.callback(invoke_without_command=True) def callback( version: Annotated[ bool, typer.Option( '-V', '--version', help='Print Crawlee version', ), ] = False, ) -> None: """Crawlee is a web scraping and browser automation library.""" if version: from crawlee import __version__ # noqa: PLC0415 typer.echo(__version__) def _prompt_for_project_name(initial_project_name: str | None) -> str: """Prompt the user for a non-empty project name that does not lead to an existing folder.""" while True: if initial_project_name is not None: project_name = initial_project_name initial_project_name = None else: project_name = ConsoleRender().render( inquirer.Text( name='project_name', message='Name of the new project folder', validate=lambda _, value: bool(value.strip()), ), ) if not project_name: typer.echo('Project name is required.', err=True) continue project_path = Path.cwd() / project_name if project_path.exists(): typer.echo(f'Folder {project_path} already exists. 
Please choose another name.', err=True) continue return project_name def _prompt_text(message: str, default: str) -> str: return cast( 'str', ConsoleRender().render( inquirer.Text( name='text', message=message, default=default, validate=lambda _, value: bool(value.strip()), ), ), ) def _prompt_choice(message: str, choices: list[str]) -> str: """Prompt the user to pick one from a list of choices.""" return cast( 'str', ConsoleRender().render( inquirer.List( name='choice', message=message, choices=[(choice[0].upper() + choice[1:], choice) for choice in choices], ), ), ) def _prompt_bool(message: str, *, default: bool) -> bool: return cast( 'bool', ConsoleRender().render( inquirer.Confirm( name='confirm', message=message, default=default, ), ), ) @cli.command() def create( project_name: str | None = typer.Argument( default=None, show_default=False, help='The name of the project and the directory that will be created to contain it. ' 'If none is given, you will be prompted.', ), crawler_type: str | None = typer.Option( None, '--crawler-type', '--template', show_default=False, click_type=Choice(crawler_choices), help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.', ), http_client: str | None = typer.Option( None, show_default=False, click_type=Choice(http_client_choices), help='The library that will be used to make HTTP requests in your crawler. ' 'If none is given, you will be prompted.', ), package_manager: str | None = typer.Option( default=None, show_default=False, click_type=Choice(package_manager_choices), help='Package manager to be used in the new project. If none is given, you will be prompted.', ), start_url: str | None = typer.Option( default=None, show_default=False, metavar='[START_URL]', help='The URL where crawling should start. 
If none is given, you will be prompted.', ), *, enable_apify_integration: bool | None = typer.Option( None, '--apify/--no-apify', show_default=False, help='Should Apify integration be set up for you? If not given, you will be prompted.', ), install_project: bool | None = typer.Option( None, '--install/--no-install', show_default=False, help='Should the project be installed now? If not given, you will be prompted.', ), ) -> None: """Bootstrap a new Crawlee project.""" try: # Prompt for project name if not provided. project_name = _prompt_for_project_name(project_name) # Prompt for crawler_type if not provided. if crawler_type is None: crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices) # Prompt for http_client if not provided. if http_client is None: http_client = _prompt_choice('Please select the HTTP client', http_client_choices) # Prompt for package manager if not provided. if package_manager is None: package_manager = _prompt_choice('Please select the package manager', package_manager_choices) # Prompt for start URL if start_url is None: start_url = _prompt_text('Please specify the start URL', default=default_start_url) # Ask about Apify integration if not explicitly configured if enable_apify_integration is None: enable_apify_integration = _prompt_bool( 'Should Apify integration be set up for you?', default=default_enable_apify_integration ) # Ask about installing the project if install_project is None: install_project = _prompt_bool('Should the project be installed now?', default=default_install_project) if all( [ project_name, crawler_type, http_client, package_manager, start_url, enable_apify_integration is not None, install_project is not None, ] ): package_name = project_name.replace('-', '_') # Start the bootstrap process. 
with Progress( SpinnerColumn(), TextColumn('[progress.description]{task.description}'), transient=True, ) as progress: bootstrap_task = progress.add_task(description='Bootstrapping...', total=None) try: cookiecutter( template=str(template_directory), no_input=True, extra_context={ 'project_name': project_name, 'package_manager': package_manager, 'crawler_type': crawler_type, 'http_client': http_client, 'enable_apify_integration': enable_apify_integration, 'start_url': start_url, 'install_project': install_project, }, ) except Exception as exc: progress.update(bootstrap_task, visible=False) progress.refresh() # Print just the last line of the error message (the actual error without traceback) if 'Hook script failed' in str(exc): typer.echo('Project creation failed. Check the error message above.', err=True) else: typer.echo(f'Project creation failed: {exc!s}', err=True) sys.exit(1) typer.echo(f'Your project "{project_name}" was created.') if install_project: if package_manager == 'pip': typer.echo( f'To run it, navigate to the directory: "cd {project_name}", ' f'activate the virtual environment in ".venv" ("source .venv/bin/activate") ' f'and run your project using "python -m {package_name}".' ) else: typer.echo( f'To run it, navigate to the directory: "cd {project_name}", ' f'and run it using "{package_manager} run python -m {package_name}".' ) elif package_manager == 'pip': typer.echo( f'To run it, navigate to the directory: "cd {project_name}", ' f'install the dependencies listed in "requirements.txt" ' f'and run it using "python -m {package_name}".' ) else: install_command = 'sync' if package_manager == 'uv' else 'install' typer.echo( f'To run it, navigate to the directory: "cd {project_name}", ' f'install the project using "{package_manager} {install_command}", ' f'and run it using "{package_manager} run python -m {package_name}".' 
) typer.echo(f'See the "{project_name}/README.md" for more information.') except KeyboardInterrupt: typer.echo('Operation cancelled by user.') ================================================ FILE: src/crawlee/_consts.py ================================================ from __future__ import annotations METADATA_FILENAME = '__metadata__.json' """The name of the metadata file for storage clients.""" ================================================ FILE: src/crawlee/_log_config.py ================================================ from __future__ import annotations import json import logging import sys import textwrap from typing import TYPE_CHECKING, Any from colorama import Fore, Style, just_fix_windows_console from typing_extensions import assert_never from crawlee import service_locator if TYPE_CHECKING: from crawlee._types import LogLevel just_fix_windows_console() _LOG_NAME_COLOR = Fore.LIGHTBLACK_EX _LOG_LEVEL_COLOR = { logging.DEBUG: Fore.BLUE, logging.INFO: Fore.GREEN, logging.WARNING: Fore.YELLOW, logging.ERROR: Fore.RED, logging.CRITICAL: Fore.RED, } _LOG_LEVEL_SHORT_ALIAS = { logging.DEBUG: 'DEBUG', logging.INFO: 'INFO ', logging.WARNING: 'WARN ', logging.ERROR: 'ERROR', } # So that all the log messages have the same alignment _LOG_MESSAGE_INDENT = ' ' * 6 def string_to_log_level(level: LogLevel) -> int: """Convert a string representation of a log level to an integer log level.""" if level == 'DEBUG': return logging.DEBUG if level == 'INFO': return logging.INFO if level == 'WARNING': return logging.WARNING if level == 'ERROR': return logging.ERROR if level == 'CRITICAL': return logging.CRITICAL assert_never(level) def get_configured_log_level() -> int: config = service_locator.get_configuration() if 'log_level' in config.model_fields_set: return string_to_log_level(config.log_level) if sys.flags.dev_mode: return logging.DEBUG return logging.INFO def configure_logger(logger: logging.Logger, *, remove_old_handlers: bool = False) -> None: handler = 
logging.StreamHandler() handler.setFormatter(CrawleeLogFormatter()) if remove_old_handlers: for old_handler in logger.handlers[:]: logger.removeHandler(old_handler) logger.addHandler(handler) logger.setLevel(get_configured_log_level()) # Do not propagate the log messages to the parent logger to prevent duplicate log messages. logger.propagate = False class CrawleeLogFormatter(logging.Formatter): """Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields. It formats the log records so that they: - start with the level (colorized, and padded to 5 chars so that it is nicely aligned) - then have the actual log message, if it's multiline then it's nicely indented - then have the stringified extra log fields - then, if an exception is a part of the log record, prints the formatted exception. """ # The fields that are added to the log record with `logger.log(..., extra={...})` are just merged in the log record # with the other log record properties, and you can't get them in some nice, isolated way. So, to get the extra # fields, we just compare all the properties present in the log record with properties present in an empty log # record, and extract all the extra ones not present in the empty log record. empty_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None) def __init__( self, include_logger_name: bool = True, # noqa: FBT001, FBT002 *args: Any, **kwargs: Any, ) -> None: """Initialize a new instance. Args: include_logger_name: Include logger name at the beginning of the log line. args: Arguments passed to the parent class. kwargs: Keyword arguments passed to the parent class. 
""" super().__init__(*args, **kwargs) self.include_logger_name = include_logger_name def _get_extra_fields(self, record: logging.LogRecord) -> dict[str, Any]: extra_fields: dict[str, Any] = {} for key, value in record.__dict__.items(): if key not in self.empty_record.__dict__: extra_fields[key] = value # noqa: PERF403 return extra_fields def format(self, record: logging.LogRecord) -> str: """Format the log record nicely. This formats the log record so that it: - starts with the level (colorized, and padded to 5 chars so that it is nicely aligned) - then has the actual log message, if it's multiline then it's nicely indented - then has the stringified extra log fields - then, if an exception is a part of the log record, prints the formatted exception. """ logger_name_string = f'{_LOG_NAME_COLOR}[{record.name}]{Style.RESET_ALL} ' # Colorize the log level, and shorten it to 6 chars tops level_color_code = _LOG_LEVEL_COLOR.get(record.levelno, '') level_short_alias = _LOG_LEVEL_SHORT_ALIAS.get(record.levelno, record.levelname) level_string = f'{level_color_code}{level_short_alias}{Style.RESET_ALL} ' # Format the extra log record fields, if there were some # Just stringify them to JSON and color them gray extra_string = '' extra = self._get_extra_fields(record) if extra: extra_string = ( f' {Fore.LIGHTBLACK_EX}({json.dumps(extra, ensure_ascii=False, default=str)}){Style.RESET_ALL}' ) # Call the parent method so that it populates missing fields in the record super().format(record) # Format the actual log message log_string = self.formatMessage(record) # Format the exception, if there is some # Basically just print the traceback and indent it a bit exception_string = '' if record.exc_text: exception_string = '\n' + textwrap.indent(record.exc_text.rstrip(), _LOG_MESSAGE_INDENT) else: exception_string = '' if self.include_logger_name: # Include logger name at the beginning of the log line return 
f'{logger_name_string}{level_string}{log_string}{extra_string}{exception_string}' return f'{level_string}{log_string}{extra_string}{exception_string}' ================================================ FILE: src/crawlee/_request.py ================================================ from __future__ import annotations from collections.abc import Iterator, MutableMapping from datetime import datetime from enum import IntEnum from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, PlainValidator, TypeAdapter from yarl import URL from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.docs import docs_group from crawlee._utils.requests import compute_unique_key from crawlee._utils.urls import validate_http_url if TYPE_CHECKING: from typing_extensions import NotRequired, Required, Self class RequestState(IntEnum): """Crawlee-specific request handling state.""" UNPROCESSED = 0 BEFORE_NAV = 1 AFTER_NAV = 2 REQUEST_HANDLER = 3 DONE = 4 ERROR_HANDLER = 5 ERROR = 6 SKIPPED = 7 class CrawleeRequestData(BaseModel): """Crawlee-specific configuration stored in the `user_data`.""" max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None """Maximum number of retries for this request. 
Allows to override the global `max_request_retries` option of `BasicCrawler`.""" enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None """The strategy that was used for enqueuing the request.""" state: RequestState = RequestState.UNPROCESSED """Describes the request's current lifecycle state.""" session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None """The number of finished session rotations for this request.""" skip_navigation: Annotated[bool, Field(alias='skipNavigation')] = False last_proxy_tier: Annotated[int | None, Field(alias='lastProxyTier')] = None """The last proxy tier used to process the request.""" forefront: Annotated[bool, Field()] = False """Indicate whether the request should be enqueued at the front of the queue.""" crawl_depth: Annotated[int, Field(alias='crawlDepth')] = 0 """The depth of the request in the crawl tree.""" session_id: Annotated[str | None, Field()] = None """ID of a session to which the request is bound.""" class UserData(BaseModel, MutableMapping[str, JsonSerializable]): """Represents the `user_data` part of a Request. Apart from the well-known attributes (`label` and `__crawlee`), it can also contain arbitrary JSON-compatible values. 
""" model_config = ConfigDict(extra='allow') __pydantic_extra__: dict[str, JsonSerializable] = Field(init=False) crawlee_data: Annotated[CrawleeRequestData | None, Field(alias='__crawlee')] = None """Crawlee-specific configuration stored in the `user_data`.""" label: Annotated[str | None, Field()] = None """Label used for request routing.""" def __getitem__(self, key: str) -> JsonSerializable: return self.__pydantic_extra__[key] def __setitem__(self, key: str, value: JsonSerializable) -> None: if key == 'label': if value is not None and not isinstance(value, str): raise ValueError('`label` must be str or None') self.label = value self.__pydantic_extra__[key] = value def __delitem__(self, key: str) -> None: del self.__pydantic_extra__[key] def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override] yield from self.__pydantic_extra__ def __len__(self) -> int: return len(self.__pydantic_extra__) def __eq__(self, other: object) -> bool: if isinstance(other, BaseModel): return super().__eq__(other) if isinstance(other, dict): return self.model_dump() == other return NotImplemented def __hash__(self) -> int: """Return hash based on the model fields.""" data = self.model_dump() return hash(tuple(sorted(data.items()))) user_data_adapter = TypeAdapter(UserData) @docs_group('Other') class RequestOptions(TypedDict): """Options that can be used to customize request creation. This type exactly matches the parameters of `Request.from_url` method. 
""" url: Required[str] method: NotRequired[HttpMethod] headers: NotRequired[HttpHeaders | dict[str, str] | None] payload: NotRequired[HttpPayload | str | None] label: NotRequired[str | None] session_id: NotRequired[str | None] unique_key: NotRequired[str | None] id: NotRequired[str | None] keep_url_fragment: NotRequired[bool] use_extended_unique_key: NotRequired[bool] always_enqueue: NotRequired[bool] user_data: NotRequired[dict[str, JsonSerializable]] no_retry: NotRequired[bool] enqueue_strategy: NotRequired[EnqueueStrategy] max_retries: NotRequired[int | None] @docs_group('Storage data') class Request(BaseModel): """Represents a request in the Crawlee framework, containing the necessary information for crawling operations. The `Request` class is one of the core components in Crawlee, utilized by various components such as request providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests, including the URL, HTTP method, headers, payload, and user data. The user data allows custom information to be stored and persisted throughout the request lifecycle, including its retries. Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used for request deduplication, controlling retries, handling state management, and enabling configuration for session rotation and proxy handling. The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically generates a unique key and identifier based on the URL and request parameters. ### Usage ```python from crawlee import Request request = Request.from_url('https://crawlee.dev') ``` """ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)] """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing to the same URL. 
If `unique_key` is not provided, then it is automatically generated by normalizing the URL. For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key` of `http://www.example.com/something`. Pass an arbitrary non-empty text value to the `unique_key` property to override the default behavior and specify which URLs shall be considered equal. """ url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)] """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters and fragments.""" method: Annotated[HttpMethod, Field(frozen=True)] = 'GET' """HTTP request method.""" payload: Annotated[ HttpPayload | None, BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v), PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v), Field(frozen=True), ] = None """HTTP request payload.""" # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: headers: HttpHeaders = HttpHeaders() """HTTP request headers.""" user_data: dict[str, JsonSerializable] = {} """Custom user data assigned to the request. Use this to save any request related data to the request's scope, keeping them accessible on retries, failures etc. """ else: headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] """HTTP request headers.""" user_data: Annotated[ dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience Field(alias='userData', default_factory=UserData), PlainValidator(user_data_adapter.validate_python), PlainSerializer( lambda instance: user_data_adapter.dump_python( instance, by_alias=True, exclude_none=False, exclude_unset=True, exclude_defaults=True, ) ), ] """Custom user data assigned to the request. Use this to save any request related data to the request's scope, keeping them accessible on retries, failures etc. 
""" retry_count: Annotated[int, Field(alias='retryCount')] = 0 """Number of times the request has been retried.""" no_retry: Annotated[bool, Field(alias='noRetry')] = False """If set to `True`, the request will not be retried in case of failure.""" loaded_url: Annotated[str | None, BeforeValidator(validate_http_url), Field(alias='loadedUrl')] = None """URL of the web page that was loaded. This can differ from the original URL in case of redirects.""" handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None """Timestamp when the request was handled.""" @classmethod def from_url( cls, url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None, payload: HttpPayload | str | None = None, label: str | None = None, session_id: str | None = None, unique_key: str | None = None, keep_url_fragment: bool = False, use_extended_unique_key: bool = False, always_enqueue: bool = False, enqueue_strategy: EnqueueStrategy | None = None, max_retries: int | None = None, **kwargs: Any, ) -> Self: """Create a new `Request` instance from a URL. This is recommended constructor for creating new `Request` instances. It generates a `Request` object from a given URL with additional options to customize HTTP method, payload, unique key, and other request properties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL, method and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags. Args: url: The URL of the request. method: The HTTP method of the request. headers: The HTTP headers of the request. payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests. label: A custom label to differentiate between request types. This is stored in `user_data`, and it is used for request routing (different requests go to different handlers). session_id: ID of a specific `Session` to which the request will be strictly bound. 
If the session becomes unavailable when the request is processed, a `RequestCollisionError` will be raised. unique_key: A unique key identifying the request. If not provided, it is automatically computed based on the URL and other parameters. Requests with the same `unique_key` are treated as identical. keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in the `unique_key` computation. This is only relevant when `unique_key` is not provided. use_extended_unique_key: Determines whether to include the HTTP method, ID Session and payload in the `unique_key` computation. This is only relevant when `unique_key` is not provided. always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue. Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`. enqueue_strategy: The strategy that will be used for enqueuing the request. max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries` option of `BasicCrawler`. **kwargs: Additional request properties. 
""" if unique_key is not None and always_enqueue: raise ValueError('`always_enqueue` cannot be used with a custom `unique_key`') if isinstance(headers, dict) or headers is None: headers = HttpHeaders(headers or {}) if isinstance(payload, str): payload = payload.encode() unique_key = unique_key or compute_unique_key( url, method=method, headers=headers, payload=payload, session_id=session_id, keep_url_fragment=keep_url_fragment, use_extended_unique_key=use_extended_unique_key, ) if always_enqueue: unique_key = f'{crypto_random_object_id()}|{unique_key}' user_data_dict = kwargs.pop('user_data', {}) or {} crawlee_data_dict = user_data_dict.get('__crawlee', {}) if max_retries is not None: crawlee_data_dict['maxRetries'] = max_retries if enqueue_strategy is not None: crawlee_data_dict['enqueueStrategy'] = enqueue_strategy crawlee_data = CrawleeRequestData(**crawlee_data_dict) if crawlee_data: user_data_dict['__crawlee'] = crawlee_data request = cls( url=url, unique_key=unique_key, method=method, headers=headers, payload=payload, user_data=user_data_dict, **kwargs, ) if label is not None: request.user_data['label'] = label if session_id is not None: request.crawlee_data.session_id = session_id return request def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None: """Get the value of a specific query parameter from the URL.""" query_params = URL(self.url).query return query_params.get(param, default) @property def label(self) -> str | None: """A string used to differentiate between arbitrary request types.""" return cast('UserData', self.user_data).label @property def session_id(self) -> str | None: """The ID of the bound session, if there is any.""" return self.crawlee_data.session_id @property def crawlee_data(self) -> CrawleeRequestData: """Crawlee-specific configuration stored in the `user_data`.""" user_data = cast('UserData', self.user_data) if user_data.crawlee_data is None: user_data.crawlee_data = CrawleeRequestData() return 
user_data.crawlee_data @property def crawl_depth(self) -> int: """The depth of the request in the crawl tree.""" return self.crawlee_data.crawl_depth @crawl_depth.setter def crawl_depth(self, new_value: int) -> None: self.crawlee_data.crawl_depth = new_value @property def state(self) -> RequestState: """Crawlee-specific request handling state.""" return self.crawlee_data.state @state.setter def state(self, new_state: RequestState) -> None: self.crawlee_data.state = new_state @property def max_retries(self) -> int | None: """Crawlee-specific limit on the number of retries of the request.""" return self.crawlee_data.max_retries @property def session_rotation_count(self) -> int | None: """Crawlee-specific number of finished session rotations for the request.""" return self.crawlee_data.session_rotation_count @session_rotation_count.setter def session_rotation_count(self, new_session_rotation_count: int) -> None: self.crawlee_data.session_rotation_count = new_session_rotation_count @property def enqueue_strategy(self) -> EnqueueStrategy: """The strategy that was used for enqueuing the request.""" return self.crawlee_data.enqueue_strategy or 'all' @enqueue_strategy.setter def enqueue_strategy(self, new_enqueue_strategy: EnqueueStrategy) -> None: self.crawlee_data.enqueue_strategy = new_enqueue_strategy @property def last_proxy_tier(self) -> int | None: """The last proxy tier used to process the request.""" return self.crawlee_data.last_proxy_tier @last_proxy_tier.setter def last_proxy_tier(self, new_value: int) -> None: self.crawlee_data.last_proxy_tier = new_value @property def forefront(self) -> bool: """Indicate whether the request should be enqueued at the front of the queue.""" return self.crawlee_data.forefront @forefront.setter def forefront(self, new_value: bool) -> None: self.crawlee_data.forefront = new_value @property def was_already_handled(self) -> bool: """Indicates whether the request was handled.""" return self.handled_at is not None class 
@docs_group('Configuration')
class ServiceLocator:
    """Service locator for managing the services used by Crawlee.

    Every service is initialized lazily: when no instance was passed to the constructor or to the matching
    setter, a default one is created the first time its getter is called.
    """

    # Stored on the class (not on the instance), so all `ServiceLocator` instances share one manager.
    global_storage_instance_manager: StorageInstanceManager | None = None

    def __init__(
        self,
        configuration: Configuration | None = None,
        event_manager: EventManager | None = None,
        storage_client: StorageClient | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            configuration: Configuration to use. When `None`, a default one is created on first access.
            event_manager: Event manager to use. When `None`, a default one is created on first access.
            storage_client: Storage client to use. When `None`, a default one is created on first access.
        """
        self._configuration = configuration
        self._event_manager = event_manager
        self._storage_client = storage_client

    def get_configuration(self) -> Configuration:
        """Get the configuration, creating a default `Configuration` if none has been set yet."""
        if self._configuration is None:
            logger.debug('No configuration set, implicitly creating and using default Configuration.')
            self._configuration = Configuration()

        return self._configuration

    def set_configuration(self, configuration: Configuration) -> None:
        """Set the configuration.

        Setting the very same instance again is a no-op.

        Args:
            configuration: The configuration to set.

        Raises:
            ServiceConflictError: If a different configuration is already in place (set explicitly or created
                implicitly by `get_configuration`).
        """
        if self._configuration is configuration:
            # Same instance, nothing to do.
            return

        if self._configuration:
            raise ServiceConflictError(Configuration, configuration, self._configuration)

        self._configuration = configuration

    def get_event_manager(self) -> EventManager:
        """Get the event manager, creating a default `LocalEventManager` if none has been set yet."""
        if self._event_manager is None:
            logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
            if self._configuration is None:
                logger.warning(
                    'Implicit creation of event manager will implicitly set configuration as side effect. '
                    'It is advised to explicitly first set the configuration instead.'
                )
            # Call the alternative constructor on the class itself. Going through `LocalEventManager()` first
            # built a default event manager only to discard it right away.
            # NOTE(review): assumes `from_config` is a classmethod - confirm against `LocalEventManager`.
            self._event_manager = LocalEventManager.from_config(config=self._configuration)

        return self._event_manager

    def set_event_manager(self, event_manager: EventManager) -> None:
        """Set the event manager.

        Setting the very same instance again is a no-op.

        Args:
            event_manager: The event manager to set.

        Raises:
            ServiceConflictError: If a different event manager is already in place (set explicitly or created
                implicitly by `get_event_manager`).
        """
        if self._event_manager is event_manager:
            # Same instance, nothing to do.
            return

        if self._event_manager:
            raise ServiceConflictError(EventManager, event_manager, self._event_manager)

        self._event_manager = event_manager

    def get_storage_client(self) -> StorageClient:
        """Get the storage client, creating a default `FileSystemStorageClient` if none has been set yet."""
        if self._storage_client is None:
            logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
            if self._configuration is None:
                logger.warning(
                    'Implicit creation of storage client will implicitly set configuration as side effect. '
                    'It is advised to explicitly first set the configuration instead.'
                )
            self._storage_client = FileSystemStorageClient()

        return self._storage_client

    def set_storage_client(self, storage_client: StorageClient) -> None:
        """Set the storage client.

        Setting the very same instance again is a no-op.

        Args:
            storage_client: The storage client to set.

        Raises:
            ServiceConflictError: If a different storage client is already in place (set explicitly or created
                implicitly by `get_storage_client`).
        """
        if self._storage_client is storage_client:
            # Same instance, nothing to do.
            return

        if self._storage_client:
            raise ServiceConflictError(StorageClient, storage_client, self._storage_client)

        self._storage_client = storage_client

    @property
    def storage_instance_manager(self) -> StorageInstanceManager:
        """Get the storage instance manager. It is a global manager shared by all instances of ServiceLocator."""
        if ServiceLocator.global_storage_instance_manager is None:
            # Import here to avoid circular imports.
            from crawlee.storages._storage_instance_manager import StorageInstanceManager  # noqa: PLC0415

            ServiceLocator.global_storage_instance_manager = StorageInstanceManager()

        return ServiceLocator.global_storage_instance_manager
from pydantic import JsonValue as JsonSerializable T = TypeVar('T') HttpMethod = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH'] HttpPayload = bytes RequestTransformAction = Literal['skip', 'unchanged'] EnqueueStrategy = Literal['all', 'same-domain', 'same-hostname', 'same-origin'] """Enqueue strategy to be used for determining which links to extract and enqueue.""" SkippedReason = Literal['robots_txt'] LogLevel = Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]: """Convert all header keys to lowercase, strips whitespace, and returns them sorted by key.""" normalized_headers = {k.lower().strip(): v.strip() for k, v in headers.items()} sorted_headers = sorted(normalized_headers.items()) return dict(sorted_headers) @docs_group('Other') class HttpHeaders(RootModel, Mapping[str, str]): """A dictionary-like object representing HTTP headers.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: root: dict[str, str] = {} else: root: Annotated[ dict[str, str], PlainValidator(_normalize_headers), Field(default_factory=dict), ] def __getitem__(self, key: str) -> str: return self.root[key.lower()] def __setitem__(self, key: str, value: str) -> None: raise TypeError(f'{self.__class__.__name__} is immutable') def __delitem__(self, key: str) -> None: raise TypeError(f'{self.__class__.__name__} is immutable') def __or__(self, other: HttpHeaders) -> HttpHeaders: """Return a new instance of `HttpHeaders` combining this one with another one.""" combined_headers = {**self.root, **other} return HttpHeaders(combined_headers) def __ror__(self, other: HttpHeaders) -> HttpHeaders: """Support reversed | operation (other | self).""" combined_headers = {**other, **self.root} return HttpHeaders(combined_headers) def __iter__(self) -> Iterator[str]: # ty: 
@docs_group('Configuration')
class ConcurrencySettings:
    """Concurrency settings for AutoscaledPool."""

    def __init__(
        self,
        min_concurrency: int = 1,
        max_concurrency: int = 100,
        max_tasks_per_minute: float = float('inf'),
        desired_concurrency: int = 10,
    ) -> None:
        """Initialize a new instance.

        Args:
            min_concurrency: The minimum number of tasks running in parallel. If you set this value too high
                with respect to the available system memory and CPU, your code might run extremely slow or crash.
            max_concurrency: The maximum number of tasks running in parallel.
            max_tasks_per_minute: The maximum number of tasks per minute the pool can run. By default, this is set
                to infinity, but you can pass any positive, non-zero number.
            desired_concurrency: The desired number of tasks that should be running parallel on the start of
                the pool, if there is a large enough supply of them. Defaults to 10. It must lie between
                `min_concurrency` and `max_concurrency` (inclusive), so it has to be passed explicitly whenever
                `min_concurrency` is above 10 or `max_concurrency` is below 10.

        Raises:
            ValueError: If `min_concurrency` is below 1, if `max_concurrency` is below `min_concurrency`,
                if `desired_concurrency` is outside of the `min_concurrency`..`max_concurrency` range,
                or if `max_tasks_per_minute` is not a positive number (NaN included).
        """
        if min_concurrency < 1:
            raise ValueError('min_concurrency must be 1 or larger')

        if max_concurrency < min_concurrency:
            raise ValueError('max_concurrency cannot be less than min_concurrency')

        if desired_concurrency < min_concurrency:
            raise ValueError('desired_concurrency cannot be less than min_concurrency')

        if desired_concurrency > max_concurrency:
            raise ValueError('desired_concurrency cannot be greater than max_concurrency')

        # Written as `not (x > 0)` rather than `x <= 0`: every comparison with NaN is False, so the latter
        # would silently accept NaN as a rate limit.
        if not max_tasks_per_minute > 0:
            raise ValueError('max_tasks_per_minute must be positive')

        self.min_concurrency = min_concurrency
        self.max_concurrency = max_concurrency
        self.desired_concurrency = desired_concurrency
        self.max_tasks_per_minute = max_tasks_per_minute
""" include: NotRequired[Sequence[re.Pattern | Glob]] """List of regular expressions or globs that URLs must match to be enqueued.""" exclude: NotRequired[Sequence[re.Pattern | Glob]] """List of regular expressions or globs that URLs must not match to be enqueued.""" class AddRequestsKwargs(EnqueueLinksKwargs): """Keyword arguments for the `add_requests` methods.""" requests: Sequence[str | Request] """Requests to be added to the `RequestManager`.""" rq_id: str | None """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.""" rq_name: str | None """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. """ rq_alias: str | None """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. """ class PushDataKwargs(TypedDict): """Keyword arguments for dataset's `push_data` method.""" class PushDataFunctionCall(PushDataKwargs): data: list[dict[str, Any]] | dict[str, Any] dataset_id: str | None dataset_name: str | None dataset_alias: str | None class KeyValueStoreInterface(Protocol): """The (limited) part of the `KeyValueStore` interface that should be accessible from a request handler.""" @overload async def get_value(self, key: str) -> Any: ... @overload async def get_value(self, key: str, default_value: T) -> T: ... @overload async def get_value(self, key: str, default_value: T | None = None) -> T | None: ... async def get_value(self, key: str, default_value: T | None = None) -> T | None: ... async def set_value( self, key: str, value: Any, content_type: str | None = None, ) -> None: ... 
class RequestHandlerRunResult:
    """Collects what a request handler asked the storage-related context helpers to do.

    Calls are only recorded here (added requests, pushed data, key-value store writes); nothing in this class
    applies them to the real storages.
    """

    def __init__(
        self,
        *,
        key_value_store_getter: GetKeyValueStoreFunction,
        request: Request,
    ) -> None:
        # The handler works on its own deep copy, so the caller's request object is not touched
        # until `apply_request_changes` is called.
        self._request = deepcopy(request)
        self._key_value_store_getter = key_value_store_getter

        self.add_requests_calls: list[AddRequestsKwargs] = []
        self.push_data_calls: list[PushDataFunctionCall] = []
        # Keyed by the `(id, name, alias)` triple the store was requested with.
        self.key_value_store_changes: dict[
            tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords
        ] = {}

    @property
    def request(self) -> Request:
        """The isolated copy of the request that the handler operates on."""
        return self._request

    async def add_requests(
        self,
        requests: Sequence[str | Request],
        rq_id: str | None = None,
        rq_name: str | None = None,
        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> None:
        """Track a call to the `add_requests` context helper.

        Raises:
            ValueError: If more than one of `rq_id`, `rq_name` and `rq_alias` is given.
        """
        queue_selectors = [value for value in (rq_id, rq_name, rq_alias) if value is not None]
        if len(queue_selectors) > 1:
            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')

        recorded_call = AddRequestsKwargs(
            requests=requests,
            rq_id=rq_id,
            rq_name=rq_name,
            rq_alias=rq_alias,
            **kwargs,
        )
        self.add_requests_calls.append(recorded_call)

    async def push_data(
        self,
        data: list[dict[str, Any]] | dict[str, Any],
        dataset_id: str | None = None,
        dataset_name: str | None = None,
        dataset_alias: str | None = None,
        **kwargs: Unpack[PushDataKwargs],
    ) -> None:
        """Track a call to the `push_data` context helper."""
        recorded_call = PushDataFunctionCall(
            data=data,
            dataset_id=dataset_id,
            dataset_name=dataset_name,
            dataset_alias=dataset_alias,
            **kwargs,
        )
        self.push_data_calls.append(recorded_call)

    async def get_key_value_store(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
    ) -> KeyValueStoreInterface:
        """Return the change recorder for the requested store, creating it on the first request."""
        store_key = (id, name, alias)

        if store_key in self.key_value_store_changes:
            return self.key_value_store_changes[store_key]

        actual_store = await self._key_value_store_getter(id=id, name=name, alias=alias)
        self.key_value_store_changes[store_key] = KeyValueStoreChangeRecords(actual_store)
        return self.key_value_store_changes[store_key]

    def apply_request_changes(self, target: Request) -> None:
        """Apply tracked changes from handler copy to original request.

        Only `user_data` and `headers` are carried over, and each only when it differs from the target's value.
        """
        handled = self.request

        if handled.user_data != target.user_data:
            target.user_data = handled.user_data

        if handled.headers != target.headers:
            target.headers = handled.headers
Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. **kwargs: Additional keyword arguments. """ @docs_group('Functions') class EnqueueLinksFunction(Protocol): """A function for enqueueing new URLs to crawl based on elements selected by a given selector or explicit requests. It adds explicitly passed `requests` to the `RequestManager` or it extracts URLs from the current page and enqueues them for further crawling. It allows filtering through selectors and other options. You can also specify labels and user data to be associated with the newly created `Request` objects. It should not be called with `selector`, `label`, `user_data` or `transform_request_function` arguments together with `requests` argument. For even more control over the enqueued links you can use combination of `ExtractLinksFunction` and `AddRequestsFunction`. """ @overload def __call__( self, *, selector: str | None = None, attribute: str | None = None, label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: ... @overload def __call__( self, *, requests: Sequence[str | Request] | None = None, rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: ... 
def __call__( self, *, selector: str | None = None, attribute: str | None = None, label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, requests: Sequence[str | Request] | None = None, rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: """Call enqueue links function. Args: selector: A selector used to find the elements containing the links. The behaviour differs based on the crawler used: - `PlaywrightCrawler` supports CSS and XPath selectors. - `ParselCrawler` supports CSS selectors. - `BeautifulSoupCrawler` supports CSS selectors. attribute: Which node attribute to extract the links from. label: Label for the newly created `Request` objects, used for request routing. user_data: User data to be provided to the newly created `Request` objects. transform_request_function: A function that takes `RequestOptions` and returns either: - Modified `RequestOptions` to update the request configuration, - `'skip'` to exclude the request from being enqueued, - `'unchanged'` to use the original request options without modification. requests: Requests to be added to the `RequestManager`. rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. **kwargs: Additional keyword arguments. """ @docs_group('Functions') class ExtractLinksFunction(Protocol): """A function for extracting URLs to crawl based on elements selected by a given selector. It extracts URLs from the current page and allows filtering through selectors and other options. 
@docs_group('Functions')
class GetKeyValueStoreFunction(Protocol):
    """A function for accessing a `KeyValueStore`.

    It retrieves an instance of a `KeyValueStore` based on its ID, name or alias.
    """

    def __call__(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
    ) -> Coroutine[None, None, KeyValueStore]:
        """Call dunder method.

        All arguments are keyword-only and default to `None`.

        Args:
            id: The ID of the `KeyValueStore` to get.
            name: The name of the `KeyValueStore` to get (global scope, named storage).
            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).

        Returns:
            A coroutine that resolves to the requested `KeyValueStore`.
        """
It retrieves an instance of a `KeyValueStore` based on its ID or name. """ def __call__( self, *, id: str | None = None, name: str | None = None, alias: str | None = None, ) -> Coroutine[None, None, KeyValueStoreInterface]: """Call dunder method. Args: id: The ID of the `KeyValueStore` to get. name: The name of the `KeyValueStore` to get (global scope, named storage). alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage). """ @docs_group('Functions') class PushDataFunction(Protocol): """A function for pushing data to a `Dataset`. It simplifies the process of adding data to a `Dataset`. It opens the specified one and pushes the provided data to it. """ def __call__( self, data: list[dict[str, Any]] | dict[str, Any], dataset_id: str | None = None, dataset_name: str | None = None, dataset_alias: str | None = None, **kwargs: Unpack[PushDataKwargs], ) -> Coroutine[None, None, None]: """Call dunder method. Args: data: The data to push to the `Dataset`. dataset_id: The ID of the `Dataset` to push the data to. dataset_name: The name of the `Dataset` to push the data to (global scope, named storage). dataset_alias: The alias of the `Dataset` to push the data to (run scope, unnamed storage). **kwargs: Additional keyword arguments. """ @docs_group('Functions') class SendRequestFunction(Protocol): """A function for sending HTTP requests. It simplifies the process of sending HTTP requests. It is implemented by the crawling context and is used within request handlers to send additional HTTP requests to target URLs. """ def __call__( self, url: str, *, method: HttpMethod = 'GET', payload: HttpPayload | None = None, headers: HttpHeaders | dict[str, str] | None = None, ) -> Coroutine[None, None, HttpResponse]: """Call send request function. Args: url: The URL to send the request to. method: The HTTP method to use. headers: The headers to include in the request. payload: The payload to include in the request. Returns: The HTTP response received from the server. 
""" @docs_group('Other') @dataclasses.dataclass class PageSnapshot: """Snapshot of a crawled page.""" screenshot: bytes | None = None """Screenshot of the page format.""" html: str | None = None """HTML content of the page.""" def __bool__(self) -> bool: return bool(self.screenshot or self.html) @docs_group('Functions') class UseStateFunction(Protocol): """A function for managing state within the crawling context. It allows the use of persistent state across multiple crawls. Warning: This is an experimental feature. The behavior and interface may change in future versions. """ def __call__( self, default_value: dict[str, JsonSerializable] | None = None, ) -> Coroutine[None, None, dict[str, JsonSerializable]]: """Call dunder method. Args: default_value: The default value to initialize the state if it is not already set. Returns: The current state. """ @dataclass(frozen=True) @docs_group('Crawling contexts') class BasicCrawlingContext: """Basic crawling context. It represents the fundamental crawling context used by the `BasicCrawler`. It is extended by more specific crawlers to provide additional functionality. 
""" request: Request """Request object for the current page being processed.""" session: Session | None """Session object for the current page being processed.""" proxy_info: ProxyInfo | None """Proxy information for the current page being processed.""" send_request: SendRequestFunction """Send request crawling context helper function.""" add_requests: AddRequestsFunction """Add requests crawling context helper function.""" push_data: PushDataFunction """Push data crawling context helper function.""" use_state: UseStateFunction """Use state crawling context helper function.""" get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction """Get key-value store crawling context helper function.""" log: logging.Logger """Logger instance.""" async def get_snapshot(self) -> PageSnapshot: """Get snapshot of crawled page.""" return PageSnapshot() def __hash__(self) -> int: """Return hash of the context. Each context is considered unique.""" return id(self) def create_modified_copy( self, push_data: PushDataFunction | None = None, add_requests: AddRequestsFunction | None = None, get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None, ) -> Self: """Create a modified copy of the crawling context with specified changes.""" modifications = dict[str, Any]() if push_data is not None: modifications['push_data'] = push_data if add_requests is not None: modifications['add_requests'] = add_requests if get_key_value_store is not None: modifications['get_key_value_store'] = get_key_value_store return dataclasses.replace(self, **modifications) class GetDataKwargs(TypedDict): """Keyword arguments for dataset's `get_data` method.""" offset: NotRequired[int] """Skips the specified number of items at the start.""" limit: NotRequired[int | None] """The maximum number of items to retrieve. Unlimited if None.""" clean: NotRequired[bool] """Return only non-empty items and excludes hidden fields. 
Shortcut for `skip_hidden` and `skip_empty`.""" desc: NotRequired[bool] """Set to True to sort results in descending order.""" fields: NotRequired[list[str]] """Fields to include in each item. Sorts fields as specified if provided.""" omit: NotRequired[list[str]] """Fields to exclude from each item.""" unwind: NotRequired[list[str]] """Unwinds items by a specified array field, turning each element into a separate item.""" skip_empty: NotRequired[bool] """Excludes empty items from the results if True.""" skip_hidden: NotRequired[bool] """Excludes fields starting with '#' if True.""" flatten: NotRequired[list[str]] """Fields to be flattened in returned items.""" view: NotRequired[str] """Specifies the dataset view to be used.""" class ExportToKwargs(TypedDict): """Keyword arguments for dataset's `export_to` method.""" key: Required[str] """The key under which to save the data.""" content_type: NotRequired[Literal['json', 'csv']] """The format in which to export the data. Either 'json' or 'csv'.""" to_kvs_id: NotRequired[str] """ID of the key-value store to save the exported file.""" to_kvs_name: NotRequired[str] """Name of the key-value store to save the exported file.""" to_kvs_storage_client: NotRequired[StorageClient] """The storage client to use for saving the exported file.""" to_kvs_configuration: NotRequired[Configuration] """The configuration to use for saving the exported file.""" class ExportDataJsonKwargs(TypedDict): """Keyword arguments for dataset's `export_data_json` method.""" skipkeys: NotRequired[bool] """If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped instead of raising a `TypeError`.""" ensure_ascii: NotRequired[bool] """Determines if non-ASCII characters should be escaped in the output JSON string.""" check_circular: NotRequired[bool] """If False (default: True), skips the circular reference check for container types. 
A circular reference will result in a `RecursionError` or worse if unchecked.""" allow_nan: NotRequired[bool] """If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply with the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity).""" cls: NotRequired[type[json.JSONEncoder]] """Allows specifying a custom JSON encoder.""" indent: NotRequired[int] """Specifies the number of spaces for indentation in the pretty-printed JSON output.""" separators: NotRequired[tuple[str, str]] """A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ') otherwise.""" default: NotRequired[Callable] """A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version of the object or raise a `TypeError`.""" sort_keys: NotRequired[bool] """Specifies whether the output JSON object should have keys sorted alphabetically.""" class ExportDataCsvKwargs(TypedDict): """Keyword arguments for dataset's `export_data_csv` method.""" dialect: NotRequired[str] """Specifies a dialect to be used in CSV parsing and writing.""" delimiter: NotRequired[str] """A one-character string used to separate fields. Defaults to ','.""" doublequote: NotRequired[bool] """Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled; when False, the `escapechar` is used as a prefix. Defaults to True.""" escapechar: NotRequired[str] """A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar` if `doublequote` is False. Defaults to None, disabling escaping.""" lineterminator: NotRequired[str] """The string used to terminate lines produced by the writer. 
Defaults to '\\r\\n'.""" quotechar: NotRequired[str] """A one-character string used to quote fields containing special characters, like the delimiter or quotechar, or fields containing new-line characters. Defaults to '\"'.""" quoting: NotRequired[int] """Controls when quotes should be generated by the writer and recognized by the reader. Can take any of the `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`.""" skipinitialspace: NotRequired[bool] """When True, spaces immediately following the delimiter are ignored. Defaults to False.""" strict: NotRequired[bool] """When True, raises an exception on bad CSV input. Defaults to False.""" ================================================ FILE: src/crawlee/_utils/__init__.py ================================================ ================================================ FILE: src/crawlee/_utils/blocked.py ================================================ from __future__ import annotations # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/utils/src/internals/blocked.ts CLOUDFLARE_RETRY_CSS_SELECTORS = [ '#turnstile-wrapper iframe[src^="https://challenges.cloudflare.com"]', ] RETRY_CSS_SELECTORS = [ *CLOUDFLARE_RETRY_CSS_SELECTORS, 'div#infoDiv0 a[href*="//www.google.com/policies/terms/"]', 'iframe[src*="_Incapsula_Resource"]', ] """ CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked. """ ROTATE_PROXY_ERRORS = [ 'ECONNRESET', 'ECONNREFUSED', 'ERR_PROXY_CONNECTION_FAILED', 'ERR_TUNNEL_CONNECTION_FAILED', 'Proxy responded with', 'unsuccessful tunnel', 'TunnelUnsuccessful', ] """ Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning. 
""" ================================================ FILE: src/crawlee/_utils/byte_size.py ================================================ from __future__ import annotations from dataclasses import dataclass from typing import Any _BYTES_PER_KB = 1024 _BYTES_PER_MB = _BYTES_PER_KB**2 _BYTES_PER_GB = _BYTES_PER_KB**3 _BYTES_PER_TB = _BYTES_PER_KB**4 @dataclass(frozen=True) class ByteSize: """Represents a byte size.""" bytes: int def __post_init__(self) -> None: if self.bytes < 0: raise ValueError('ByteSize cannot be negative') @classmethod def validate(cls, value: Any) -> ByteSize: if isinstance(value, ByteSize): return value if not isinstance(value, (float, int)): raise TypeError('Value must be numeric') return cls(int(value)) @classmethod def from_kb(cls, kb: float) -> ByteSize: return cls(int(kb * _BYTES_PER_KB)) @classmethod def from_mb(cls, mb: float) -> ByteSize: return cls(int(mb * _BYTES_PER_MB)) @classmethod def from_gb(cls, gb: float) -> ByteSize: return cls(int(gb * _BYTES_PER_GB)) @classmethod def from_tb(cls, tb: float) -> ByteSize: return cls(int(tb * _BYTES_PER_TB)) def to_kb(self) -> float: return self.bytes / _BYTES_PER_KB def to_mb(self) -> float: return self.bytes / _BYTES_PER_MB def to_gb(self) -> float: return self.bytes / _BYTES_PER_GB def to_tb(self) -> float: return self.bytes / _BYTES_PER_TB def __str__(self) -> str: if self.bytes >= _BYTES_PER_TB: return f'{self.to_tb():.2f} TB' if self.bytes >= _BYTES_PER_GB: return f'{self.to_gb():.2f} GB' if self.bytes >= _BYTES_PER_MB: return f'{self.to_mb():.2f} MB' if self.bytes >= _BYTES_PER_KB: return f'{self.to_kb():.2f} KB' return f'{self.bytes} B' def __eq__(self, other: object) -> bool: if isinstance(other, ByteSize): return self.bytes == other.bytes return NotImplemented def __hash__(self) -> int: """Return hash based on the bytes value.""" return hash(self.bytes) def __lt__(self, other: object) -> bool: if isinstance(other, ByteSize): return self.bytes < other.bytes return NotImplemented def 
BORDER = {'TL': '┌', 'TR': '┐', 'BL': '└', 'BR': '┘', 'H': '─', 'V': '│', 'TM': '┬', 'BM': '┴'}


def make_table(rows: Sequence[Sequence[str]], width: int = 100) -> str:
    """Create a text table using Unicode box-drawing characters.

    Rows shorter than the widest row are padded with empty cells. Every cell is converted with `str()`, so
    non-string values (numbers, `None`, ...) are rendered instead of raising.

    Args:
        rows: A list of tuples/lists to be displayed in the table.
        width: Maximum width of the table. If the natural table is wider, all columns get the same width and
            overlong cells are truncated with a trailing `...`. Columns never shrink below 3 characters, so a very
            small `width` can still be exceeded.

    Returns:
        The rendered table, or an empty string if there are no rows or no columns.
    """
    if not rows:
        return ''

    num_cols = max(len(row) for row in rows)

    if num_cols == 0:
        return ''

    # Stringify every cell once and pad short rows, so width computation and rendering see the same text.
    normalized_rows = [[str(cell) for cell in row] + [''] * (num_cols - len(row)) for row in rows]
    col_widths = [max(len(row[i]) for row in normalized_rows) for i in range(num_cols)]

    # Every column costs its content plus ' ' padding on both sides and one vertical border; +1 closes the row.
    total_width = sum(col_widths) + (3 * num_cols) + 1

    # If the table size is larger than `width`, set all columns to the same length
    if total_width > width:
        col_widths = [max(3, (width - (3 * num_cols) - 1) // num_cols)] * num_cols

    # Build the top and bottom borders column by column.
    top_parts, bottom_parts = [BORDER['TL']], [BORDER['BL']]

    for i in range(num_cols):
        h_border = BORDER['H'] * (col_widths[i] + 2)
        top_parts.append(h_border)
        bottom_parts.append(h_border)

        if i < num_cols - 1:
            top_parts.append(BORDER['TM'])
            bottom_parts.append(BORDER['BM'])
        else:
            top_parts.append(BORDER['TR'])
            bottom_parts.append(BORDER['BR'])

    result = [''.join(top_parts)]

    for row in normalized_rows:
        cells = []

        for i, cell in enumerate(row):
            # Trim the content if the length exceeds the width of the column, otherwise pad it.
            if len(cell) > col_widths[i]:
                cells.append(f'{cell[: col_widths[i] - 3]}...')
            else:
                cells.append(cell.ljust(col_widths[i]))

        # row: │ cell1 │ cell2 │ ...
        result.append(BORDER['V'] + ''.join(f' {cell} {BORDER["V"]}' for cell in cells))

    result.append(''.join(bottom_parts))

    return '\n'.join(result)
def compute_short_hash(data: bytes, *, length: int = 8) -> str:
    """Hash the data with SHA-256 and return the leading part of the hexadecimal digest.

    Args:
        data: The binary data to be hashed.
        length: How many leading hexadecimal characters of the digest to keep.

    Returns:
        The first `length` characters of the hexadecimal SHA-256 digest of `data`.
    """
    full_hex_digest = sha256(data).hexdigest()
    return full_hex_digest[:length]
""" def wrapper(func: T) -> T: return func return wrapper ================================================ FILE: src/crawlee/_utils/file.py ================================================ from __future__ import annotations import asyncio import csv import json import os import sys import tempfile from pathlib import Path from typing import TYPE_CHECKING, overload if TYPE_CHECKING: from collections.abc import AsyncIterator from typing import Any, TextIO from typing_extensions import Unpack from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs if sys.platform == 'win32': def _write_file(path: Path, data: str | bytes) -> None: """Windows-specific file write implementation. This implementation writes directly to the file without using a temporary file, because they are problematic due to permissions issues on Windows. """ if isinstance(data, bytes): path.write_bytes(data) elif isinstance(data, str): path.write_text(data, encoding='utf-8') else: raise TypeError(f'Unsupported data type: {type(data)}. Expected str or bytes.') else: def _write_file(path: Path, data: str | bytes) -> None: """Linux/Unix-specific file write implementation using temporary files.""" dir_path = path.parent fd, tmp_path = tempfile.mkstemp( suffix=f'{path.suffix}.tmp', prefix=f'{path.name}.', dir=str(dir_path), ) if not isinstance(data, (str, bytes)): raise TypeError(f'Unsupported data type: {type(data)}. Expected str or bytes.') try: if isinstance(data, bytes): with os.fdopen(fd, 'wb') as tmp_file: tmp_file.write(data) else: with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file: tmp_file.write(data) # Atomically replace the destination file with the temporary file Path(tmp_path).replace(path) except Exception: Path(tmp_path).unlink(missing_ok=True) raise def infer_mime_type(value: Any) -> str: """Infer the MIME content type from the value. Args: value: The value to infer the content type from. Returns: The inferred MIME content type. 
async def json_dumps(obj: Any) -> str:
    """Serialize an object to pretty-printed JSON without blocking the event loop.

    The serialization runs in a worker thread. Non-ASCII characters are kept as-is, the output is indented by
    two spaces, and values that JSON cannot represent natively are converted with `str`.

    Args:
        obj: The object to serialize.

    Returns:
        A string containing the JSON representation of the input object.
    """

    def _serialize() -> str:
        return json.dumps(obj, ensure_ascii=False, indent=2, default=str)

    return await asyncio.to_thread(_serialize)
async def export_csv_to_stream(
    iterator: AsyncIterator[dict[str, Any]],
    dst: TextIO,
    **kwargs: Unpack[ExportDataCsvKwargs],
) -> None:
    """Write the items from an async iterator to a text stream as CSV.

    The keys of the first non-empty item become the header row; each non-empty item then contributes one row
    of its values. Empty items are skipped.

    Args:
        iterator: The async source of items to export.
        dst: The text stream receiving the CSV output.
        **kwargs: Options forwarded to `csv.writer`.
    """
    # Default the line terminator to '\n' unless the caller chose one. The csv.writer default is '\r\n', which
    # a text-mode file on Windows would translate to '\r\r\n'. With '\n' the platform does the conversion:
    # it stays '\n' on Unix and becomes '\r\n' on Windows.
    kwargs.setdefault('lineterminator', '\n')

    csv_writer = csv.writer(dst, **kwargs)
    header_written = False

    async for record in iterator:
        if record:
            # Emit the header exactly once, taken from the first non-empty record.
            if not header_written:
                csv_writer.writerow(record.keys())
                header_written = True

            csv_writer.writerow(record.values())
If `recursive` is true, the pattern segment '**' will match any number of path segments. If `include_hidden` is true, wildcards can match path segments beginning with a dot ('.'). If a sequence of separator characters is given to `seps`, they will be used to split the pattern into segments and match path separators. If not given, os.path.sep and os.path.altsep (where available) are used. HACK: This function is copied from CPython stdlib source. It will be released in Python 3.13 as `glob.translate` """ _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps escaped_seps = ''.join(map(re.escape, _seps)) any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps not_sep = f'[^{escaped_seps}]' if include_hidden: one_last_segment = f'{not_sep}+' one_segment = f'{one_last_segment}{any_sep}' any_segments = f'(?:.+{any_sep})?' any_last_segments = '.*' else: one_last_segment = f'[^{escaped_seps}.]{not_sep}*' one_segment = f'{one_last_segment}{any_sep}' any_segments = f'(?:{one_segment})*' any_last_segments = f'{any_segments}(?:{one_last_segment})?' 
def _fnmatch_translate(pat: str, star: str, question_mark: str) -> list[str]:
    """Translate a glob pattern into a list of regular expression fragments.

    Copy of fnmatch._translate from Python 3.13.

    Args:
        pat: The glob pattern to translate.
        star: The regex fragment emitted for a `*` wildcard.
        question_mark: The regex fragment emitted for a `?` wildcard.

    Returns:
        The regex fragments in pattern order; the caller joins them into the final expression.
    """
    res = list[str]()
    add = res.append
    # `i` is the read position in `pat`, `n` the pattern length.
    i, n = 0, len(pat)
    while i < n:
        c = pat[i]
        i = i + 1
        if c == '*':
            # compress consecutive `*` into one
            # (identity check: only the `star` object itself, just appended, counts as a previous `*`)
            if (not res) or res[-1] is not star:
                add(star)
        elif c == '?':
            add(question_mark)
        elif c == '[':
            # Look for the closing `]` of the bracket expression, starting right after the `[`.
            j = i
            # A leading `!` (negation) and a `]` directly after it are part of the set, not its end.
            if j < n and pat[j] == '!':
                j = j + 1
            if j < n and pat[j] == ']':
                j = j + 1
            while j < n and pat[j] != ']':
                j = j + 1
            if j >= n:
                # No closing bracket: treat the `[` as a literal character.
                add('\\[')
            else:
                # `stuff` is the raw content between the brackets.
                stuff = pat[i:j]
                if '-' not in stuff:
                    stuff = stuff.replace('\\', r'\\')
                else:
                    # Split the content at the hyphens that form ranges, so each chunk ends with a range start.
                    chunks = []
                    # Start searching after the first set character (and after a leading `!`).
                    k = i + 2 if pat[i] == '!' else i + 1
                    while True:
                        k = pat.find('-', k, j)
                        if k < 0:
                            break
                        chunks.append(pat[i:k])
                        i = k + 1
                        # Skip past the range end before looking for the next hyphen.
                        k = k + 3
                    chunk = pat[i:j]
                    if chunk:
                        chunks.append(chunk)
                    else:
                        # The content ended with a hyphen: keep it as a literal on the last chunk.
                        chunks[-1] += '-'
                    # Remove empty ranges -- invalid in RE.
                    for k in range(len(chunks) - 1, 0, -1):
                        if chunks[k - 1][-1] > chunks[k][0]:
                            chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
                            del chunks[k]
                    # Escape backslashes and hyphens for set difference (--).
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') for s in chunks)
                # Escape set operations (&&, ~~ and ||).
                stuff = re.sub(r'([&~|])', r'\\\1', stuff)
                # Continue reading after the closing `]`.
                i = j + 1
                if not stuff:
                    # Empty range: never match.
                    add('(?!)')
                elif stuff == '!':
                    # Negated empty range: match any character.
                    add('.')
                else:
                    # Glob negation `!` becomes regex negation `^`; a leading literal `^` or `[` is escaped.
                    if stuff[0] == '!':
                        stuff = '^' + stuff[1:]
                    elif stuff[0] in ('^', '['):
                        stuff = '\\' + stuff
                    add(f'[{stuff}]')
        else:
            # Any other character matches itself literally.
            add(re.escape(c))
    return res
def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
    """Raise `ValueError` if more than `max_kwargs` of the keyword arguments are not `None`.

    Only `None` counts as "not specified"; other falsy values such as `0`, `''` or `False` count as specified.

    Args:
        max_kwargs: The maximum number of keyword arguments that may be specified at the same time.
        **kwargs: The mutually limited arguments, passed by name so they can be listed in the error message.

    Raises:
        ValueError: If the number of non-`None` keyword arguments exceeds `max_kwargs`.
    """
    specified_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]

    if len(specified_names) <= max_kwargs:
        return

    all_names = ', '.join(f'"{kwarg_name}"' for kwarg_name in kwargs)
    # Keep the long-standing wording for the default limit; state the real limit otherwise.
    limit = 'Only one' if max_kwargs == 1 else f'At most {max_kwargs}'

    raise ValueError(
        f'{limit} of {all_names} can be specified, but following arguments were '
        f'specified: {", ".join(specified_names)}.'
    )
persist_state_key: The key under which the state is stored in the KeyValueStore persistence_enabled: Flag to enable or disable state persistence. Use 'explicit_only' if you want to be able to save the state manually, but without any automatic persistence. persist_state_kvs_name: The name of the KeyValueStore to use for persistence. If neither a name nor and id are supplied, the default store will be used. persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence. If neither a name nor and id are supplied, the default store will be used. persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If not provided, a system-wide KeyValueStore will be used, based on service locator configuration. logger: A logger instance for logging operations related to state persistence """ raise_if_too_many_kwargs( persist_state_kvs_name=persist_state_kvs_name, persist_state_kvs_id=persist_state_kvs_id, persist_state_kvs_factory=persist_state_kvs_factory, ) if not persist_state_kvs_factory: logger.debug( 'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore ' 'based on service_locator configuration, potentially calling service_locator.set_storage_client in the ' 'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid ' 'global side effects.' 
) self._default_state = default_state self._state_type: type[TStateModel] = self._default_state.__class__ self._state: TStateModel | None = None self._persistence_enabled = persistence_enabled self._persist_state_key = persist_state_key if persist_state_kvs_factory is None: async def kvs_factory() -> KeyValueStore: from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id) self._persist_state_kvs_factory = kvs_factory else: self._persist_state_kvs_factory = persist_state_kvs_factory self._key_value_store: KeyValueStore | None = None self._log = logger async def initialize(self) -> TStateModel: """Initialize the recoverable state. This method must be called before using the recoverable state. It loads the saved state if persistence is enabled and registers the object to listen for PERSIST_STATE events. Returns: The loaded state model """ if self._persistence_enabled is False: self._state = self._default_state.model_copy(deep=True) return self.current_value # Import here to avoid circular imports. self._key_value_store = await self._persist_state_kvs_factory() await self._load_saved_state() if self._persistence_enabled is True: # Import here to avoid circular imports. from crawlee import service_locator # noqa: PLC0415 event_manager = service_locator.get_event_manager() event_manager.on(event=Event.PERSIST_STATE, listener=self.persist_state) return self.current_value async def teardown(self) -> None: """Clean up resources used by the recoverable state. If persistence is enabled, this method deregisters the object from PERSIST_STATE events and persists the current state one last time. """ if not self._persistence_enabled: return if self._persistence_enabled is True: # Import here to avoid circular imports. 
from crawlee import service_locator # noqa: PLC0415 event_manager = service_locator.get_event_manager() event_manager.off(event=Event.PERSIST_STATE, listener=self.persist_state) await self.persist_state() @property def current_value(self) -> TStateModel: """Get the current state.""" if self._state is None: raise RuntimeError('Recoverable state has not yet been loaded') return self._state @property def is_initialized(self) -> bool: """Check if the state has already been initialized.""" return self._state is not None async def has_persisted_state(self) -> bool: """Check if there is any persisted state in the key-value store.""" if not self._persistence_enabled: return False if self._key_value_store is None: raise RuntimeError('Recoverable state has not yet been initialized') return await self._key_value_store.record_exists(self._persist_state_key) async def reset(self) -> None: """Reset the state to the default values and clear any persisted state. Resets the current state to the default state and, if persistence is enabled, clears the persisted state from the KeyValueStore. """ self._state = self._default_state.model_copy(deep=True) if self._persistence_enabled: if self._key_value_store is None: raise RuntimeError('Recoverable state has not yet been initialized') await self._key_value_store.set_value(self._persist_state_key, None) async def persist_state(self, event_data: EventPersistStateData | None = None) -> None: """Persist the current state to the KeyValueStore. This method is typically called in response to a PERSIST_STATE event, but can also be called directly when needed. Args: event_data: Optional data associated with a PERSIST_STATE event """ self._log.debug( f'Persisting RecoverableState (model={self._default_state.__class__.__name__}, event_data={event_data}).' 
logger = getLogger(__name__)


class RecurringTask:
    """Class for creating and managing recurring tasks.

    The task can be controlled manually with `start` and `stop`, or used as an async context manager, which
    starts it on entry and stops it on exit.

    Attributes:
        func: The function to be executed repeatedly. It may be synchronous or asynchronous.
        delay: The time delay (a `timedelta`) between function calls.
        task: The underlying task object, or `None` if the task has not been started yet.
    """

    def __init__(self, func: Callable, delay: timedelta) -> None:
        """Initialize the recurring task without starting it.

        Args:
            func: The callable to execute repeatedly.
            delay: How long to wait after each call before the next one.
        """
        logger.debug(
            'Calling RecurringTask.__init__(func={%s}, delay={%s})...',
            func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
            delay,
        )
        self.func = func
        self.delay = delay
        self.task: asyncio.Task | None = None

    async def __aenter__(self) -> Self:
        self.start()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        await self.stop()

    async def _wrapper(self) -> None:
        """Continuously execute the provided function with the specified delay.

        Run the function in a loop, waiting for the configured delay between executions. Supports both
        synchronous and asynchronous functions.
        """
        sleep_time_secs = self.delay.total_seconds()
        while True:
            # Decide based on what the call returned rather than on how `func` was declared. This also awaits
            # results of lambdas or callable objects that return a coroutine, which would otherwise be
            # silently dropped without ever running.
            result = self.func()
            if inspect.isawaitable(result):
                await result
            await asyncio.sleep(sleep_time_secs)

    def start(self) -> None:
        """Start the recurring task execution."""
        name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
        self.task = asyncio.create_task(
            self._wrapper(),
            name=f'Task-recurring-{name}',
        )

    async def stop(self) -> None:
        """Stop the recurring task execution."""
        if self.task:
            self.task.cancel()
            # Ensure the task has a chance to properly handle the cancellation and any potential exceptions.
            await asyncio.gather(self.task, return_exceptions=True)
def compute_unique_key(
    url: str,
    method: HttpMethod = 'GET',
    headers: HttpHeaders | None = None,
    payload: HttpPayload | None = None,
    session_id: str | None = None,
    *,
    keep_url_fragment: bool = False,
    use_extended_unique_key: bool = False,
) -> str:
    """Compute a unique key for caching & deduplication of requests.

    By default the key is just the normalized URL; if normalization fails, the URL is used as given. With
    `use_extended_unique_key` the key also carries the upper-cased method, a hash of the whitelisted headers,
    a hash of the payload and, when given, the lower-cased session ID, all joined by `|` in front of the URL.

    Args:
        url: The request URL.
        method: The HTTP method.
        headers: The HTTP headers.
        payload: The data to be sent as the request body.
        session_id: The ID of a specific `Session` to which the request will be strictly bound.
        keep_url_fragment: A flag indicating whether to keep the URL fragment.
        use_extended_unique_key: A flag indicating whether to include the method, hashed headers, hashed
            payload and session ID in the key.

    Returns:
        A string representing the unique key for the request.
    """
    # Fall back to the raw URL when it cannot be normalized, so key computation itself never fails on it.
    try:
        url_part = normalize_url(url, keep_url_fragment=keep_url_fragment)
    except Exception as exc:
        logger.warning(f'Failed to normalize URL: {exc}')
        url_part = url

    method_part = method.upper()

    if not use_extended_unique_key:
        # The basic key ignores method and payload, so point out that such requests get deduplicated.
        if method_part != 'GET' and payload:
            logger.info(
                f'{method_part} request with a payload detected. By default, requests to the same URL with '
                'different methods or payloads will be deduplicated. Use "use_extended_unique_key" to include payload '
                'and headers in the unique key and avoid deduplication in these cases.'
            )
        return url_part

    payload_part = _get_payload_hash(payload)
    headers_part = _get_headers_hash(headers)

    # Pipe-separated parts: METHOD|headers hash|payload hash[|session]|url
    key_parts = [method_part, headers_part, payload_part]

    session_part = session_id.lower() if session_id is not None else ''
    if session_part:
        key_parts.append(session_part)

    key_parts.append(url_part)
    return '|'.join(key_parts)
class RobotsTxtFile:
    """Parsed robots.txt rules together with the origin they were loaded for."""

    def __init__(
        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
    ) -> None:
        self._robots = robots
        # Only the origin is kept; `is_allowed` compares the origin of checked URLs against it.
        self._original_url = URL(url).origin()
        self._http_client = http_client
        self._proxy_info = proxy_info

    @classmethod
    async def from_content(cls, url: str, content: str) -> Self:
        """Create a `RobotsTxtFile` instance from the given content.

        Args:
            url: The URL associated with the robots.txt file.
            content: The raw string content of the robots.txt file to be parsed.
        """
        robots = Protego.parse(content)
        return cls(url, robots)

    @classmethod
    async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Determine the location of a robots.txt file for a URL and fetch it.

        Args:
            url: The URL whose domain will be used to find the corresponding robots.txt file.
            http_client: The `HttpClient` instance used to perform the network request for fetching
                the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        robots_url = URL(url).with_path('/robots.txt')
        return await cls.load(str(robots_url), http_client, proxy_info)

    @classmethod
    async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Load the robots.txt file for a given URL.

        Any failure while fetching or parsing is logged and results in rules that allow everything.

        Args:
            url: The direct URL of the robots.txt file to be loaded.
            http_client: The `HttpClient` instance used to perform the network request for fetching
                the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        try:
            response = await http_client.send_request(url, proxy_info=proxy_info)

            # A 4xx response is handled as a robots.txt file that allows everything.
            body = (
                b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
            )
            robots = Protego.parse(body.decode('utf-8'))

        except Exception as e:
            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')

            robots = Protego.parse('User-agent: *\nAllow: /')

        return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

    def is_allowed(self, url: str, user_agent: str = '*') -> bool:
        """Check if the given URL is allowed for the given user agent.

        URLs whose origin differs from the origin this file was created for are always reported as allowed.

        Args:
            url: The URL to check against the robots.txt rules.
            user_agent: The user-agent string to check permissions for. Defaults to '*' which matches any user-agent.
        """
        check_url = URL(url)
        if check_url.origin() != self._original_url:
            return True
        return bool(self._robots.can_fetch(str(check_url), user_agent))

    def get_sitemaps(self) -> list[str]:
        """Get the list of sitemaps urls from the robots.txt file."""
        return list(self._robots.sitemaps)

    def get_crawl_delay(self, user_agent: str = '*') -> int | None:
        """Get the crawl delay for the given user agent.

        Args:
            user_agent: The user-agent string to check the crawl delay for. Defaults to '*' which matches any
                user-agent.

        Returns:
            The crawl delay converted to `int`, or None when the parsed rules define no delay.
        """
        crawl_delay = self._robots.crawl_delay(user_agent)
        return int(crawl_delay) if crawl_delay is not None else None

    async def parse_sitemaps(self) -> Sitemap:
        """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.

        Raises:
            ValueError: If this instance was created without an HTTP client.
        """
        sitemaps = self.get_sitemaps()
        if not self._http_client:
            raise ValueError('HTTP client is required to parse sitemaps.')

        return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)

    async def parse_urls_from_sitemaps(self) -> list[str]:
        """Parse the sitemaps in the robots.txt file and return a list URLs."""
        sitemap = await self.parse_sitemaps()
        return sitemap.urls
@dataclass() class SitemapUrl: loc: str lastmod: datetime | None = None changefreq: str | None = None priority: float | None = None origin_sitemap_url: str | None = None @dataclass() class NestedSitemap: loc: str origin_sitemap_url: str | None = None class ParseSitemapOptions(TypedDict, total=False): emit_nested_sitemaps: bool max_depth: int sitemap_retries: int timeout: timedelta | None class SitemapSource(TypedDict): type: Literal['url', 'raw'] url: NotRequired[str] content: NotRequired[str] depth: NotRequired[int] class _SitemapItem(TypedDict, total=False): type: Literal['url', 'sitemap_url'] loc: str url: str lastmod: datetime | None changefreq: str | None priority: float | None class _XMLSaxSitemapHandler(ContentHandler): def __init__(self) -> None: super().__init__() self._root_tag_name: str | None = None self._current_tag: str | None = None self._current_url: _SitemapItem = {} self._buffer: str = '' self._items: list[_SitemapItem] = [] @property def items(self) -> list[_SitemapItem]: return self._items @override def startElement(self, name: str, attrs: AttributesImpl) -> None: if self._root_tag_name is None and name in ('urlset', 'sitemapindex'): self._root_tag_name = name if name in ('loc', 'lastmod', 'changefreq', 'priority'): self._current_tag = name self._buffer = '' def characters(self, content: str) -> None: if self._current_tag: self._buffer += content @override def endElement(self, name: str) -> None: if name == self._current_tag: text = self._buffer.strip() if name == 'loc': if self._root_tag_name == 'sitemapindex': self._items.append({'type': 'sitemap_url', 'url': text}) else: self._current_url['loc'] = text elif name == 'lastmod' and text: with suppress(ValueError): self._current_url['lastmod'] = datetime.fromisoformat(text.replace('Z', '+00:00')) elif name == 'priority' and text: with suppress(ValueError): self._current_url['priority'] = float(text) elif name == 'changefreq' and text in VALID_CHANGE_FREQS: self._current_url['changefreq'] = text 
self.current_tag = None if name == 'url' and 'loc' in self._current_url: self.items.append({'type': 'url', **self._current_url}) self._current_url = {} class _TxtSitemapParser: """Parser for plaintext sitemaps that processes data as a stream.""" def __init__(self) -> None: self._buffer = '' async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]: """Process a chunk of text data and yield items one by one.""" self._buffer += chunk # Process complete lines if '\n' in self._buffer: lines = self._buffer.split('\n') # Last element might be incomplete, save for next chunk self._buffer = lines.pop() for line in lines: url = line.strip() if url: yield {'type': 'url', 'loc': url} async def flush(self) -> AsyncGenerator[_SitemapItem, None]: """Process any remaining data in the buffer, yielding items one by one.""" if self._buffer: url = self._buffer.strip() if url: yield {'type': 'url', 'loc': url} self.buffer = '' def close(self) -> None: """Clean up resources.""" self._buffer = '' class _XmlSitemapParser: """Parser for XML sitemaps using SAX to process data as a stream.""" def __init__(self) -> None: self._parser = ExpatParser() self._handler = _XMLSaxSitemapHandler() self._parser.setContentHandler(self._handler) async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]: """Process a chunk of XML data and yield items one by one.""" try: self._parser.feed(chunk) # If we get here, the XML was valid and complete for item in self._handler.items: yield item self._handler.items.clear() except Exception as e: logger.warning(f'Failed to parse XML data chunk: {e}', exc_info=True) async def flush(self) -> AsyncGenerator[_SitemapItem, None]: """Process any remaining data in the buffer, yielding items one by one.""" try: self._parser.flush() for item in self._handler.items: yield item self._handler.items.clear() except Exception as e: logger.warning(f'Failed to parse remaining XML data: {e}') def close(self) -> None: """Clean up 
resources.""" with suppress(SAXParseException): self._parser.close() def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser: """Create appropriate parser based on content type and URL.""" if 'text/plain' in content_type.lower() or (url and URL(url).path.endswith('.txt')): return _TxtSitemapParser() # Default to XML parser for most cases return _XmlSitemapParser() def _get_origin_url(source: SitemapSource) -> str: """Determine the origin URL for a sitemap source.""" if source['type'] == 'url' and 'url' in source: return source['url'] if source['type'] == 'raw' and 'content' in source: # For raw content sources, create a consistent identifier return f'raw://{sha256(source["content"].encode()).hexdigest()}' return '' async def _process_sitemap_item( item: _SitemapItem, source: SitemapSource, depth: int, visited_sitemap_urls: set[str], sources: list[SitemapSource], *, emit_nested_sitemaps: bool, ) -> AsyncGenerator[SitemapUrl | NestedSitemap | None, None]: """Process a sitemap item and yield appropriate results.""" item_copy = item.copy() # Work with a copy to avoid modifying the original if 'type' not in item_copy: return item_type = item_copy.pop('type') # Handle sitemap URL references (nested sitemaps) if item_type == 'sitemap_url' and 'url' in item_copy: sitemap_url = item_copy['url'] if sitemap_url and sitemap_url not in visited_sitemap_urls: # Add to processing queue sources.append(SitemapSource(type='url', url=sitemap_url, depth=depth + 1)) # Output the nested sitemap reference if requested if emit_nested_sitemaps: yield NestedSitemap(loc=sitemap_url, origin_sitemap_url=None) # Handle individual URL entries elif item_type == 'url' and 'loc' in item_copy: # Determine the origin sitemap URL for tracking purposes origin_url = _get_origin_url(source) # Create and yield the sitemap URL object yield SitemapUrl( loc=item_copy['loc'], lastmod=item_copy.get('lastmod'), changefreq=item_copy.get('changefreq'), 
priority=item_copy.get('priority'), origin_sitemap_url=origin_url, ) async def _process_raw_source( source: SitemapSource, depth: int, visited_sitemap_urls: set[str], sources: list[SitemapSource], *, emit_nested_sitemaps: bool, ) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]: """Process a raw content sitemap source.""" if 'content' not in source: logger.warning(f'Raw source missing content: {source}') return content = source['content'] parser = _get_parser('text/xml') try: # Process the content async for item in parser.process_chunk(content): async for result in _process_sitemap_item( item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps ): if result: yield result # Process any remaining content async for item in parser.flush(): async for result in _process_sitemap_item( item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps ): if result: yield result except Exception as e: logger.warning(f'Failed to parse raw sitemap content: {e}') finally: parser.close() async def _fetch_and_process_sitemap( http_client: HttpClient, source: SitemapSource, depth: int, visited_sitemap_urls: set[str], sources: list[SitemapSource], retries_left: int, *, proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, emit_nested_sitemaps: bool, ) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]: """Fetch a sitemap from a URL and process its content.""" if 'url' not in source: return sitemap_url = source['url'] try: while retries_left > 0: retries_left -= 1 async with http_client.stream( sitemap_url, method='GET', headers=SITEMAP_HEADERS, proxy_info=proxy_info, timeout=timeout ) as response: # Determine content type and compression content_type = response.headers.get('content-type', '') decoder = getincrementaldecoder('utf-8')(errors='replace') # Create appropriate parser parser = _get_parser(content_type, sitemap_url) decompressor = None try: # Process chunks as they arrive first_chunk = 
True async for raw_chunk in response.read_stream(): # Check if the first chunk is a valid gzip header if first_chunk and raw_chunk.startswith(b'\x1f\x8b'): decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16) first_chunk = False chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk text_chunk = decoder.decode(chunk) async for item in parser.process_chunk(text_chunk): async for result in _process_sitemap_item( item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps, ): if result: yield result # Process any remaining content async for item in parser.flush(): async for result in _process_sitemap_item( item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps, ): if result: yield result finally: parser.close() break except Exception as e: if retries_left > 0: logger.warning(f'Error fetching sitemap {sitemap_url}: {e}. Retries left: {retries_left}') await asyncio.sleep(1) # Brief pause before retry class Sitemap: def __init__(self, urls: list[str]) -> None: self._urls = urls @property def urls(self) -> list[str]: return self._urls @classmethod async def try_common_names(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Sitemap: base_url = URL(url) sitemap_urls = [str(base_url.with_path(path)) for path in COMMON_SITEMAP_PATHS] return await cls.load(sitemap_urls, http_client, proxy_info) @classmethod async def load( cls, urls: str | list[str], http_client: HttpClient, proxy_info: ProxyInfo | None = None, parse_sitemap_options: ParseSitemapOptions | None = None, ) -> Sitemap: if isinstance(urls, str): urls = [urls] return await cls.parse( [SitemapSource(type='url', url=url) for url in urls], http_client, proxy_info, parse_sitemap_options ) @classmethod async def from_xml_string(cls, content: str) -> Sitemap: return await cls.parse([SitemapSource(type='raw', content=content)]) @classmethod async def parse( cls, sources: list[SitemapSource], 
async def parse_sitemap(
    initial_sources: list[SitemapSource],
    http_client: HttpClient | None = None,
    proxy_info: ProxyInfo | None = None,
    options: ParseSitemapOptions | None = None,
) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
    """Parse sitemap(s) and yield URLs found in them.

    This function coordinates the process of fetching and parsing sitemaps, handling both URL-based and raw
    content sources. It follows nested sitemaps up to the specified maximum depth.

    Args:
        initial_sources: Sources to start with; the list itself is not modified.
        http_client: Client used for URL-based sources.
        proxy_info: Optional proxy used when fetching URL-based sources.
        options: Optional settings. Every key may be omitted; missing keys use the defaults
            (`emit_nested_sitemaps=False`, unlimited `max_depth`, `sitemap_retries=3`, 30 second `timeout`).

    Raises:
        RuntimeError: If a URL-based source is encountered and no `http_client` was provided.
    """
    # `ParseSitemapOptions` allows partial dictionaries, so each key is read with its own default.
    opts = options or {}
    emit_nested_sitemaps = opts.get('emit_nested_sitemaps', False)
    max_depth = opts.get('max_depth', float('inf'))
    sitemap_retries = opts.get('sitemap_retries', 3)
    timeout = opts.get('timeout', timedelta(seconds=30))

    # Setup working state
    sources = list(initial_sources)
    visited_sitemap_urls: set[str] = set()

    # Process sources until the queue is empty
    while sources:
        source = sources.pop(0)
        depth = source.get('depth', 0)

        # Skip if we've reached max depth
        if depth > max_depth:
            logger.debug(f'Skipping sitemap {source.get("url", "")} - exceeded max depth {max_depth}')
            continue

        # Process based on source type
        if source['type'] == 'raw':
            async for result in _process_raw_source(
                source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
            ):
                yield result

        elif source['type'] == 'url' and 'url' in source:
            if http_client is None:
                raise RuntimeError('HttpClient must be provided for URL-based sitemap sources.')

            # The same sitemap may have been queued more than once before its first visit.
            if source['url'] in visited_sitemap_urls:
                continue

            # Add to visited set before processing to avoid duplicates
            visited_sitemap_urls.add(source['url'])

            async for result in _fetch_and_process_sitemap(
                http_client,
                source,
                depth,
                visited_sitemap_urls,
                sources,
                sitemap_retries,
                emit_nested_sitemaps=emit_nested_sitemaps,
                proxy_info=proxy_info,
                timeout=timeout,
            ):
                yield result
        else:
            logger.warning(f'Invalid source configuration: {source}')
async def discover_valid_sitemaps(
    urls: list[str],
    *,
    http_client: HttpClient,
    proxy_info: ProxyInfo | None = None,
    request_timeout: timedelta = timedelta(seconds=20),
    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',
) -> AsyncGenerator[str, None]:
    """Discover related sitemaps for the given URLs.

    The URLs are grouped by host, one discovery generator is created per host and their merged output is
    yielded with duplicates removed.

    Args:
        urls: List of URLs to discover sitemaps for.
        http_client: `HttpClient` to use for making requests.
        proxy_info: Proxy configuration to use for requests.
        request_timeout: Timeout for each request when checking for sitemaps.
        method_for_checking: HTTP method to use when checking for sitemap existence (HEAD or GET).
    """
    urls_by_host = defaultdict(list)

    for candidate in urls:
        try:
            host = URL(candidate).host
        except ValueError:
            logger.warning(f'Invalid URL {candidate} skipped')
        else:
            if host:
                urls_by_host[host].append(candidate)
            else:
                logger.warning(f'URL {candidate} without host skipped')

    per_host_streams = []
    for host, host_urls in urls_by_host.items():
        per_host_streams.append(
            _discover_for_hostname(
                host,
                host_urls,
                http_client=http_client,
                proxy_info=proxy_info,
                request_timeout=request_timeout,
                method_for_checking=method_for_checking,
            )
        )

    # Sitemap URLs that were already handed to the caller.
    already_emitted = set()

    async for found in _merge_async_generators(*per_host_streams):
        if found in already_emitted:
            continue
        already_emitted.add(found)
        yield found
""" def _get_used_memory(process: psutil.Process) -> int: return int(process.memory_full_info().pss) else: def _get_used_memory(process: psutil.Process) -> int: return int(process.memory_info().rss) class CpuInfo(BaseModel): """Information about the CPU usage.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) used_ratio: Annotated[float, Field(alias='usedRatio')] """The ratio of CPU currently in use, represented as a float between 0 and 1.""" # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: created_at: datetime = datetime.now(timezone.utc) """The time at which the measurement was taken.""" else: created_at: Annotated[ datetime, Field( alias='createdAt', default_factory=lambda: datetime.now(timezone.utc), ), ] """The time at which the measurement was taken.""" class MemoryUsageInfo(BaseModel): """Information about the memory usage.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) current_size: Annotated[ ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='currentSize'), ] """Memory usage of the current Python process and its children.""" # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: created_at: datetime = datetime.now(timezone.utc) """The time at which the measurement was taken.""" else: created_at: Annotated[ datetime, Field( alias='createdAt', default_factory=lambda: datetime.now(timezone.utc), ), ] """The time at which the measurement was taken.""" class MemoryInfo(MemoryUsageInfo): """Information about system memory.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) total_size: Annotated[ ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize') ] """Total memory available in the system.""" system_wide_used_size: Annotated[ ByteSize, PlainValidator(ByteSize.validate), 
def get_cpu_info() -> CpuInfo:
    """Measure the current CPU usage.

    The value comes from `psutil.cpu_percent()`, sampled over a 0.1 second interval, and is converted from
    a percentage to a ratio before it is stored in the returned `CpuInfo`.
    """
    logger.debug('Calling get_cpu_info()...')
    usage_percent = psutil.cpu_percent(interval=0.1)
    usage_ratio = usage_percent / 100
    return CpuInfo(used_ratio=usage_ratio)
class SharedTimeout:
    """Keeps track of a time budget shared by multiple independent async operations.

    Provides a reusable, non-reentrant context manager interface.
    """

    def __init__(self, timeout: timedelta) -> None:
        self._remaining_timeout = timeout
        self._active_timeout: Timeout | None = None
        self._activation_timestamp: float | None = None

    async def __aenter__(self) -> timedelta:
        """Start a timeout limited by the remaining budget and return that budget.

        Raises:
            RuntimeError: If the context is already active.
        """
        if self._active_timeout is not None or self._activation_timestamp is not None:
            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')

        self._activation_timestamp = time.monotonic()
        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
        await new_timeout.__aenter__()
        return self._remaining_timeout

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Leave the active timeout and subtract the elapsed time from the remaining budget.

        Raises:
            RuntimeError: If the context is not active.
        """
        if self._active_timeout is None or self._activation_timestamp is None:
            raise RuntimeError('Logic error')

        try:
            await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
        finally:
            # The bookkeeping must also run when leaving the inner timeout raises. Otherwise the
            # elapsed time is not charged to the budget and the context stays marked as active,
            # so it could never be entered again.
            elapsed = time.monotonic() - self._activation_timestamp
            self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)

            self._active_timeout = None
            self._activation_timestamp = None
@contextmanager
def try_import(module_name: str, *symbol_names: str) -> Iterator[None]:
    """Context manager to attempt importing symbols into a module.

    If an `ImportError` is raised during the import, the symbol will be replaced with a `FailedImport` object.

    Args:
        module_name: Name of the module, as registered in `sys.modules`, that receives the placeholders.
        symbol_names: Names of the symbols to replace when the import fails.
    """
    try:
        yield
    except ImportError as e:
        # An `ImportError` may carry no arguments at all, so do not rely on `args[0]` being present.
        message = str(e.args[0]) if e.args else repr(e)
        for symbol_name in symbol_names:
            setattr(sys.modules[module_name], symbol_name, FailedImport(message))


def install_import_hook(module_name: str) -> None:
    """Install an import hook for a specified module.

    The class of the module registered under `module_name` is switched to `ImportWrapper`.
    """
    sys.modules[module_name].__class__ = ImportWrapper


@dataclass
class FailedImport:
    """Represent a placeholder for a failed import."""

    message: str
    """The error message associated with the failed import."""


class ImportWrapper(ModuleType):
    """A wrapper class for modules to handle attribute access for failed imports."""

    def __getattribute__(self, name: str) -> Any:
        result = super().__getattribute__(name)

        # Accessing a placeholder reports the original import failure to the caller.
        if isinstance(result, FailedImport):
            raise ImportError(result.message)  # noqa: TRY004

        return result
async def wait_for(
    operation: Callable[[], Awaitable[T]],
    *,
    timeout: timedelta,
    timeout_message: str | None = None,
    max_retries: int = 1,
    logger: Logger,
) -> T:
    """Await the result of an async operation, limited by a timeout.

    Each attempt calls `operation` anew and waits for at most `timeout`. A timeout ends the waiting right away
    with a `TimeoutError` carrying `timeout_message`. Any other error triggers another attempt until
    `max_retries` attempts were made, at which point the last error is re-raised.

    Args:
        operation: A function that returns the future to wait for.
        timeout: How long should we wait before cancelling the future.
        timeout_message: Message to be included in the `TimeoutError` in case of timeout.
        max_retries: How many times should the operation be attempted.
        logger: Used to report information about retries as they happen.
    """
    attempts_made = 0

    while attempts_made < max_retries:
        attempts_made += 1

        try:
            return await asyncio.wait_for(operation(), timeout.total_seconds())
        except asyncio.TimeoutError as ex:
            # Timeouts are never retried.
            raise asyncio.TimeoutError(timeout_message) from ex
        except Exception as e:
            out_of_attempts = attempts_made == max_retries
            if out_of_attempts:
                raise
            logger.warning(f'{e!s}: retrying ({attempts_made}/{max_retries})')

    raise RuntimeError('Unreachable code')
def is_status_code_client_error(value: int) -> bool:
    """Tell whether `value` lies in the 4xx range, i.e. 400 (inclusive) up to 500 (exclusive)."""
    lower_bound = HTTPStatus.BAD_REQUEST
    upper_bound = HTTPStatus.INTERNAL_SERVER_ERROR
    return lower_bound <= value and value < upper_bound


def is_status_code_server_error(value: int) -> bool:
    """Tell whether `value` is 500 or higher.

    There is no upper bound, so values beyond the 5xx range are reported as server errors too.
    """
    threshold = HTTPStatus.INTERNAL_SERVER_ERROR
    return value >= threshold


def is_status_code_successful(value: int) -> bool:
    """Tell whether `value` lies in the 2xx or 3xx range, i.e. 200 (inclusive) up to 400 (exclusive)."""
    lower_bound = HTTPStatus.OK
    upper_bound = HTTPStatus.BAD_REQUEST
    return lower_bound <= value and value < upper_bound
@docs_group('Browser management')
class BrowserController(ABC):
    """An abstract base class for managing a browser instance and its pages.

    A controller wraps a single browser. It exposes the pages opened in it, a few counters and timestamps
    describing its usage, and the operations for opening a new page and closing the browser.
    """

    AUTOMATION_LIBRARY: str | None = None
    """The name of the automation library that the controller is using."""

    @property
    @abstractmethod
    def pages(self) -> list[Page]:
        """Return the list of opened pages."""

    @property
    @abstractmethod
    def total_opened_pages(self) -> int:
        """Return the total number of pages opened since the browser was launched."""

    @property
    @abstractmethod
    def pages_count(self) -> int:
        """Return the number of currently open pages."""

    @property
    @abstractmethod
    def last_page_opened_at(self) -> datetime:
        """Return the time when the last page was opened."""

    @property
    @abstractmethod
    def idle_time(self) -> timedelta:
        """Return the idle time of the browser controller."""

    @property
    @abstractmethod
    def has_free_capacity(self) -> bool:
        """Return whether the browser has free capacity to open a new page."""

    @property
    @abstractmethod
    def is_browser_connected(self) -> bool:
        """Return whether the browser is connected."""

    @property
    @abstractmethod
    def browser_type(self) -> BrowserType:
        """Return the type of the browser."""

    @abstractmethod
    async def new_page(
        self,
        browser_new_context_options: Mapping[str, Any] | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> Page:
        """Create a new page with the given context options.

        Args:
            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
            proxy_info: The proxy configuration to use for the new page.

        Returns:
            Page: The newly created page.

        Raises:
            ValueError: If the browser has reached the maximum number of open pages.
        """

    @abstractmethod
    async def close(self, *, force: bool = False) -> None:
        """Close the browser.

        Args:
            force: Whether to force close all open pages before closing the browser.

        Raises:
            ValueError: If there are still open pages when trying to close the browser.
        """
For more details, refer to the Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context. """ @property @abstractmethod def max_open_pages_per_browser(self) -> int: """Return the maximum number of pages that can be opened in a single browser.""" @abstractmethod async def __aenter__(self) -> BrowserPlugin: """Enter the context manager and initialize the browser plugin. Raises: RuntimeError: If the context manager is already active. """ @abstractmethod async def __aexit__( self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None, ) -> None: """Exit the context manager and close the browser plugin. Raises: RuntimeError: If the context manager is not active. """ @abstractmethod async def new_browser(self) -> BrowserController: """Create a new browser instance. Returns: A new browser instance wrapped in a controller. """ ================================================ FILE: src/crawlee/browsers/_browser_pool.py ================================================ # Inspiration: https://github.com/apify/crawlee/tree/v3.10.1/packages/browser-pool/ from __future__ import annotations import asyncio import itertools from collections import defaultdict from datetime import timedelta from logging import getLogger from typing import TYPE_CHECKING, Any from weakref import WeakValueDictionary from crawlee._utils.context import ensure_context from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.docs import docs_group from crawlee._utils.recurring_task import RecurringTask from crawlee.browsers._browser_controller import BrowserController from crawlee.browsers._playwright_browser_plugin import PlaywrightBrowserPlugin from crawlee.browsers._types import BrowserType, CrawleePage if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Mapping, Sequence from pathlib import Path from types import TracebackType from crawlee.browsers._browser_plugin import 
@docs_group('Browser management')
class BrowserPool:
    """Manage a pool of browsers and pages, handling their lifecycle and resource allocation.

    The `BrowserPool` is responsible for opening and closing browsers, managing pages within those browsers,
    and handling the overall lifecycle of these resources. It provides flexible configuration via
    constructor options, which include various hooks that allow for the insertion of custom behavior
    at different stages of the browser and page lifecycles.

    The browsers in the pool can be in one of three states: active, inactive, or closed.
    """

    _GENERATED_PAGE_ID_LENGTH = 8
    """The length of the newly generated page ID."""

    def __init__(
        self,
        plugins: Sequence[BrowserPlugin] | None = None,
        *,
        operation_timeout: timedelta = timedelta(seconds=15),
        browser_inactive_threshold: timedelta = timedelta(seconds=10),
        identify_inactive_browsers_interval: timedelta = timedelta(seconds=20),
        close_inactive_browsers_interval: timedelta = timedelta(seconds=30),
        retire_browser_after_page_count: int = 100,
    ) -> None:
        """Initialize a new instance.

        Args:
            plugins: Browser plugins serve as wrappers around various browser automation libraries,
                providing a consistent interface across different libraries. When omitted, a single
                `PlaywrightBrowserPlugin` with default settings is used.
            operation_timeout: Operations of the underlying automation libraries, such as launching a browser
                or opening a new page, can sometimes get stuck. To prevent `BrowserPool` from becoming unresponsive,
                we add a timeout to these operations.
            browser_inactive_threshold: The period of inactivity after which a browser is considered as inactive.
            identify_inactive_browsers_interval: The interval at which the pool checks the active browsers and moves
                those idle for at least `browser_inactive_threshold` to the inactive list.
            close_inactive_browsers_interval: The interval at which the pool checks for inactive browsers
                and closes them. The browser is considered as inactive if it has no active pages and has been idle
                for the specified period. The browser is considered as retired if it has no active pages and has total
                pages count greater than or equal to `retire_browser_after_page_count`.
            retire_browser_after_page_count: The maximum number of processed pages after which the browser
                is considered as retired.
        """
        self._plugins = plugins or [PlaywrightBrowserPlugin()]
        self._operation_timeout = operation_timeout
        self._browser_inactive_threshold = browser_inactive_threshold

        self._active_browsers = list[BrowserController]()
        """A list of browsers currently active and being used to open pages."""

        self._inactive_browsers = list[BrowserController]()
        """A list of browsers currently inactive and not being used to open new pages,
        but may still contain open pages."""

        self._identify_inactive_browsers_task = RecurringTask(
            self._identify_inactive_browsers,
            identify_inactive_browsers_interval,
        )

        self._close_inactive_browsers_task = RecurringTask(
            self._close_inactive_browsers,
            close_inactive_browsers_interval,
        )

        self._total_pages_count = 0
        self._retire_browser_after_page_count = retire_browser_after_page_count
        self._pages = WeakValueDictionary[str, CrawleePage]()  # Track the pages in the pool
        self._plugins_cycle = itertools.cycle(self._plugins)  # Cycle through the plugins

        self._pre_page_create_hooks: list[
            Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
        ] = []
        self._post_page_create_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
        self._pre_page_close_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
        self._post_page_close_hooks: list[Callable[[str, BrowserController], Awaitable[None]]] = []

        # Flag to indicate the context state.
        self._active = False

    @classmethod
    def with_default_plugin(
        cls,
        *,
        browser_type: BrowserType | None = None,
        user_data_dir: str | Path | None = None,
        browser_launch_options: Mapping[str, Any] | None = None,
        browser_new_context_options: Mapping[str, Any] | None = None,
        headless: bool | None = None,
        fingerprint_generator: FingerprintGenerator | None = None,
        use_incognito_pages: bool | None = False,
        **kwargs: Any,
    ) -> BrowserPool:
        """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.

        Args:
            browser_type: The type of browser to launch:
                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
                    the system.
            user_data_dir: Path to a user data directory, which stores browser session data like cookies
                and local storage.
            browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
                documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
            headless: Whether to run the browser in headless mode.
            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
                to generate browser fingerprints together with consistent headers.
            use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                own context that is destroyed once the page is closed or crashes.
            kwargs: Additional arguments for default constructor.

        Returns:
            A new pool that uses the configured plugin as its only plugin.
        """
        plugin_options: dict = defaultdict(dict)
        # The launch options are copied so that adding `headless` below does not mutate the caller's mapping.
        plugin_options['browser_launch_options'] = dict(browser_launch_options) if browser_launch_options else {}
        plugin_options['browser_new_context_options'] = browser_new_context_options or {}

        if headless is not None:
            plugin_options['browser_launch_options']['headless'] = headless

        if use_incognito_pages is not None:
            plugin_options['use_incognito_pages'] = use_incognito_pages

        if browser_type:
            plugin_options['browser_type'] = browser_type

        if user_data_dir:
            plugin_options['user_data_dir'] = user_data_dir

        plugin = PlaywrightBrowserPlugin(
            **plugin_options,
            fingerprint_generator=fingerprint_generator,
        )
        return cls(plugins=[plugin], **kwargs)

    @property
    def plugins(self) -> Sequence[BrowserPlugin]:
        """Return the browser plugins."""
        return self._plugins

    @property
    def active_browsers(self) -> Sequence[BrowserController]:
        """Return the active browsers in the pool."""
        return self._active_browsers

    @property
    def inactive_browsers(self) -> Sequence[BrowserController]:
        """Return the inactive browsers in the pool."""
        return self._inactive_browsers

    @property
    def pages(self) -> Mapping[str, CrawleePage]:
        """Return the pages in the pool."""
        return self._pages

    @property
    def total_pages_count(self) -> int:
        """Return the total number of pages opened since the browser pool was launched."""
        return self._total_pages_count

    @property
    def active(self) -> bool:
        """Indicate whether the context is active."""
        return self._active

    async def __aenter__(self) -> BrowserPool:
        """Enter the context manager and initialize all browser plugins.

        A plugin whose initialization exceeds the operation timeout is logged and skipped; the remaining
        plugins are still initialized.

        Raises:
            RuntimeError: If the context manager is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        self._active = True

        # Start the recurring tasks for identifying and closing inactive browsers
        self._identify_inactive_browsers_task.start()
        self._close_inactive_browsers_task.start()

        timeout = self._operation_timeout.total_seconds()

        for plugin in self._plugins:
            # The timeout is handled per plugin, so that one stuck plugin does not prevent
            # the plugins after it from being initialized.
            try:
                await asyncio.wait_for(plugin.__aenter__(), timeout)
            except asyncio.TimeoutError:
                logger.warning(f'Initializing of the browser plugin {plugin} timed out, will be skipped.')

        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Exit the context manager and close all browser plugins.

        Stops the recurring tasks, force-closes every active and inactive browser and exits all plugins.

        Raises:
            RuntimeError: If the context manager is not active.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        await self._identify_inactive_browsers_task.stop()
        await self._close_inactive_browsers_task.stop()

        for browser in self._active_browsers + self._inactive_browsers:
            await browser.close(force=True)

        self._active_browsers.clear()
        self._inactive_browsers.clear()

        for plugin in self._plugins:
            await plugin.__aexit__(exc_type, exc_value, exc_traceback)

        self._active = False

    @ensure_context
    async def new_page(
        self,
        *,
        page_id: str | None = None,
        browser_plugin: BrowserPlugin | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> CrawleePage:
        """Open a new page in a browser using the specified or a random browser plugin.

        Args:
            page_id: The ID to assign to the new page. If not provided, a random ID is generated.
            browser_plugin: The browser plugin to use for creating the new page. If not provided, the next plugin
                in the rotation is used.
            proxy_info: The proxy configuration to use for the new page.

        Returns:
            The newly created browser page.

        Raises:
            ValueError: If a page with `page_id` already exists, or if `browser_plugin` is not one of the plugins
                used by the pool.
        """
        if page_id in self.pages:
            raise ValueError(f'Page with ID: {page_id} already exists.')

        if browser_plugin and browser_plugin not in self.plugins:
            raise ValueError('Provided browser_plugin is not one of the plugins used by BrowserPool.')

        page_id = page_id or crypto_random_object_id(self._GENERATED_PAGE_ID_LENGTH)
        plugin = browser_plugin or next(self._plugins_cycle)

        return await self._get_new_page(page_id, plugin, proxy_info)

    @ensure_context
    async def new_page_with_each_plugin(self) -> Sequence[CrawleePage]:
        """Create a new page with each browser plugin in the pool.

        This method is useful for running scripts in multiple environments simultaneously, typically for testing
        or website analysis. Each page is created using a different browser plugin, allowing you to interact
        with various browser types concurrently.

        Returns:
            A list of newly created pages, one for each plugin in the pool.
        """
        pages_coroutines = [self.new_page(browser_plugin=plugin) for plugin in self._plugins]
        return await asyncio.gather(*pages_coroutines)

    async def _get_new_page(
        self,
        page_id: str,
        plugin: BrowserPlugin,
        proxy_info: ProxyInfo | None,
    ) -> CrawleePage:
        """Initialize a new browser page using the specified plugin.

        Select a browser with available capacity or launch a new one if needed. Create a new page in the selected
        browser with the provided proxy settings.

        Raises:
            TimeoutError: If launching the browser or opening the page exceeds the operation timeout.
            RuntimeError: If a `RuntimeError` is raised while launching the browser, running the pre-create hooks
                or opening the page.
        """
        timeout = self._operation_timeout.total_seconds()
        browser_controller = self._pick_browser_with_free_capacity(plugin)

        try:
            if not browser_controller:
                browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)

            # Hooks receive a private copy, so they can adjust the options without affecting the plugin.
            browser_new_context_options = dict(plugin.browser_new_context_options)

            await self._execute_hooks(
                self._pre_page_create_hooks, page_id, browser_controller, browser_new_context_options, proxy_info
            )

            page = await asyncio.wait_for(
                browser_controller.new_page(
                    browser_new_context_options=browser_new_context_options,
                    proxy_info=proxy_info,
                ),
                timeout,
            )
        except asyncio.TimeoutError as exc:
            raise TimeoutError(f'Creating a new page with plugin {plugin} timed out.') from exc
        except RuntimeError as exc:
            raise RuntimeError('Browser pool is not initialized.') from exc

        if browser_controller.total_opened_pages >= self._retire_browser_after_page_count:
            self._retire_browser(browser_controller)

        crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type)
        self._pages[page_id] = crawlee_page
        self._total_pages_count += 1

        await self._execute_hooks(self._post_page_create_hooks, crawlee_page, browser_controller)
        self._override_page_close(crawlee_page, browser_controller)

        return crawlee_page

    def _pick_browser_with_free_capacity(
        self,
        browser_plugin: BrowserPlugin,
    ) -> BrowserController | None:
        """Pick a browser with free capacity that matches the specified plugin.

        Browsers are matched to the plugin by their `AUTOMATION_LIBRARY`. Returns `None` if no active browser fits.
        """
        for browser in self._active_browsers:
            if browser.has_free_capacity and browser.AUTOMATION_LIBRARY == browser_plugin.AUTOMATION_LIBRARY:
                return browser

        return None

    def _retire_browser(self, browser: BrowserController) -> None:
        """Retire a browser by moving it to the inactive list."""
        if browser in self._active_browsers:
            self._active_browsers.remove(browser)
            self._inactive_browsers.append(browser)

    async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserController:
        """Launch a new browser instance using the specified plugin."""
        browser = await plugin.new_browser()
        self._active_browsers.append(browser)
        return browser

    def _identify_inactive_browsers(self) -> None:
        """Identify inactive browsers and move them to the inactive list if their idle time exceeds the threshold."""
        # Iterate over a copy, because the list is modified inside the loop.
        for browser in list(self._active_browsers):
            if browser.idle_time >= self._browser_inactive_threshold:
                self._active_browsers.remove(browser)
                self._inactive_browsers.append(browser)

    async def _close_inactive_browsers(self) -> None:
        """Close the browsers that have no active pages and have been idle for a certain period."""
        # Iterate over a copy, because the list is modified inside the loop.
        for browser in list(self._inactive_browsers):
            if not browser.pages:
                await browser.close()
                self._inactive_browsers.remove(browser)

    async def _execute_hooks(self, hooks: list[Callable[..., Awaitable[None]]], *args: Any) -> None:
        """Execute the provided hooks with the given arguments."""
        for hook in hooks:
            await hook(*args)

    def _override_page_close(self, crawlee_page: CrawleePage, browser_controller: BrowserController) -> None:
        """Override the page's close method to execute pre and post close hooks.

        The override is installed only if at least one close hook is registered at the time of the call.
        """
        if self._pre_page_close_hooks or self._post_page_close_hooks:
            original_close = crawlee_page.page.close

            async def close_with_hooks(*args: Any, **kwargs: Any) -> None:
                try:
                    await self._execute_hooks(self._pre_page_close_hooks, crawlee_page, browser_controller)
                finally:
                    # The page is closed even if one of the pre-close hooks fails.
                    await original_close(*args, **kwargs)
                    await self._execute_hooks(self._post_page_close_hooks, crawlee_page.id, browser_controller)

            crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks

    def pre_page_create_hook(
        self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
    ) -> Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]:
        """Register a hook to be called just before a new page is created.

        The hook receives the page ID, `BrowserController`, `browser_new_context_options`, and `ProxyInfo`.

        Note that depending on the `BrowserController` implementation, `browser_new_context_options` may not apply to
        every page individually. For example, `PlaywrightBrowserController` with ``use_incognito_pages=False`` shares
        a single context across all pages, so the options are applied only when the context is first created.

        Returns:
            The same `hook`, so that the method can be used as a decorator.
        """
        self._pre_page_create_hooks.append(hook)
        return hook

    def post_page_create_hook(
        self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]
    ) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:
        """Register a hook to be called right after a new page is created.

        The hook receives the newly created `CrawleePage` and the `BrowserController`. Use it to apply changes
        to all pages, such as injecting scripts or configuring request interception.

        Returns:
            The same `hook`, so that the method can be used as a decorator.
        """
        self._post_page_create_hooks.append(hook)
        return hook

    def pre_page_close_hook(
        self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]
    ) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:
        """Register a hook to be called just before a page is closed.

        The hook receives the `CrawleePage` and the `BrowserController`. Use it to collect last-second data,
        such as taking a screenshot or saving page state before the page is destroyed.

        Returns:
            The same `hook`, so that the method can be used as a decorator.
        """
        self._pre_page_close_hooks.append(hook)
        return hook

    def post_page_close_hook(
        self, hook: Callable[[str, BrowserController], Awaitable[None]]
    ) -> Callable[[str, BrowserController], Awaitable[None]]:
        """Register a hook to be called right after a page is closed.

        The hook receives the page ID and the `BrowserController`. Use it for cleanup or logging
        after a page's lifecycle ends.

        Returns:
            The same `hook`, so that the method can be used as a decorator.
        """
        self._post_page_close_hooks.append(hook)
        return hook
Merge launch options with context options.""" if self._context: raise RuntimeError('Persistent browser can have only one context') launch_options = self._browser_launch_options | context_options if self._user_data_dir: user_data_dir = self._user_data_dir else: user_data_dir = tempfile.mkdtemp(prefix=self._TMP_DIR_PREFIX) self._temp_dir = Path(user_data_dir) self._context = await self._browser_type.launch_persistent_context( user_data_dir=user_data_dir, **launch_options ) if self._temp_dir: self._context.on('close', self._delete_temp_dir) return self._context async def _delete_temp_dir(self, _: BrowserContext | None) -> None: if self._temp_dir and self._temp_dir.exists(): temp_dir = self._temp_dir await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True) @override async def close(self, **kwargs: Any) -> None: """Close browser by closing its context.""" if self._context: await self._context.close() self._context = None self._is_connected = False await asyncio.sleep(0.1) await self._delete_temp_dir(self._context) @property @override def version(self) -> str: raise NotImplementedError('Persistent browser does not support version.') async def new_page(self, **kwargs: Any) -> Page: raise NotImplementedError('Persistent browser does not support new page.') @override async def new_browser_cdp_session(self) -> CDPSession: raise NotImplementedError('Persistent browser does not support new browser CDP session.') async def start_tracing(self, **kwargs: Any) -> None: raise NotImplementedError('Persistent browser does not support tracing.') async def stop_tracing(self, **kwargs: Any) -> bytes: raise NotImplementedError('Persistent browser does not support tracing.') ================================================ FILE: src/crawlee/browsers/_playwright_browser_controller.py ================================================ # Inspiration: https://github.com/apify/crawlee/blob/v3.10.1/packages/browser-pool/src/playwright/playwright-controller.ts from __future__ import 
@docs_group('Browser management')
class PlaywrightBrowserController(BrowserController):
    """Controller for managing Playwright browser instances and their pages.

    It provides methods to control browser instances, manage their pages, and handle context-specific
    configurations. It enforces limits on the number of open pages and tracks their state.
    """

    AUTOMATION_LIBRARY = 'playwright'
    _DEFAULT_HEADER_GENERATOR = HeaderGenerator()

    def __init__(
        self,
        browser: Browser | PlaywrightPersistentBrowser,
        *,
        max_open_pages_per_browser: int = 20,
        use_incognito_pages: bool = False,
        header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
        fingerprint_generator: FingerprintGenerator | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            browser: The browser instance to control.
            max_open_pages_per_browser: The maximum number of pages that can be open at the same time.
            use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                own context that is destroyed once the page is closed or crashes.
            header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
                requests made by the browser. By default, a predefined header generator is used. Set to `None` to
                disable automatic header modifications.
            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
                to generate browser fingerprints together with consistent headers.

        Raises:
            ValueError: If a non-default `header_generator` is combined with a `fingerprint_generator`.
        """
        # The identity check distinguishes "caller passed nothing" from "caller passed their own generator".
        if fingerprint_generator and header_generator is not self._DEFAULT_HEADER_GENERATOR:
            raise ValueError(
                'Do not use `header_generator` and `fingerprint_generator` arguments at the same time. '
                'Choose only one. `fingerprint_generator` generates headers as well.'
            )

        self._browser = browser
        self._max_open_pages_per_browser = max_open_pages_per_browser
        self._header_generator = header_generator
        self._fingerprint_generator = fingerprint_generator
        self._use_incognito_pages = use_incognito_pages

        # Reuse a context the browser already exposes; otherwise it is created lazily in `new_page`.
        self._browser_context: BrowserContext | None = (
            self._browser.contexts[0] if len(self._browser.contexts) > 0 else None
        )
        self._pages = list[Page]()
        self._last_page_opened_at = datetime.now(timezone.utc)

        self._total_opened_pages = 0
        # Pages that are being opened right now; counted towards the capacity limit.
        self._opening_pages_count = 0

        self._context_creation_lock: Lock | None = None

    async def _get_context_creation_lock(self) -> Lock:
        """Get context checking and creation lock.

        It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
        memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
        """
        if self._context_creation_lock:
            return self._context_creation_lock
        self._context_creation_lock = Lock()
        return self._context_creation_lock

    @property
    @override
    def pages(self) -> list[Page]:
        """Return the list of pages currently tracked as open."""
        return self._pages

    @property
    @override
    def total_opened_pages(self) -> int:
        """Return the number of pages opened through this controller so far."""
        return self._total_opened_pages

    @property
    @override
    def pages_count(self) -> int:
        """Return the number of pages currently tracked as open."""
        return len(self._pages)

    @property
    @override
    def last_page_opened_at(self) -> datetime:
        """Return the UTC time at which the most recent page was opened (or the controller was created)."""
        return self._last_page_opened_at

    @property
    @override
    def idle_time(self) -> timedelta:
        """Return the time elapsed since the most recent page was opened."""
        return datetime.now(timezone.utc) - self._last_page_opened_at

    @property
    @override
    def has_free_capacity(self) -> bool:
        """Return whether another page may be opened without exceeding `max_open_pages_per_browser`."""
        return (self.pages_count + self._opening_pages_count) < self._max_open_pages_per_browser

    @property
    @override
    def is_browser_connected(self) -> bool:
        """Return whether the underlying browser reports itself as connected."""
        return self._browser.is_connected()

    @property
    @override
    def browser_type(self) -> BrowserType:
        """Return the name of the underlying Playwright browser type."""
        return cast('BrowserType', self._browser.browser_type.name)

    @override
    async def new_page(
        self,
        browser_new_context_options: Mapping[str, Any] | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> Page:
        """Create a new page with the given context options.

        Args:
            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
            proxy_info: The proxy configuration to use for the new page.

        Returns:
            Page: The newly created page.

        Raises:
            ValueError: If the browser has reached the maximum number of open pages.
        """
        if not self.has_free_capacity:
            raise ValueError('Cannot open more pages in this browser.')

        self._opening_pages_count += 1
        try:
            if self._use_incognito_pages:
                # In incognito there is exactly one context per one page. Create new context for each new page.
                new_context = await self._create_browser_context(
                    browser_new_context_options=browser_new_context_options,
                    proxy_info=proxy_info,
                )
                try:
                    page = await new_context.new_page()
                except Exception:
                    # Nothing else references this context yet, so it would be orphaned if we did not close it here.
                    try:
                        await new_context.close()
                    except Exception:
                        logger.debug('Failed to close an orphaned browser context.', exc_info=True)
                    raise
            else:
                # Only the check-and-create step is serialized; opening pages in the shared context is not.
                async with await self._get_context_creation_lock():
                    if not self._browser_context:
                        self._browser_context = await self._create_browser_context(
                            browser_new_context_options=browser_new_context_options,
                            proxy_info=proxy_info,
                        )
                page = await self._browser_context.new_page()

            # Handle page close event
            page.on(event='close', f=self._on_page_close)

            # Update internal state
            self._pages.append(page)
            self._last_page_opened_at = datetime.now(timezone.utc)

            self._total_opened_pages += 1
        finally:
            # Whether opening succeeded or failed, the page is no longer "being opened".
            self._opening_pages_count -= 1
        return page

    @override
    async def close(self, *, force: bool = False) -> None:
        """Close the browser.

        Args:
            force: Whether to force close all open pages before closing the browser.

        Raises:
            ValueError: If there are still open pages when trying to close the browser.
        """
        if self.pages_count > 0 and not force:
            raise ValueError('Cannot close the browser while there are open pages.')

        try:
            if self._browser_context:
                await self._browser_context.close()
        finally:
            # Always attempt to close the browser, even if closing the shared context failed,
            # so that the browser process is not leaked.
            await self._browser.close()

    def _on_page_close(self, page: Page) -> None:
        """Handle actions after a page is closed."""
        self._pages.remove(page)

    async def _create_browser_context(
        self,
        browser_new_context_options: Mapping[str, Any] | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> BrowserContext:
        """Create a new browser context with the specified proxy settings.

        Create context with fingerprints and headers using with `self._fingerprint_generator` if available.
        Create context without fingerprints, but with headers based on `self._header_generator` if available.
        Create context without headers and without fingerprints if neither `self._header_generator` nor
        `self._fingerprint_generator` is available.

        Args:
            browser_new_context_options: Options forwarded to the context factory. The mapping is copied, the
                caller's object is never mutated.
            proxy_info: Proxy to use for the context. When given, it replaces any `proxy` entry in the options.

        Returns:
            The newly created browser context.
        """
        # Work on a private copy so the caller's mapping stays untouched.
        browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}

        if proxy_info:
            if browser_new_context_options.get('proxy'):
                logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

            browser_new_context_options['proxy'] = ProxySettings(
                server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
                username=proxy_info.username,
                password=proxy_info.password,
            )

        if self._fingerprint_generator:
            return await AsyncNewContext(
                browser=self._browser,
                fingerprint=self._fingerprint_generator.generate(),
                **browser_new_context_options,
            )

        if self._header_generator:
            extra_http_headers = dict(
                self._header_generator.get_specific_headers(
                    header_names={
                        'Accept',
                        'Accept-Language',
                        'User-Agent',
                        'sec-ch-ua',
                        'sec-ch-ua-mobile',
                        'sec-ch-ua-platform',
                    },
                    browser_type=fingerprint_browser_type_from_playwright_browser_type(self.browser_type),
                )
            )
        else:
            extra_http_headers = None

        # Headers supplied explicitly by the caller win over the generated ones.
        browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
            'extra_http_headers', extra_http_headers
        )
        return await self._browser.new_context(**browser_new_context_options)
@docs_group('Browser management')
class PlaywrightBrowserPlugin(BrowserPlugin):
    """A plugin for managing Playwright automation library.

    It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
    for creating new browser instances and provides a unified interface for interacting with different browser types
    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
    browser instance, ensuring that resource limits are respected.
    """

    AUTOMATION_LIBRARY = 'playwright'

    def __init__(
        self,
        *,
        browser_type: BrowserType = 'chromium',
        user_data_dir: str | Path | None = None,
        browser_launch_options: dict[str, Any] | None = None,
        browser_new_context_options: dict[str, Any] | None = None,
        max_open_pages_per_browser: int = 20,
        use_incognito_pages: bool = False,
        fingerprint_generator: FingerprintGenerator | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            browser_type: The type of browser to launch:
                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
                    the system.
            user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                storage.
            browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
                documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
            max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.
                Once reached, a new browser instance will be launched to handle the excess.
            use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                own context that is destroyed once the page is closed or crashes.
            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
                to generate browser fingerprints together with consistent headers.
        """
        config = service_locator.get_configuration()

        # Default browser launch options are based on the configuration.
        default_launch_browser_options: dict[str, Any] = {
            'headless': config.headless,
            'executable_path': config.default_browser_path,
            'chromium_sandbox': not config.disable_browser_sandbox,
        }

        explicit_browser_launch_options = browser_launch_options or {}

        # Map 'chrome' to 'chromium' with the 'chrome' channel.
        if browser_type == 'chrome':
            browser_type = 'chromium'
            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
            default_launch_browser_options['channel'] = 'chrome'

            if executable_path := explicit_browser_launch_options.get(
                'executable_path', default_launch_browser_options.get('executable_path')
            ):
                logger.debug(
                    f"Using browser executable from {executable_path}, which takes precedence over 'chrome' channel."
                )

        self._browser_type: BrowserType = browser_type
        # Explicit options win over the configuration-derived defaults.
        self._browser_launch_options: dict[str, Any] = default_launch_browser_options | explicit_browser_launch_options
        self._browser_new_context_options = browser_new_context_options or {}
        self._max_open_pages_per_browser = max_open_pages_per_browser
        self._use_incognito_pages = use_incognito_pages
        self._user_data_dir = user_data_dir

        self._playwright_context_manager = async_playwright()
        self._playwright: Playwright | None = None

        # Flag to indicate the context state.
        self._active = False

        self._fingerprint_generator = fingerprint_generator

    @property
    @override
    def active(self) -> bool:
        """Return whether the plugin's async context is currently entered."""
        return self._active

    @property
    @override
    def browser_type(self) -> BrowserType:
        """Return the browser type that will be launched ('chrome' is reported as 'chromium')."""
        return self._browser_type

    @property
    @override
    def browser_launch_options(self) -> Mapping[str, Any]:
        """Return the options for the `browser.launch` method.

        Keyword arguments to pass to the browser launch method. These options are provided directly to Playwright's
        `browser_type.launch` method. For more details, refer to the Playwright documentation:
        https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
        """
        return self._browser_launch_options

    @property
    @override
    def browser_new_context_options(self) -> Mapping[str, Any]:
        """Return the options for the `browser.new_context` method.

        Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's
        `browser.new_context` method. For more details, refer to the Playwright documentation:
        https://playwright.dev/python/docs/api/class-browser#browser-new-context.
        """
        return self._browser_new_context_options

    @property
    @override
    def max_open_pages_per_browser(self) -> int:
        """Return the maximum number of pages allowed per browser instance."""
        return self._max_open_pages_per_browser

    @override
    async def __aenter__(self) -> PlaywrightBrowserPlugin:
        """Start Playwright and mark the plugin as active.

        Raises:
            RuntimeError: If the plugin is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        # Flip the flag before awaiting so that a concurrent second entry is rejected while Playwright starts.
        self._active = True
        try:
            self._playwright = await self._playwright_context_manager.__aenter__()
        except BaseException:
            # Roll back; otherwise a failed start would leave the plugin permanently "already active".
            self._playwright_context_manager = async_playwright()
            self._active = False
            raise
        return self

    @override
    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Stop Playwright and reset the plugin so that it can be entered again.

        Raises:
            RuntimeError: If the plugin is not active.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        try:
            await self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback)
        finally:
            # Reset the state even when stopping Playwright fails, so the plugin is not stuck half-closed.
            self._playwright_context_manager = async_playwright()
            self._playwright = None
            self._active = False

    @override
    @ensure_context
    async def new_browser(self) -> PlaywrightBrowserController:
        """Launch a browser of the configured type and wrap it in a `PlaywrightBrowserController`.

        With `use_incognito_pages` a regular browser is launched; otherwise a `PlaywrightPersistentBrowser`
        is created with the configured user data directory.

        Raises:
            RuntimeError: If Playwright has not been started.
            ValueError: If the configured browser type is not supported.
        """
        if not self._playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        if self._browser_type == 'chromium':
            browser_type = self._playwright.chromium
        elif self._browser_type == 'firefox':
            browser_type = self._playwright.firefox
        elif self._browser_type == 'webkit':
            browser_type = self._playwright.webkit
        else:
            raise ValueError(f'Invalid browser type: {self._browser_type}')

        if self._use_incognito_pages:
            browser: Browser | PlaywrightPersistentBrowser = await browser_type.launch(**self._browser_launch_options)
        else:
            browser = PlaywrightPersistentBrowser(browser_type, self._user_data_dir, self._browser_launch_options)

        return PlaywrightBrowserController(
            browser,
            use_incognito_pages=self._use_incognito_pages,
            max_open_pages_per_browser=self._max_open_pages_per_browser,
            fingerprint_generator=self._fingerprint_generator,
        )
@docs_group('Configuration')
class Configuration(BaseSettings):
    """Configuration settings for the Crawlee project.

    This class stores common configurable parameters for Crawlee. Default values are provided for all settings,
    so typically, no adjustments are necessary. However, you may modify settings for specific use cases,
    such as changing the default storage directory, the default storage IDs, the timeout for internal
    operations, and more.

    Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.

    Most fields declare two validation aliases, one `apify_`-prefixed and one `crawlee_`-prefixed, so either
    name may be used to populate them.
    """

    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
    model_config = SettingsConfigDict(populate_by_name=True)

    # Unlike the other fields this one uses a plain `alias` with a single, `crawlee_`-prefixed name.
    internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
    """Timeout for the internal asynchronous operations."""

    # --- Browser related settings ---

    default_browser_path: Annotated[
        str | None,
        Field(
            validation_alias=AliasChoices(
                'apify_default_browser_path',
                'crawlee_default_browser_path',
            )
        ),
    ] = None
    """Specifies the path to the browser executable. Currently primarily for Playwright-based features. This option
    is passed directly to Playwright's `browser_type.launch` method as `executable_path` argument. For more details,
    refer to the Playwright documentation:
    https://playwright.dev/docs/api/class-browsertype#browser-type-launch.
    """

    # NOTE: `PlaywrightBrowserPlugin` passes the *negation* of this flag as `chromium_sandbox`.
    disable_browser_sandbox: Annotated[
        bool,
        Field(
            validation_alias=AliasChoices(
                'apify_disable_browser_sandbox',
                'crawlee_disable_browser_sandbox',
            )
        ),
    ] = False
    """Disables the sandbox for the browser. Currently primarily for Playwright-based features. This option
    is passed directly to Playwright's `browser_type.launch` method as `chromium_sandbox`. For more details,
    refer to the Playwright documentation:
    https://playwright.dev/docs/api/class-browsertype#browser-type-launch."""

    # --- Logging ---

    # The `BeforeValidator` upper-cases the raw value before it is checked against `LogLevel`.
    log_level: Annotated[
        LogLevel,
        Field(
            validation_alias=AliasChoices(
                'apify_log_level',
                'crawlee_log_level',
            )
        ),
        BeforeValidator(lambda value: str(value).upper()),
    ] = 'INFO'
    """The logging level."""

    # --- Storage and state persistence ---

    purge_on_start: Annotated[
        bool,
        Field(
            validation_alias=AliasChoices(
                'apify_purge_on_start',
                'crawlee_purge_on_start',
            )
        ),
    ] = True
    """Whether to purge the storage on the start. This option is utilized by the storage clients."""

    # The aliases end in `_millis`; presumably `timedelta_ms` parses the raw value as milliseconds (TODO confirm).
    persist_state_interval: Annotated[
        timedelta_ms,
        Field(
            validation_alias=AliasChoices(
                'apify_persist_state_interval_millis',
                'crawlee_persist_state_interval_millis',
            )
        ),
    ] = timedelta(minutes=1)
    """Interval at which `PersistState` events are emitted. The event ensures the state persistence during
    the crawler run. This option is utilized by the `EventManager`."""

    system_info_interval: Annotated[
        timedelta_ms,
        Field(
            validation_alias=AliasChoices(
                'apify_system_info_interval_millis',
                'crawlee_system_info_interval_millis',
            )
        ),
    ] = timedelta(seconds=1)
    """Interval at which `SystemInfo` events are emitted. The event represents the current status of the system.
    This option is utilized by the `LocalEventManager`."""

    # --- Overload thresholds ---

    max_used_cpu_ratio: Annotated[
        float,
        Field(
            validation_alias=AliasChoices(
                'apify_max_used_cpu_ratio',
                'crawlee_max_used_cpu_ratio',
            )
        ),
    ] = 0.95
    """The maximum CPU usage ratio. If the CPU usage exceeds this value, the system is considered overloaded.
    This option is used by the `Snapshotter`."""

    max_used_memory_ratio: Annotated[
        float,
        Field(
            validation_alias=AliasChoices(
                'apify_max_used_memory_ratio',
                'crawlee_max_used_memory_ratio',
            )
        ),
    ] = 0.9
    """The maximum memory usage ratio. If the memory usage exceeds this ratio, it is considered overloaded.
    This option is used by the `Snapshotter`."""

    max_event_loop_delay: Annotated[
        timedelta_ms,
        Field(
            validation_alias=AliasChoices(
                'apify_max_event_loop_delay_millis',
                'crawlee_max_event_loop_delay_millis',
            )
        ),
    ] = timedelta(milliseconds=50)
    """The maximum event loop delay. If the event loop delay exceeds this value, it is considered overloaded.
    This option is used by the `Snapshotter`."""

    max_client_errors: Annotated[
        int,
        Field(
            validation_alias=AliasChoices(
                'apify_max_client_errors',
                'crawlee_max_client_errors',
            )
        ),
    ] = 1
    """The maximum number of client errors (HTTP 429) allowed before the system is considered overloaded.
    This option is used by the `Snapshotter`."""

    # --- Memory limits ---

    # This field accepts a third alias, `actor_memory_mbytes`, in addition to the usual two.
    memory_mbytes: Annotated[
        int | None,
        Field(
            validation_alias=AliasChoices(
                'actor_memory_mbytes',
                'apify_memory_mbytes',
                'crawlee_memory_mbytes',
            )
        ),
    ] = None
    """The maximum used memory in megabytes. This option is utilized by the `Snapshotter`."""

    # Validated to lie in the half-open interval (0.0, 1.0] via `gt`/`le`.
    available_memory_ratio: Annotated[
        float,
        Field(
            validation_alias=AliasChoices(
                'apify_available_memory_ratio',
                'crawlee_available_memory_ratio',
            ),
            gt=0.0,
            le=1.0,
        ),
    ] = 0.25
    """The maximum proportion of system memory to use. If `memory_mbytes` is not provided, this ratio is used to
    calculate the maximum memory. This option is utilized by the `Snapshotter` and supports the dynamic system memory
    scaling."""

    # The `apify_` alias is `apify_local_storage_dir`, not `apify_storage_dir`.
    storage_dir: Annotated[
        str,
        Field(
            validation_alias=AliasChoices(
                'apify_local_storage_dir',
                'crawlee_storage_dir',
            ),
        ),
    ] = './storage'
    """The path to the storage directory. This option is utilized by the storage clients."""

    headless: Annotated[
        bool,
        Field(
            validation_alias=AliasChoices(
                'apify_headless',
                'crawlee_headless',
            )
        ),
    ] = True
    """Whether to run the browser in headless mode. Currently primarily for Playwright-based features. This option
    is passed directly to Playwright's `browser_type.launch` method as `headless`. For more details,
    refer to the Playwright documentation:
    https://playwright.dev/docs/api/class-browsertype#browser-type-launch.
    """

    @classmethod
    def get_global_configuration(cls) -> Self:
        """Retrieve the global instance of the configuration.

        Mostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`
        instead.

        Returns:
            The configuration object held by the service locator.

        Raises:
            TypeError: If the configuration held by the service locator is not an instance of `cls`.
        """
        # Import here to avoid circular imports.
        from crawlee import service_locator  # noqa: PLC0415

        config = service_locator.get_configuration()

        # When called on a subclass, a configuration of an unrelated or base type is rejected.
        if not isinstance(config, cls):
            raise TypeError(f'Requested global configuration object of type {cls}, but {config.__class__} was found')

        return config
AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightPostNavCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, RenderingType, RenderingTypePrediction, RenderingTypePredictor, ) __all__ = [ 'AbstractHttpCrawler', 'AbstractHttpParser', 'AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlerStatisticState', 'AdaptivePlaywrightCrawlingContext', 'AdaptivePlaywrightPostNavCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', 'BasicCrawler', 'BasicCrawlerOptions', 'BasicCrawlingContext', 'BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType', 'ContextPipeline', 'HttpCrawler', 'HttpCrawlerOptions', 'HttpCrawlingContext', 'HttpCrawlingResult', 'ParsedHttpCrawlingContext', 'ParselCrawler', 'ParselCrawlingContext', 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPostNavCrawlingContext', 'PlaywrightPreNavCrawlingContext', 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor', ] ================================================ FILE: src/crawlee/crawlers/_abstract_http/__init__.py ================================================ from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions from ._abstract_http_parser import AbstractHttpParser from ._http_crawling_context import ParsedHttpCrawlingContext __all__ = [ 'AbstractHttpCrawler', 'AbstractHttpParser', 'HttpCrawlerOptions', 'ParsedHttpCrawlingContext', ] ================================================ FILE: src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py ================================================ from __future__ import annotations import asyncio import logging from abc import ABC from datetime import timedelta from typing import TYPE_CHECKING, Any, Generic from more_itertools import partition from pydantic import ValidationError from typing_extensions import NotRequired, TypeVar from crawlee._request import Request, RequestOptions, RequestState from crawlee._utils.docs import docs_group from crawlee._utils.time import 
SharedTimeout from crawlee._utils.urls import to_absolute_url_iterator from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline from crawlee.errors import SessionError from crawlee.statistics import StatisticsState from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator from typing_extensions import Unpack from crawlee import RequestTransformAction from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction from ._abstract_http_parser import AbstractHttpParser TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext) TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) class HttpCrawlerOptions( BasicCrawlerOptions[TCrawlingContext, TStatisticsState], Generic[TCrawlingContext, TStatisticsState], ): """Arguments for the `AbstractHttpCrawler` constructor. It is intended for typing forwarded `__init__` arguments in the subclasses. """ navigation_timeout: NotRequired[timedelta | None] """Timeout for the HTTP request.""" @docs_group('Crawlers') class AbstractHttpCrawler( BasicCrawler[TCrawlingContext, StatisticsState], ABC, Generic[TCrawlingContext, TParseResult, TSelectResult], ): """A web crawler for performing HTTP requests. The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally, it implements HTTP communication using HTTP clients. The class allows integration with any HTTP client that implements the `HttpClient` interface, provided as an input parameter to the constructor. `AbstractHttpCrawler` is a generic class intended to be used with a specific parser for parsing HTTP responses and the expected type of `TCrawlingContext` available to the user function. 
def __init__(
    self,
    *,
    parser: AbstractHttpParser[TParseResult, TSelectResult],
    navigation_timeout: timedelta | None = None,
    **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
) -> None:
    """Initialize the crawler with a response parser and a navigation timeout.

    Args:
        parser: Parser used to process HTTP responses.
        navigation_timeout: Time budget for navigation; one minute is used when no (or a falsy) value is given.
        **kwargs: Options forwarded to the parent constructor. Must contain `_context_pipeline`.

    Raises:
        ValueError: If `_context_pipeline` is missing from `kwargs`.
    """
    one_minute = timedelta(minutes=1)

    self._parser = parser
    self._navigation_timeout = navigation_timeout if navigation_timeout else one_minute

    # Hooks registered by the user; they start out empty.
    self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
    self._post_navigation_hooks: list[Callable[[HttpCrawlingContext], Awaitable[None]]] = []

    # One shared timeout per in-flight context, keyed by `id(context)`.
    self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

    pipeline_provided = '_context_pipeline' in kwargs
    if not pipeline_provided:
        raise ValueError(
            'Please pass in a `_context_pipeline`. You should use the '
            'AbstractHttpCrawler._create_static_content_crawler_pipeline() method to initialize it.'
        )

    # Fall back to a logger named after the concrete crawler class unless the caller supplied one.
    class_logger = logging.getLogger(self.__class__.__name__)
    if '_logger' not in kwargs:
        kwargs['_logger'] = class_logger

    super().__init__(**kwargs)
""" class _ParsedHttpCrawler(AbstractHttpCrawler): def __init__( self, parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser, # ty: ignore[invalid-parameter-default] **kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]], ) -> None: kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline() super().__init__( parser=parser, **kwargs, ) return _ParsedHttpCrawler def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]: """Create static content crawler context pipeline with expected pipeline steps.""" return ( ContextPipeline() .compose(self._execute_pre_navigation_hooks) .compose(self._make_http_request) .compose(self._execute_post_navigation_hooks) .compose(self._handle_status_code_response) .compose(self._parse_http_response) .compose(self._handle_blocked_request_by_content) ) async def _execute_pre_navigation_hooks( self, context: BasicCrawlingContext ) -> AsyncGenerator[BasicCrawlingContext, None]: context_id = id(context) self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout) try: for hook in self._pre_navigation_hooks: async with self._shared_navigation_timeouts[context_id]: await hook(context) yield context finally: self._shared_navigation_timeouts.pop(context_id, None) async def _execute_post_navigation_hooks( self, context: HttpCrawlingContext ) -> AsyncGenerator[HttpCrawlingContext, None]: for hook in self._post_navigation_hooks: await hook(context) yield context async def _parse_http_response( self, context: HttpCrawlingContext ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]: """Parse HTTP response and create context enhanced by the parsing result and enqueue links function. Args: context: The current crawling context, that includes HTTP response. Yields: The original crawling context enhanced by the parsing result and enqueue links function. 
""" parsed_content = await self._parser.parse(context.http_response) extract_links = self._create_extract_links_function(context, parsed_content) yield ParsedHttpCrawlingContext.from_http_crawling_context( context=context, parsed_content=parsed_content, enqueue_links=self._create_enqueue_links_function(context, extract_links), extract_links=extract_links, ) def _create_extract_links_function( self, context: HttpCrawlingContext, parsed_content: TParseResult ) -> ExtractLinksFunction: """Create a callback function for extracting links from parsed content. Args: context: The current crawling context. parsed_content: The parsed http response. Returns: Awaitable that is used for extracting links from parsed content. """ async def extract_links( *, selector: str = 'a', attribute: str = 'href', label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> list[Request]: requests = list[Request]() base_user_data = user_data or {} robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url) kwargs.setdefault('strategy', 'same-hostname') strategy = kwargs.get('strategy', 'same-hostname') links_iterator: Iterator[str] = iter( self._parser.find_links(parsed_content, selector=selector, attribute=attribute) ) # Get base URL from <base> tag if present extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]', 'href')) base_url: str = ( str(extracted_base_urls[0]) if extracted_base_urls else context.request.loaded_url or context.request.url ) links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) if robots_txt_file: skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) else: skipped = iter([]) for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs): request_options = RequestOptions( 
url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy ) if transform_request_function: transform_request_options = transform_request_function(request_options) if transform_request_options == 'skip': continue if transform_request_options != 'unchanged': request_options = transform_request_options try: request = Request.from_url(**request_options) except ValidationError as exc: context.log.debug( f'Skipping URL "{url}" due to invalid format: {exc}. ' 'This may be caused by a malformed URL or unsupported URL scheme. ' 'Please ensure the URL is correct and retry.' ) continue requests.append(request) skipped_tasks = [ asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped ] await asyncio.gather(*skipped_tasks) return requests return extract_links async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]: """Make http request and create context enhanced by HTTP response. Args: context: The current crawling context. Yields: The original crawling context enhanced by HTTP response. """ async with self._shared_navigation_timeouts[id(context)] as remaining_timeout: result = await self._http_client.crawl( request=context.request, session=context.session, proxy_info=context.proxy_info, statistics=self._statistics, timeout=remaining_timeout, ) context.request.state = RequestState.AFTER_NAV yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response) async def _handle_status_code_response( self, context: HttpCrawlingContext ) -> AsyncGenerator[HttpCrawlingContext, None]: """Validate the HTTP status code and raise appropriate exceptions if needed. Args: context: The current crawling context containing the HTTP response. Raises: SessionError: If the status code indicates the session is blocked. HttpStatusCodeError: If the status code represents a server error or is explicitly configured as an error. 
HttpClientStatusCodeError: If the status code represents a client error. Yields: The original crawling context if no errors are detected. """ status_code = context.http_response.status_code if self._retry_on_blocked: self._raise_for_session_blocked_status_code(context.session, status_code) self._raise_for_error_status_code(status_code) yield context async def _handle_blocked_request_by_content( self, context: ParsedHttpCrawlingContext[TParseResult] ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]: """Try to detect if the request is blocked based on the parsed response content. Args: context: The current crawling context. Raises: SessionError: If the request is considered blocked. Yields: The original crawling context if no blocking is detected. """ if self._retry_on_blocked and (blocked_info := self._parser.is_blocked(context.parsed_content)): raise SessionError(blocked_info.reason) yield context def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], Awaitable[None]]) -> None: """Register a hook to be called before each navigation. Args: hook: A coroutine function to be called before each navigation. """ self._pre_navigation_hooks.append(hook) def post_navigation_hook(self, hook: Callable[[HttpCrawlingContext], Awaitable[None]]) -> None: """Register a hook to be called after each navigation. Args: hook: A coroutine function to be called after each navigation. 
@docs_group('HTTP parsers')
class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
    """Interface for parsing an HTTP response and querying the parsed result.

    Implementations parse responses or raw text, select elements, report selector matches and
    extract links. Blocking detection is provided on top of `is_matching_selector`.
    """

    @abstractmethod
    async def parse(self, response: HttpResponse) -> TParseResult:
        """Parse an HTTP response.

        Args:
            response: HTTP response to be parsed.

        Returns:
            Parsed HTTP response.
        """

    @abstractmethod
    async def parse_text(self, text: str) -> TParseResult:
        """Parse a string containing html.

        Args:
            text: String containing html.

        Returns:
            Parsed text.
        """

    @abstractmethod
    async def select(self, parsed_content: TParseResult, selector: str) -> Sequence[TSelectResult]:
        """Locate page elements with a css selector.

        Args:
            parsed_content: Content where the page elements will be located.
            selector: Css selector used to locate the desired html elements.

        Returns:
            Sequence of the selected elements.
        """

    def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo:
        """Detect blocking and describe it in a `BlockedInfo`.

        Default implementation built on `is_matching_selector`: every selector from
        `RETRY_CSS_SELECTORS` is tried against the parsed content. Override this method if your
        parser has a different way of detecting blockage.

        Args:
            parsed_content: Parsed HTTP response. Result of `parse` method.

        Returns:
            `BlockedInfo` whose reason is a non-empty description when at least one selector matched.
            An empty reason signifies no blockage detected, which is also the result for `None` content.
        """
        if parsed_content is None:
            return BlockedInfo(reason='')

        hits = []
        for selector in RETRY_CSS_SELECTORS:
            if self.is_matching_selector(parsed_content, selector):
                hits.append(selector)

        if not hits:
            return BlockedInfo(reason='')

        return BlockedInfo(
            reason=(
                f'Assuming the session is blocked - HTTP response matched the following selectors: '
                f'{"; ".join(hits)}'
            )
        )

    @abstractmethod
    def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> bool:
        """Tell whether the selector has a match in the parsed content.

        Args:
            parsed_content: Parsed HTTP response. Result of `parse` method.
            selector: String used to define matching pattern.

        Returns:
            True if selector has match in parsed content.
        """

    @abstractmethod
    def find_links(self, parsed_content: TParseResult, selector: str, attribute: str) -> Iterable[str]:
        """Find all links in the parsed content using a selector.

        Args:
            parsed_content: Parsed HTTP response. Result of `parse` method.
            selector: String used to define matching pattern for finding links.
            attribute: Which node attribute to extract the links from.

        Returns:
            Iterable of strings that contain found links.
        """
""" parsed_content: TParseResult enqueue_links: EnqueueLinksFunction extract_links: ExtractLinksFunction @classmethod def from_http_crawling_context( cls, context: HttpCrawlingContext, parsed_content: TParseResult, enqueue_links: EnqueueLinksFunction, extract_links: ExtractLinksFunction, ) -> Self: """Initialize a new instance from an existing `HttpCrawlingContext`.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} return cls( parsed_content=parsed_content, enqueue_links=enqueue_links, extract_links=extract_links, **context_kwargs ) ================================================ FILE: src/crawlee/crawlers/_abstract_http/py.typed ================================================ ================================================ FILE: src/crawlee/crawlers/_adaptive_playwright/__init__.py ================================================ from crawlee._utils.try_import import install_import_hook as _install_import_hook from crawlee._utils.try_import import try_import as _try_import # These imports have only mandatory dependencies, so they are imported directly. from ._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightPostNavCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) _install_import_hook(__name__) # The following imports are wrapped in try_import to handle optional dependencies, # ensuring the module can still function even if these dependencies are missing. 
with _try_import(__name__, 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor'): from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor with _try_import(__name__, 'AdaptivePlaywrightCrawler'): from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler with _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'): from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState __all__ = [ 'AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlerStatisticState', 'AdaptivePlaywrightCrawlingContext', 'AdaptivePlaywrightPostNavCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor', ] ================================================ FILE: src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py ================================================ from __future__ import annotations import logging from collections.abc import Awaitable, Callable, Coroutine from copy import deepcopy from dataclasses import dataclass from logging import getLogger from random import random from typing import TYPE_CHECKING, Any, Generic, get_args from bs4 import BeautifulSoup, Tag from parsel import Selector from typing_extensions import Self, TypeVar, override from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult from crawlee._utils.docs import docs_group from crawlee._utils.wait import wait_for from crawlee.crawlers import ( AbstractHttpCrawler, AbstractHttpParser, BasicCrawler, BeautifulSoupParserType, HttpCrawlingContext, ParsedHttpCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPostNavCrawlingContext, PlaywrightPreNavCrawlingContext, ) from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser from crawlee.crawlers._parsel._parsel_parser import ParselParser from 
class _NonPersistentStatistics(Statistics):
    """Statistics compliant object that is not supposed to do anything when entering/exiting context.

    To be used in sub crawlers. Entering only marks the instance as active and initializes its
    state object; exiting only clears the active flag.
    """

    def __init__(self) -> None:
        """Create the statistics with the plain `StatisticsState` model."""
        super().__init__(state_model=StatisticsState)

    async def __aenter__(self) -> Self:
        """Mark the statistics as active, initialize the state and return the instance itself."""
        self._active = True
        await self._state.initialize()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Clear the active flag; the exception details are ignored."""
        self._active = False
def __init__(
    self,
    *,
    static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult],
    rendering_type_predictor: RenderingTypePredictor | None = None,
    result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
    result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
    playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
    statistics: Statistics[AdaptivePlaywrightCrawlerStatisticState] | None = None,
    **kwargs: Unpack[_BasicCrawlerOptions],
) -> None:
    """Initialize a new instance. Recommended way to create instance is to call factory methods.

    Recommended factory methods: `with_beautifulsoup_static_parser`, `with_parsel_static_parser`.

    Two sub crawlers (a static one built from `static_parser` and a `PlaywrightCrawler`) are created
    here only to obtain their context pipelines; hooks registered on this crawler are forwarded to
    them through wrapper hooks.

    Args:
        rendering_type_predictor: Object that implements RenderingTypePredictor and is capable of predicting which
            rendering method should be used. If None, then `DefaultRenderingTypePredictor` is used.
        result_checker: Function that evaluates whether crawling result is valid or not. If None, every result
            is accepted.
        result_comparator: Function that compares two crawling results and decides whether they are equivalent.
            If None, a default comparator is created from `result_checker`.
        static_parser: Implementation of `AbstractHttpParser`. Parser that will be used for static crawling.
        playwright_crawler_specific_kwargs: `PlaywrightCrawler` only kwargs that are passed to the sub crawler.
        statistics: A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of
            non-default configuration.
        kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. They are also forwarded
            to both sub crawlers.
    """
    # Adaptive crawling related.
    self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
    self.result_checker = result_checker or (lambda _: True)
    # The raw `result_checker` argument (possibly None) is passed here, not the defaulted `self.result_checker`.
    self.result_comparator = result_comparator or create_default_comparator(result_checker)
    # Set default concurrency settings for browser crawlers if not provided
    if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
        kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

    adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)

    super().__init__(statistics=adaptive_statistics, **kwargs)

    # Sub crawlers related.
    playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()

    # Each sub crawler will use its own logger, limited to ERROR level.
    # `**kwargs` is unpacked after `_logger`, so a `_logger` key present in kwargs takes precedence.
    static_logger = getLogger('Subcrawler_static')
    static_logger.setLevel(logging.ERROR)
    basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

    pw_logger = getLogger('Subcrawler_playwright')
    pw_logger.setLevel(logging.ERROR)
    basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

    # Initialize sub crawlers to create their pipelines.
    static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)

    # Both sub crawlers get their own `_NonPersistentStatistics` instead of the adaptive statistics.
    static_crawler = static_crawler_class(
        parser=static_parser,
        statistics=_NonPersistentStatistics(),
        **basic_crawler_kwargs_for_static_crawler,
    )
    playwright_crawler = PlaywrightCrawler(
        statistics=_NonPersistentStatistics(),
        **playwright_crawler_specific_kwargs,
        **basic_crawler_kwargs_for_pw_crawler,
    )

    # Register pre navigation hooks on sub crawlers
    self._pre_navigation_hooks = list[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]]()
    self._pre_navigation_hooks_pw_only = list[
        Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]
    ]()

    # The wrappers read the hook lists at call time, so hooks added after construction are still run.
    # A fresh adaptive pre-navigation context is created for every single hook call.
    async def adaptive_pre_navigation_hook_static(context: BasicCrawlingContext) -> None:
        for hook in self._pre_navigation_hooks:
            await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context))

    # The browser sub crawler runs the common hooks first and then the playwright-only ones.
    async def adaptive_pre_navigation_hook_pw(context: PlaywrightPreNavCrawlingContext) -> None:
        for hook in self._pre_navigation_hooks + self._pre_navigation_hooks_pw_only:
            await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context))

    static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_static)
    playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_pw)

    # Register post navigation hooks on sub crawlers
    self._post_navigation_hooks = list[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]]()
    self._post_navigation_hooks_pw_only = list[
        Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]
    ]()

    # Unlike the pre-navigation wrappers, one adaptive context is built up front and shared by all hooks.
    async def adaptive_post_navigation_hook_static(context: HttpCrawlingContext) -> None:
        adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)
        for hook in self._post_navigation_hooks:
            await hook(adaptive_context)

    async def adaptive_post_navigation_hook_pw(context: PlaywrightPostNavCrawlingContext) -> None:
        adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)
        for hook in self._post_navigation_hooks + self._post_navigation_hooks_pw_only:
            await hook(adaptive_context)

    static_crawler.post_navigation_hook(adaptive_post_navigation_hook_static)
    playwright_crawler.post_navigation_hook(adaptive_post_navigation_hook_pw)

    # Extend the already existing list (presumably set up by the base initializer - confirm) with the
    # predictor, both sub crawler statistics and the browser pool of the playwright sub crawler.
    self._additional_context_managers = [
        *self._additional_context_managers,
        self.rendering_type_predictor,
        static_crawler.statistics,
        playwright_crawler.statistics,
        playwright_crawler._browser_pool,  # noqa: SLF001 # Intentional access to private member.
    ]

    # Sub crawler pipeline related. Only the pipelines and the parser are kept on the instance.
    self._pw_context_pipeline = playwright_crawler._context_pipeline  # noqa:SLF001  # Intentional access to private member.
    self._static_context_pipeline = static_crawler._context_pipeline  # noqa:SLF001  # Intentional access to private member.
    self._static_parser = static_parser
result_comparator=result_comparator, static_parser=BeautifulSoupParser(parser=parser_type), playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, statistics=adaptive_statistics, **kwargs, ) @classmethod def with_parsel_static_parser( cls, rendering_type_predictor: RenderingTypePredictor | None = None, result_checker: Callable[[RequestHandlerRunResult], bool] | None = None, result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None, playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None, statistics: Statistics[StatisticsState] | None = None, **kwargs: Unpack[_BasicCrawlerOptions], ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector]: """Create `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content.""" if statistics is not None: adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState) else: adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState) return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector]( rendering_type_predictor=rendering_type_predictor, result_checker=result_checker, result_comparator=result_comparator, static_parser=ParselParser(), playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs, statistics=adaptive_statistics, **kwargs, ) async def _crawl_one( self, rendering_type: RenderingType, context: BasicCrawlingContext, state: dict[str, JsonSerializable] | None = None, ) -> SubCrawlerRun: """Perform a one request crawl with specific context pipeline and return `SubCrawlerRun`. `SubCrawlerRun` contains either result of the crawl or the exception that was thrown during the crawl. Sub crawler pipeline call is dynamically created based on the `rendering_type`. New copy-like context is created from passed `context` and `state` and is passed to sub crawler pipeline. 
""" if state is not None: async def get_input_state( default_value: dict[str, JsonSerializable] | None = None, # noqa:ARG001 # Intentionally unused arguments. Closure, that generates same output regardless of inputs. ) -> dict[str, JsonSerializable]: return state use_state_function = get_input_state else: use_state_function = context.use_state # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers. result = RequestHandlerRunResult( key_value_store_getter=self.get_key_value_store, request=context.request, ) context_linked_to_result = BasicCrawlingContext( request=result.request, session=context.session, proxy_info=context.proxy_info, send_request=context.send_request, add_requests=result.add_requests, push_data=result.push_data, get_key_value_store=result.get_key_value_store, use_state=use_state_function, log=context.log, ) try: await wait_for( lambda: self._pipeline_call_factory( rendering_type=rendering_type, context_linked_to_result=context_linked_to_result ), timeout=self._request_handler_timeout, timeout_message=( f'{rendering_type=!s} timed out after {self._request_handler_timeout.total_seconds()}seconds' ), logger=self._logger, ) return SubCrawlerRun(result=result) except Exception as e: return SubCrawlerRun(exception=e) def _pipeline_call_factory( self, rendering_type: RenderingType, context_linked_to_result: BasicCrawlingContext ) -> Coroutine[Any, Any, None]: """Create sub crawler pipeline call.""" if rendering_type == 'static': async def from_static_pipeline_to_top_router( context: ParsedHttpCrawlingContext[TStaticParseResult], ) -> None: adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context( context=context, parser=self._static_parser ) await self.router(adaptive_crawling_context) return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router) # ty: ignore[invalid-argument-type] if rendering_type == 'client only': async 
def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> None: adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context( context=context, parser=self._static_parser ) await self.router(adaptive_crawling_context) return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router) # ty: ignore[invalid-argument-type] raise RuntimeError( f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}' ) @override async def _run_request_handler(self, context: BasicCrawlingContext) -> None: """Override BasicCrawler method that delegates request processing to sub crawlers. To decide which sub crawler should process the request it runs `rendering_type_predictor`. To check if results are valid it uses `result_checker`. To compare results of both sub crawlers it uses `result_comparator`. Reference implementation: https://github.com/apify/crawlee/blob/master/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts """ rendering_type_prediction = self.rendering_type_predictor.predict(context.request) should_detect_rendering_type = random() < rendering_type_prediction.detection_probability_recommendation if not should_detect_rendering_type: self.log.debug( f'Predicted rendering type {rendering_type_prediction.rendering_type} for {context.request.url}' ) if rendering_type_prediction.rendering_type == 'static': context.log.debug(f'Running static request for {context.request.url}') self.track_http_only_request_handler_runs() static_run = await self._crawl_one(rendering_type='static', context=context) if static_run.result and self.result_checker(static_run.result): self._context_result_map[context] = static_run.result return if static_run.exception: context.log.exception( msg=f'Static crawler: failed for {context.request.url}', exc_info=static_run.exception ) else: context.log.warning(f'Static crawler: returned a suspicious result for 
def pre_navigation_hook(
    self,
    hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
    *,
    playwright_only: bool = False,
) -> Callable[[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]], None]:
    """Register a pre navigation hook; usable as a plain or as a parametrized decorator.

    Hooks are stored in `self._pre_navigation_hooks`, or in `self._pre_navigation_hooks_pw_only` when
    `playwright_only` is set.

    Args:
        hook: Hook to register right away. Leave empty to get a registering function back instead.
        playwright_only: Store the hook in the Playwright-only collection.

    Returns:
        Function that registers the hook passed to it and returns `None`.
    """

    def register(
        new_hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]],
    ) -> None:
        # The target collection is resolved at registration time, not when the decorator is created.
        registry = self._pre_navigation_hooks_pw_only if playwright_only else self._pre_navigation_hooks
        registry.append(new_hook)

    # Plain decorator usage: the hook is already known, so it is registered immediately.
    if hook:
        register(hook)

    # Parametrized usage: the decorator syntax calls the returned function with the hook.
    return register
@dataclass(frozen=True)
class SubCrawlerRun:
    """Outcome of a single sub crawler run.

    Holds either the result of the crawl or the exception that was thrown during the crawl.
    """

    # Result collected during the crawl. Left as `None` when the run ended with an exception.
    result: RequestHandlerRunResult | None = None
    # Exception caught during the crawl. Left as `None` when the run finished successfully.
    exception: Exception | None = None
class AdaptiveContextError(RuntimeError):
    """Error raised by the adaptive crawling contexts.

    It is raised, for example, when `page`, `response` or `infinite_scroll` is accessed on a context that does
    not hold the corresponding Playwright object.
    """

    pass
async def query_selector_all(
    self, selector: str, timeout: timedelta = timedelta(seconds=5)
) -> Sequence[TStaticSelectResult]:
    """Locate elements by css selector and return all elements found.

    The statically parsed content is searched first. Only if nothing is found there, the Playwright page is asked
    to wait for the selector and the located element is parsed and selected with the static parser.

    Args:
        selector: Css selector to be used to locate specific element on page.
        timeout: Timeout that defines how long the function wait for the selector to appear.

    Returns:
        Results of used static parser `select` method. An empty tuple is returned if the selector was not found
        in the static content and did not appear on the page within `timeout`.

    Raises:
        AdaptiveContextError: If the element was located by Playwright, but the static parser returned `None`
            for it. Accessing `self.page` raises the same error when no Playwright page is available.
    """
    if static_content := await self._static_parser.select(await self.parse_with_static_parser(), selector):
        # Selector found in static content.
        return static_content

    locator = self.page.locator(selector)
    try:
        # Playwright expects the timeout in milliseconds.
        await locator.wait_for(timeout=timeout.total_seconds() * 1000)
    except PlaywrightTimeoutError:
        # Selector not found at all.
        return ()

    parsed_selector = await self._static_parser.select(
        await self._static_parser.parse_text(await locator.evaluate('el => el.outerHTML')), selector
    )
    if parsed_selector is not None:
        # Selector found by browser after some wait time and selected by static parser.
        return parsed_selector

    # Selector found by browser after some wait time, but could not be selected by static parser.
    raise AdaptiveContextError(
        'Element exists on the page and Playwright was able to locate it, but the static content parser of selected '
        'static crawler does not support such selector.'
    )
""" if selector: await self.wait_for_selector(selector, timeout) if self._page: return await self._static_parser.parse_text(await self.page.content()) return self.parsed_content @classmethod def from_parsed_http_crawling_context( cls, context: ParsedHttpCrawlingContext[TStaticParseResult], parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult], ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]: """Initialize a new instance from an existing `ParsedHttpCrawlingContext`.""" return cls(_static_parser=parser, **{field.name: getattr(context, field.name) for field in fields(context)}) @classmethod async def from_playwright_crawling_context( cls, context: PlaywrightCrawlingContext, parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult], ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]: """Initialize a new instance from an existing `PlaywrightCrawlingContext`.""" context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} # Remove playwright specific attributes and pass them as private instead to be available as property. context_kwargs['_response'] = context_kwargs.pop('response') context_kwargs['_page'] = context_kwargs.pop('page') context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll') # This might not be always available. protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0]?.nextHopProtocol') http_response = await PlaywrightHttpResponse.from_playwright_response( response=context.response, protocol=protocol_guess or '' ) # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here. 
@classmethod
def from_pre_navigation_context(cls, context: BasicCrawlingContext) -> Self:
    """Create a new instance out of an existing pre-navigation `BasicCrawlingContext`.

    All dataclass fields of `context` are copied. A `page` field, if present, is stored as `_page`. When the
    context has no `block_requests` field, a function doing nothing is used instead.
    """

    # Replacement of block requests for the static sub crawler. It accepts the same arguments and does nothing.
    async def dummy_block_requests(
        url_patterns: list[str] | None = None,  # noqa:ARG001
        extra_url_patterns: list[str] | None = None,  # noqa:ARG001
    ) -> None:
        return

    init_kwargs = {}
    for context_field in fields(context):
        init_kwargs[context_field.name] = getattr(context, context_field.name)

    page = init_kwargs.pop('page', None)
    block_requests = init_kwargs.pop('block_requests', dummy_block_requests)
    init_kwargs.update(_page=page, block_requests=block_requests)
    return cls(**init_kwargs)
class RenderingTypePredictorState(BaseModel):
    """State of `DefaultRenderingTypePredictor` that is kept in its `RecoverableState`."""

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # Logistic regression model used for the predictions. It is validated by `sklearn_model_validator` and
    # serialized by `sklearn_model_serializer`.
    # NOTE(review): `Field(LogisticRegression)` appears to set the class itself as the field default - confirm
    # that this is intended.
    model: Annotated[
        LogisticRegression,
        Field(LogisticRegression),
        PlainValidator(sklearn_model_validator),
        PlainSerializer(sklearn_model_serializer),
    ]

    # Multiplier of the detection probability recommendation, stored per request label.
    labels_coefficients: Annotated[defaultdict[str, float], Field(alias='labelsCoefficients')]
@docs_group('Other')
class RenderingTypePredictor(ABC):
    """Base class of predictors that store known rendering types and predict them for unvisited urls."""

    def __init__(self) -> None:
        """Create the predictor in the inactive state."""
        # Switched on by `initialize` and switched off again by `clear`.
        self._active = False

    @abstractmethod
    def predict(self, request: Request) -> RenderingTypePrediction:
        """Get `RenderingTypePrediction` based on the input request.

        Args:
            request: `Request` instance for which the prediction is made.
        """

    @abstractmethod
    def store_result(self, request: Request, rendering_type: RenderingType) -> None:
        """Store prediction results and retrain the model.

        Args:
            request: Used request.
            rendering_type: Known suitable `RenderingType`.
        """

    async def initialize(self) -> None:
        """Mark the predictor as active.

        Raises:
            RuntimeError: If the predictor is already active.
        """
        if not self._active:
            self._active = True
            return
        raise RuntimeError(f'The {self.__class__.__name__} is already active.')

    async def clear(self) -> None:
        """Mark the predictor as inactive.

        Raises:
            RuntimeError: If the predictor is not active.
        """
        if self._active:
            self._active = False
            return
        raise RuntimeError(f'The {self.__class__.__name__} is not active.')

    async def __aenter__(self) -> RenderingTypePredictor:
        """Initialize the predictor when the context manager is entered and return it."""
        await self.initialize()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Clear the predictor when the context manager is left."""
        await self.clear()
""" super().__init__() self._rendering_type_detection_results: dict[RenderingType, dict[str, list[UrlComponents]]] = { 'static': defaultdict(list), 'client only': defaultdict(list), } self._detection_ratio = max(0, min(1, detection_ratio)) # Used to increase detection probability recommendation for initial recommendations of each label. # Reaches 1 (no additional increase) after n samples of specific label is already present in # `self._rendering_type_detection_results`. n = 3 self._state = RecoverableState( default_state=RenderingTypePredictorState( model=LogisticRegression(max_iter=1000), labels_coefficients=defaultdict(lambda: n + 2) ), persist_state_key=persist_state_key, persistence_enabled=persistence_enabled, logger=logger, ) @override async def initialize(self) -> None: """Get current state of the predictor.""" await super().initialize() if not self._state.is_initialized: await self._state.initialize() @override async def clear(self) -> None: """Clear the predictor state.""" await super().clear() if self._state.is_initialized: await self._state.teardown() @override def predict(self, request: Request) -> RenderingTypePrediction: """Get `RenderingTypePrediction` based on the input request. Args: request: `Request` instance for which the prediction is made. """ similarity_threshold = 0.1 # Prediction probability difference threshold to consider prediction unreliable. label = request.label or '' # Check that the model has already been fitted. if hasattr(self._state.current_value.model, 'coef_'): url_feature = self._calculate_feature_vector(get_url_components(request.url), label) # Are both calls expensive? prediction = self._state.current_value.model.predict([url_feature])[0] probability = self._state.current_value.model.predict_proba([url_feature])[0] if abs(probability[0] - probability[1]) < similarity_threshold: # Prediction not reliable. 
detection_probability_recommendation = 1.0 else: detection_probability_recommendation = self._detection_ratio # Increase recommendation for uncommon labels. detection_probability_recommendation *= self._state.current_value.labels_coefficients[label] return RenderingTypePrediction( rendering_type=('client only', 'static')[int(prediction)], detection_probability_recommendation=detection_probability_recommendation, ) # No data available yet. return RenderingTypePrediction(rendering_type='client only', detection_probability_recommendation=1) @override def store_result(self, request: Request, rendering_type: RenderingType) -> None: """Store prediction results and retrain the model. Args: request: Used `Request` instance. rendering_type: Known suitable `RenderingType` for the used `Request` instance. """ label = request.label or '' self._rendering_type_detection_results[rendering_type][label].append(get_url_components(request.url)) if self._state.current_value.labels_coefficients[label] > 1: self._state.current_value.labels_coefficients[label] -= 1 self._retrain() def _retrain(self) -> None: x: list[FeatureVector] = [(0, 1), (1, 0)] y: list[float] = [0, 1] for rendering_type, urls_by_label in self._rendering_type_detection_results.items(): encoded_rendering_type = 1 if rendering_type == 'static' else 0 for label, urls in urls_by_label.items(): for url_components in urls: x.append(self._calculate_feature_vector(url_components, label)) y.append(encoded_rendering_type) self._state.current_value.model.fit(x, y) def _calculate_mean_similarity(self, url: UrlComponents, label: str, rendering_type: RenderingType) -> float: if not self._rendering_type_detection_results[rendering_type][label]: return 0 return mean( calculate_url_similarity(url, known_url_components) for known_url_components in self._rendering_type_detection_results[rendering_type][label] ) def _calculate_feature_vector(self, url: UrlComponents, label: str) -> tuple[float, float]: return ( 
def calculate_url_similarity(url_1: UrlComponents, url_2: UrlComponents) -> float:
    """Calculate url similarity based on host name and path components similarity.

    Each pair of path components is compared with the Jaro-Winkler metric and valued 1 or 0 based on
    `similarity_cutoff`. The mean of these values is returned.

    Args:
        url_1: Components of the first url. The first component is the host name.
        url_2: Components of the second url. The first component is the host name.

    Returns:
        0 if any of the inputs is empty or if the host names differ, 1 if the inputs are equal, otherwise the
        mean of the path component values.
    """
    # Anything with jaro_winkler_metric less than this value is considered completely different,
    # otherwise considered the same.
    similarity_cutoff = 0.8

    # The emptiness checks have to come first. Indexing an empty list would raise `IndexError`.
    if not url_1 or not url_2 or url_1[0] != url_2[0]:
        return 0

    if url_1 == url_2:
        return 1

    # Each additional path component from longer path is compared to empty string.
    return mean(
        1 if jaro_winkler_metric(path_1, path_2) > similarity_cutoff else 0
        for path_1, path_2 in zip_longest(url_1[1:], url_2[1:], fillvalue='')
    )
def full_result_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool:
    """Compare results by comparing all their parts.

    The results are considered the same only if their `push_data_calls`, `add_requests_calls` and
    `key_value_store_changes` are all equal. The parts are compared in this order and the comparison stops at
    the first difference.

    Comparison of `add_requests_calls` will consider same url requests with different parameters as different
    For example following two request will be considered as different requests:
    https://sdk.apify.com/docs/guides/getting-started
    https://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.173549427712
    """
    same_push_data = result_1.push_data_calls == result_2.push_data_calls
    if not same_push_data:
        return same_push_data

    same_requests = result_1.add_requests_calls == result_2.add_requests_calls
    if not same_requests:
        return same_requests

    return result_1.key_value_store_changes == result_2.key_value_store_changes
def sklearn_model_validator(v: LogisticRegression | dict[str, Any]) -> LogisticRegression:
    """Create `LogisticRegression` model out of its serialized form.

    Args:
        v: Either a model instance, which is returned as it is, or a dictionary in the format produced by
            `sklearn_model_serializer`.

    Returns:
        Model created with the serialized `max_iter` and `solver`. If the dictionary is marked with `is_fitted`,
        the fitted attributes (`coef_`, `intercept_`, `classes_`, `n_iter_`) are restored as well.
    """
    if isinstance(v, LogisticRegression):
        return v

    # Restore the hyper-parameters stored by the serializer. The fallbacks keep older states, that may miss
    # these keys, working: 1000 is the value used before and 'lbfgs' is the scikit-learn default solver.
    model = LogisticRegression(max_iter=v.get('max_iter', 1000), solver=v.get('solver', 'lbfgs'))

    if v.get('is_fitted', False):
        model.coef_ = np.array(v['coef'])
        model.intercept_ = np.array(v['intercept'])
        model.classes_ = np.array(v['classes'])
        model.n_iter_ = np.array(v.get('n_iter', [1000]))

    return model
tempfile import threading import traceback from asyncio import CancelledError from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence from contextlib import AsyncExitStack, suppress from datetime import timedelta from functools import partial from io import StringIO from pathlib import Path from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast from urllib.parse import ParseResult, urlparse from weakref import WeakKeyDictionary from cachetools import LRUCache from tldextract import TLDExtract from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never from yarl import URL from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locator from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level from crawlee._request import Request, RequestOptions, RequestState from crawlee._service_locator import ServiceLocator from crawlee._types import ( BasicCrawlingContext, EnqueueLinksKwargs, ExportDataCsvKwargs, ExportDataJsonKwargs, GetKeyValueStoreFromRequestHandlerFunction, HttpHeaders, HttpPayload, LogLevel, RequestHandlerRunResult, SendRequestFunction, SkippedReason, ) from crawlee._utils.docs import docs_group from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream from crawlee._utils.recurring_task import RecurringTask from crawlee._utils.robots import RobotsTxtFile from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee._utils.wait import wait_for from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error from crawlee.errors import ( ContextPipelineInitializationError, ContextPipelineInterruptedError, HttpClientStatusCodeError, HttpStatusCodeError, RequestCollisionError, RequestHandlerError, SessionError, UserDefinedErrorHandlerError, UserHandlerTimeoutError, ) from 
class _BasicCrawlerOptions(TypedDict):
    """Non-generic options for the `BasicCrawler` constructor.

    Every key is declared `NotRequired` and has the same name as a keyword parameter of `BasicCrawler.__init__`.
    Options whose types depend on the crawling-context or statistics-state type variables are declared separately
    in `_BasicCrawlerOptionsGeneric`.
    """

    configuration: NotRequired[Configuration]
    """The `Configuration` instance. Some of its properties are used as defaults for the crawler."""

    event_manager: NotRequired[EventManager]
    """The event manager for managing events for the crawler and all its components."""

    storage_client: NotRequired[StorageClient]
    """The storage client for managing storages for the crawler and all its components."""

    request_manager: NotRequired[RequestManager]
    """Manager of requests that should be processed by the crawler."""

    session_pool: NotRequired[SessionPool]
    """A custom `SessionPool` instance, allowing the use of non-default configuration."""

    proxy_configuration: NotRequired[ProxyConfiguration]
    """HTTP proxy configuration used when making requests."""

    http_client: NotRequired[HttpClient]
    """HTTP client used by `BasicCrawlingContext.send_request` method."""

    max_request_retries: NotRequired[int]
    """Specifies the maximum number of retries allowed for a request if its processing fails. This includes retries due to navigation errors or errors thrown from user-supplied functions (`request_handler`, `pre_navigation_hooks` etc.). This limit does not apply to retries triggered by session rotation (see `max_session_rotations`)."""

    max_requests_per_crawl: NotRequired[int | None]
    """Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed this value."""

    max_session_rotations: NotRequired[int]
    """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs or if the website blocks the request. The session rotations are not counted towards the `max_request_retries` limit. """

    max_crawl_depth: NotRequired[int | None]
    """Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level of links. Requests at the maximum depth will still be processed, but no new links will be enqueued from those requests. If not set, crawling continues without depth restrictions. """

    use_session_pool: NotRequired[bool]
    """Enable the use of a session pool for managing sessions during crawling."""

    retry_on_blocked: NotRequired[bool]
    """If True, the crawler attempts to bypass bot protections automatically."""

    concurrency_settings: NotRequired[ConcurrencySettings]
    """Settings to fine-tune concurrency levels."""

    request_handler_timeout: NotRequired[timedelta]
    """Maximum duration allowed for a single request handler to run."""

    abort_on_error: NotRequired[bool]
    """If True, the crawler stops immediately when any request handler error occurs."""

    configure_logging: NotRequired[bool]
    """If True, the crawler will set up logging infrastructure automatically."""

    statistics_log_format: NotRequired[Literal['table', 'inline']]
    """If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain text log messages. """

    keep_alive: NotRequired[bool]
    """Flag that can keep crawler running even when there are no requests in queue."""

    additional_http_error_status_codes: NotRequired[Iterable[int]]
    """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

    ignore_http_error_status_codes: NotRequired[Iterable[int]]
    """HTTP status codes that are typically considered errors but should be treated as successful responses."""

    # The two underscore-prefixed keys below are hooks for subclasses, not part of the user-facing options.
    _additional_context_managers: NotRequired[Sequence[AbstractAsyncContextManager]]
    """Additional context managers used throughout the crawler lifecycle. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`."""

    _logger: NotRequired[logging.Logger]
    """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`."""

    respect_robots_txt_file: NotRequired[bool]
    """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`."""

    status_message_logging_interval: NotRequired[timedelta]
    """Interval for logging the crawler status messages."""

    status_message_callback: NotRequired[
        Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]]
    ]
    """Allows overriding the default status message. The default status message is provided in the parameters. Returning `None` suppresses the status message."""

    id: NotRequired[int]
    """Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between them."""
    The `BasicCrawler` provides low-level functionality for crawling websites, allowing users to define their own
    page download and data extraction logic. It is designed mostly to be subclassed by crawlers with specific
    purposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawler`,
    `BeautifulSoupCrawler`, `ParselCrawler`, or `PlaywrightCrawler`. If you are an advanced user and want full
    control over the crawling process, you can subclass the `BasicCrawler` and implement the request-handling logic
    yourself.

    The crawling process begins with URLs provided by a `RequestManager` instance. Each request is then handled by
    a user-defined `request_handler` function, which processes the page and extracts the data.

    The `BasicCrawler` includes several common features for crawling, such as:
        - automatic scaling based on the system resources,
        - retries for failed requests,
        - session management,
        - statistics tracking,
        - request routing via labels,
        - proxy rotation,
        - direct storage interaction helpers,
        - and more.
""" _CRAWLEE_STATE_KEY = 'CRAWLEE_STATE' _request_handler_timeout_text = 'Request handler timed out after' __next_id = 0 def __init__( self, *, configuration: Configuration | None = None, event_manager: EventManager | None = None, storage_client: StorageClient | None = None, request_manager: RequestManager | None = None, session_pool: SessionPool | None = None, proxy_configuration: ProxyConfiguration | None = None, http_client: HttpClient | None = None, request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None, max_request_retries: int = 3, max_requests_per_crawl: int | None = None, max_session_rotations: int = 10, max_crawl_depth: int | None = None, use_session_pool: bool = True, retry_on_blocked: bool = True, additional_http_error_status_codes: Iterable[int] | None = None, ignore_http_error_status_codes: Iterable[int] | None = None, concurrency_settings: ConcurrencySettings | None = None, request_handler_timeout: timedelta = timedelta(minutes=1), statistics: Statistics[TStatisticsState] | None = None, abort_on_error: bool = False, keep_alive: bool = False, configure_logging: bool = True, statistics_log_format: Literal['table', 'inline'] = 'table', respect_robots_txt_file: bool = False, status_message_logging_interval: timedelta = timedelta(seconds=10), status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] | None = None, id: int | None = None, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None, _logger: logging.Logger | None = None, ) -> None: """Initialize a new instance. Args: configuration: The `Configuration` instance. Some of its properties are used as defaults for the crawler. event_manager: The event manager for managing events for the crawler and all its components. storage_client: The storage client for managing storages for the crawler and all its components. 
request_manager: Manager of requests that should be processed by the crawler. session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration. proxy_configuration: HTTP proxy configuration used when making requests. http_client: HTTP client used by `BasicCrawlingContext.send_request` method. request_handler: A callable responsible for handling requests. max_request_retries: Specifies the maximum number of retries allowed for a request if its processing fails. This includes retries due to navigation errors or errors thrown from user-supplied functions (`request_handler`, `pre_navigation_hooks` etc.). This limit does not apply to retries triggered by session rotation (see `max_session_rotations`). max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed this value. If used together with `keep_alive`, then the crawler will be kept alive only until `max_requests_per_crawl` is achieved. max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs or if the website blocks the request. The session rotations are not counted towards the `max_request_retries` limit. max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level of links. Requests at the maximum depth will still be processed, but no new links will be enqueued from those requests. If not set, crawling continues without depth restrictions. use_session_pool: Enable the use of a session pool for managing sessions during crawling. retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically. 
additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering automatic retries when encountered. ignore_http_error_status_codes: HTTP status codes that are typically considered errors but should be treated as successful responses. concurrency_settings: Settings to fine-tune concurrency levels. request_handler_timeout: Maximum duration allowed for a single request handler to run. statistics: A custom `Statistics` instance, allowing the use of non-default configuration. abort_on_error: If True, the crawler stops immediately when any request handler error occurs. keep_alive: If True, it will keep crawler alive even if there are no requests in queue. Use `crawler.stop()` to exit the crawler. configure_logging: If True, the crawler will set up logging infrastructure automatically. statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain text log messages. respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction` status_message_logging_interval: Interval for logging the crawler status messages. status_message_callback: Allows overriding the default status message. The default status message is provided in the parameters. Returning `None` suppresses the status message. id: Identifier used for crawler state tracking. Use the same id across multiple crawlers to share state between them. _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. _additional_context_managers: Additional context managers used throughout the crawler lifecycle. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. 
_logger: A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. """ if id is None: self._id = BasicCrawler.__next_id BasicCrawler.__next_id += 1 else: self._id = id implicit_event_manager_with_explicit_config = False if not configuration: configuration = service_locator.get_configuration() elif not event_manager: implicit_event_manager_with_explicit_config = True if not storage_client: storage_client = service_locator.get_storage_client() if not event_manager: event_manager = service_locator.get_event_manager() self._service_locator = ServiceLocator( configuration=configuration, storage_client=storage_client, event_manager=event_manager ) config = self._service_locator.get_configuration() # Core components self._request_manager = request_manager self._session_pool = session_pool or SessionPool() self._proxy_configuration = proxy_configuration self._additional_http_error_status_codes = ( set(additional_http_error_status_codes) if additional_http_error_status_codes else set() ) self._ignore_http_error_status_codes = ( set(ignore_http_error_status_codes) if ignore_http_error_status_codes else set() ) self._http_client = http_client or ImpitHttpClient() # Request router setup self._router: Router[TCrawlingContext] | None = None if isinstance(cast('Router', request_handler), Router): self._router = cast('Router[TCrawlingContext]', request_handler) elif request_handler is not None: self._router = None self.router.default_handler(request_handler) # Error, failed & skipped request handlers self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None self._on_skipped_request: SkippedRequestCallback | None = None self._abort_on_error = abort_on_error # Crawler callbacks self._status_message_callback = status_message_callback # 
Context of each request with matching result of request handler. # Inheritors can use this to override the result of individual request handler runs in `_run_request_handler`. self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]() # Context pipeline self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects) # ty: ignore[invalid-argument-type] # Crawl settings self._max_request_retries = max_request_retries self._max_requests_per_crawl = max_requests_per_crawl self._max_session_rotations = max_session_rotations self._max_crawl_depth = max_crawl_depth self._respect_robots_txt_file = respect_robots_txt_file # Timeouts self._request_handler_timeout = request_handler_timeout self._internal_timeout = ( config.internal_timeout if config.internal_timeout is not None else max(2 * request_handler_timeout, timedelta(minutes=5)) ) # Retry and session settings self._use_session_pool = use_session_pool self._retry_on_blocked = retry_on_blocked # Logging setup if configure_logging: root_logger = logging.getLogger() configure_logger(root_logger, remove_old_handlers=True) httpx_logger = logging.getLogger('httpx') # Silence HTTPX logger httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING) self._logger = _logger or logging.getLogger(__name__) if implicit_event_manager_with_explicit_config: self._logger.warning( 'No event manager set, implicitly using event manager from global service_locator.' 'It is advised to explicitly set the event manager if explicit configuration is used as well.' 
) self._statistics_log_format = statistics_log_format # Statistics if statistics: self._statistics = statistics else: async def persist_state_factory() -> KeyValueStore: return await self.get_key_value_store() self._statistics = cast( 'Statistics[TStatisticsState]', Statistics.with_default_state( persistence_enabled=True, periodic_message_logger=self._logger, statistics_log_format=self._statistics_log_format, log_message='Current request statistics:', persist_state_kvs_factory=persist_state_factory, ), ) # Additional context managers to enter and exit self._additional_context_managers = _additional_context_managers or [] # Internal, not explicitly configurable components self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) self._robots_txt_lock = asyncio.Lock() self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name) self._snapshotter = Snapshotter.from_config(config) self._autoscaled_pool = AutoscaledPool( system_status=SystemStatus(self._snapshotter), concurrency_settings=concurrency_settings, is_finished_function=self.__is_finished_function, is_task_ready_function=self.__is_task_ready_function, run_task_function=self.__run_task_function, ) self._crawler_state_rec_task = RecurringTask( func=self._crawler_state_task, delay=status_message_logging_interval ) self._previous_crawler_state: TStatisticsState | None = None # State flags self._keep_alive = keep_alive self._running = False self._has_finished_before = False self._failed = False self._unexpected_stop = False @property def log(self) -> logging.Logger: """The logger used by the crawler.""" return self._logger @property def router(self) -> Router[TCrawlingContext]: """The `Router` used to handle each individual crawling request.""" if self._router is None: self._router = Router[TCrawlingContext]() return self._router @router.setter def router(self, router: Router[TCrawlingContext]) -> None: if self._router is not None: raise RuntimeError('A router is already 
set') self._router = router @property def statistics(self) -> Statistics[TStatisticsState]: """Statistics about the current (or last) crawler run.""" return self._statistics def stop(self, reason: str = 'Stop was called externally.') -> None: """Set flag to stop crawler. This stops current crawler run regardless of whether all requests were finished. Args: reason: Reason for stopping that will be used in logs. """ self._logger.info(f'Crawler.stop() was called with following reason: {reason}.') self._unexpected_stop = True def _wrap_handler_with_error_context( self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]] ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]: """Decorate error handlers to make their context helpers usable.""" @functools.wraps(handler) async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T: # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request # failed. Modified context provides context helpers with direct access to the storages. error_context = context.create_modified_copy( push_data=self._push_data, get_key_value_store=self.get_key_value_store, add_requests=functools.partial(self._add_requests, context), ) return await handler(error_context, exception) return wrapped_handler def _stop_if_max_requests_count_exceeded(self) -> None: """Call `stop` when the maximum number of requests to crawl has been reached.""" if self._max_requests_per_crawl is None: return if self._statistics.state.requests_total >= self._max_requests_per_crawl: self.stop( reason=f'The crawler has reached its limit of {self._max_requests_per_crawl} requests per crawl. 
' ) async def _get_session(self) -> Session | None: """If session pool is being used, try to take a session from it.""" if not self._use_session_pool: return None return await wait_for( self._session_pool.get_session, timeout=self._internal_timeout, timeout_message='Fetching a session from the pool timed out after ' f'{self._internal_timeout.total_seconds()} seconds', max_retries=3, logger=self._logger, ) async def _get_session_by_id(self, session_id: str | None) -> Session | None: """If session pool is being used, try to take a session by id from it.""" if not self._use_session_pool or not session_id: return None return await wait_for( partial(self._session_pool.get_session_by_id, session_id), timeout=self._internal_timeout, timeout_message='Fetching a session from the pool timed out after ' f'{self._internal_timeout.total_seconds()} seconds', max_retries=3, logger=self._logger, ) async def _get_proxy_info(self, request: Request, session: Session | None) -> ProxyInfo | None: """Retrieve a new ProxyInfo object based on crawler configuration and the current request and session.""" if not self._proxy_configuration: return None return await self._proxy_configuration.new_proxy_info( session_id=session.id if session else None, request=request, proxy_tier=None, ) async def get_request_manager(self) -> RequestManager: """Return the configured request manager. If none is configured, open and return the default request queue.""" if not self._request_manager: self._request_manager = await RequestQueue.open( storage_client=self._service_locator.get_storage_client(), configuration=self._service_locator.get_configuration(), ) return self._request_manager async def get_dataset( self, *, id: str | None = None, name: str | None = None, alias: str | None = None, ) -> Dataset: """Return the `Dataset` with the given ID or name. 
If none is provided, return the default one.""" return await Dataset.open( id=id, name=name, alias=alias, storage_client=self._service_locator.get_storage_client(), configuration=self._service_locator.get_configuration(), ) async def get_key_value_store( self, *, id: str | None = None, name: str | None = None, alias: str | None = None, ) -> KeyValueStore: """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS.""" return await KeyValueStore.open( id=id, name=name, alias=alias, storage_client=self._service_locator.get_storage_client(), configuration=self._service_locator.get_configuration(), ) def error_handler( self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] ) -> ErrorHandler[TCrawlingContext]: """Register a function to handle errors occurring in request handlers. The error handler is invoked after a request handler error occurs and before a retry attempt. """ self._error_handler = self._wrap_handler_with_error_context(handler) return handler def failed_request_handler( self, handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] ) -> FailedRequestHandler[TCrawlingContext]: """Register a function to handle requests that exceed the maximum retry limit. The failed request handler is invoked when a request has failed all retry attempts. """ self._failed_request_handler = self._wrap_handler_with_error_context(handler) return handler def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback: """Register a function to handle skipped requests. The skipped request handler is invoked when a request is skipped due to a collision or other reasons. """ self._on_skipped_request = callback return callback async def run( self, requests: Sequence[str | Request] | None = None, *, purge_request_queue: bool = True, ) -> FinalStatistics: """Run the crawler until all requests are processed. Args: requests: The requests to be enqueued before the crawler starts. 
purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default request queue will be purged. """ if self._running: raise RuntimeError( 'This crawler instance is already running, you can add more requests to it via `crawler.add_requests()`' ) self._running = True if self._has_finished_before: await self._statistics.reset() if self._use_session_pool: await self._session_pool.reset_store() request_manager = await self.get_request_manager() if purge_request_queue and isinstance(request_manager, RequestQueue): await request_manager.drop() self._request_manager = await RequestQueue.open( storage_client=self._service_locator.get_storage_client(), configuration=self._service_locator.get_configuration(), ) if requests is not None: await self.add_requests(requests) interrupted = False def sigint_handler() -> None: nonlocal interrupted if not interrupted: interrupted = True self._logger.info('Pausing... Press CTRL+C again to force exit.') run_task.cancel() run_task = asyncio.create_task(self._run_crawler(), name='run_crawler_task') if threading.current_thread() is threading.main_thread(): # `add_signal_handler` works only in the main thread with suppress(NotImplementedError): # event loop signal handlers are not supported on Windows asyncio.get_running_loop().add_signal_handler(signal.SIGINT, sigint_handler) try: await run_task except CancelledError: pass finally: if threading.current_thread() is threading.main_thread(): with suppress(NotImplementedError): asyncio.get_running_loop().remove_signal_handler(signal.SIGINT) if self._statistics.error_tracker.total > 0: self._logger.info( 'Error analysis:' f' total_errors={self._statistics.error_tracker.total}' f' unique_errors={self._statistics.error_tracker.unique_error_count}' ) if interrupted: self._logger.info( f'The crawl was interrupted. 
To resume, do: CRAWLEE_PURGE_ON_START=0 python {sys.argv[0]}' ) self._running = False self._has_finished_before = True await self._save_crawler_state() final_statistics = self._statistics.calculate() if self._statistics_log_format == 'table': self._logger.info(f'Final request statistics:\n{final_statistics.to_table()}') else: self._logger.info('Final request statistics:', extra=final_statistics.to_dict()) return final_statistics async def _run_crawler(self) -> None: event_manager = self._service_locator.get_event_manager() # Collect the context managers to be entered. Context managers that are already active are excluded, # as they were likely entered by the caller, who will also be responsible for exiting them. contexts_to_enter = [ cm for cm in ( event_manager, self._snapshotter, self._statistics, self._session_pool if self._use_session_pool else None, self._http_client, self._crawler_state_rec_task, *self._additional_context_managers, ) if cm and getattr(cm, 'active', False) is False ] async with AsyncExitStack() as exit_stack: for context in contexts_to_enter: await exit_stack.enter_async_context(context) # ty: ignore[invalid-argument-type] await self._autoscaled_pool.run() async def add_requests( self, requests: Sequence[str | Request], *, forefront: bool = False, batch_size: int = 1000, wait_time_between_batches: timedelta = timedelta(0), wait_for_all_requests_to_be_added: bool = False, wait_for_all_requests_to_be_added_timeout: timedelta | None = None, ) -> None: """Add requests to the underlying request manager in batches. Args: requests: A list of requests to add to the queue. forefront: If True, add requests to the forefront of the queue. batch_size: The number of requests to add in one batch. wait_time_between_batches: Time to wait between adding batches. wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning. wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added. 
""" allowed_requests = [] skipped = [] for request in requests: check_url = request.url if isinstance(request, Request) else request if await self._is_allowed_based_on_robots_txt_file(check_url): allowed_requests.append(request) else: skipped.append(request) if skipped: skipped_tasks = [ asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped ] await asyncio.gather(*skipped_tasks) self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file') request_manager = await self.get_request_manager() await request_manager.add_requests( requests=allowed_requests, forefront=forefront, batch_size=batch_size, wait_time_between_batches=wait_time_between_batches, wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added, wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout, ) async def use_state( self, default_value: dict[str, JsonSerializable] | None = None, ) -> dict[str, JsonSerializable]: kvs = await self.get_key_value_store() return await kvs.get_auto_saved_value(f'{self._CRAWLEE_STATE_KEY}_{self._id}', default_value) async def _save_crawler_state(self) -> None: store = await self.get_key_value_store() await store.persist_autosaved_values() async def get_data( self, dataset_id: str | None = None, dataset_name: str | None = None, dataset_alias: str | None = None, **kwargs: Unpack[GetDataKwargs], ) -> DatasetItemsListPage: """Retrieve data from a `Dataset`. This helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified one and then retrieves the data based on the provided parameters. Args: dataset_id: The ID of the `Dataset`. dataset_name: The name of the `Dataset` (global scope, named storage). dataset_alias: The alias of the `Dataset` (run scope, unnamed storage). kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method. Returns: The retrieved data. 
""" dataset = await Dataset.open( id=dataset_id, name=dataset_name, alias=dataset_alias, storage_client=self._service_locator.get_storage_client(), configuration=self._service_locator.get_configuration(), ) return await dataset.get_data(**kwargs) async def export_data( self, path: str | Path, dataset_id: str | None = None, dataset_name: str | None = None, dataset_alias: str | None = None, **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs], ) -> None: """Export all items from a Dataset to a JSON or CSV file. This method simplifies the process of exporting data collected during crawling. It automatically determines the export format based on the file extension (`.json` or `.csv`) and handles the conversion of `Dataset` items to the appropriate format. Args: path: The destination file path. Must end with '.json' or '.csv'. dataset_id: The ID of the Dataset to export from. dataset_name: The name of the Dataset to export from (global scope, named storage). dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage). additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format. 
""" dataset = await Dataset.open( id=dataset_id, name=dataset_name, alias=dataset_alias, storage_client=self._service_locator.get_storage_client(), configuration=self._service_locator.get_configuration(), ) path = Path(path) if path.suffix == '.csv': dst = StringIO() csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs) await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs) await atomic_write(path, dst.getvalue()) elif path.suffix == '.json': dst = StringIO() json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs) await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs) await atomic_write(path, dst.getvalue()) else: raise ValueError(f'Unsupported file extension: {path.suffix}') async def _push_data( self, data: list[dict[str, Any]] | dict[str, Any], dataset_id: str | None = None, dataset_name: str | None = None, dataset_alias: str | None = None, **kwargs: Unpack[PushDataKwargs], ) -> None: """Push data to a `Dataset`. This helper method simplifies the process of pushing data to a `Dataset`. It opens the specified one and then pushes the provided data to it. Args: data: The data to push to the `Dataset`. dataset_id: The ID of the `Dataset`. dataset_name: The name of the `Dataset` (global scope, named storage). dataset_alias: The alias of the `Dataset` (run scope, unnamed storage). kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method. """ dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias) await dataset.push_data(data, **kwargs) def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool: if context.request.no_retry: return False # Do not retry on client errors. 
if isinstance(error, HttpClientStatusCodeError): return False if isinstance(error, SessionError): return ((context.request.session_rotation_count or 0) + 1) < self._max_session_rotations max_request_retries = context.request.max_retries if max_request_retries is None: max_request_retries = self._max_request_retries return context.request.retry_count < max_request_retries async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGenerator[TCrawlingContext, None]: """Ensure that the `loaded_url` still matches the enqueue strategy after redirects. Filter out links that redirect outside of the crawled domain. """ if context.request.loaded_url is not None and not self._check_enqueue_strategy( context.request.enqueue_strategy, origin_url=urlparse(context.request.url), target_url=urlparse(context.request.loaded_url), ): raise ContextPipelineInterruptedError( f'Skipping URL {context.request.loaded_url} (redirected from {context.request.url})' ) yield context def _create_enqueue_links_function( self, context: BasicCrawlingContext, extract_links: ExtractLinksFunction ) -> EnqueueLinksFunction: """Create a callback function for extracting links from parsed content and enqueuing them to the crawl. Args: context: The current crawling context. extract_links: Function used to extract links from the page. Returns: Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl. 
""" async def enqueue_links( *, selector: str | None = None, attribute: str | None = None, label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, requests: Sequence[str | Request] | None = None, rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> None: kwargs.setdefault('strategy', 'same-hostname') if requests: if any((selector, attribute, label, user_data, transform_request_function)): raise ValueError( 'You cannot provide `selector`, `attribute`, `label`, `user_data` or ' '`transform_request_function` arguments when `requests` is provided.' ) # Add directly passed requests. await context.add_requests( requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs ) else: # Add requests from extracted links. await context.add_requests( await extract_links( selector=selector or 'a', attribute=attribute or 'href', label=label, user_data=user_data, transform_request_function=transform_request_function, **kwargs, ), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs, ) return enqueue_links def _enqueue_links_filter_iterator( self, request_iterator: Iterator[TRequestIterator], origin_url: str, **kwargs: Unpack[EnqueueLinksKwargs] ) -> Iterator[TRequestIterator]: """Filter requests based on the enqueue strategy and URL patterns.""" limit = kwargs.get('limit') parsed_origin_url = urlparse(origin_url) strategy = kwargs.get('strategy', 'all') if strategy == 'all' and not parsed_origin_url.hostname: self.log.warning(f'Skipping enqueue: Missing hostname in origin_url = {origin_url}.') return # Emit a `warning` message to the log, only once per call warning_flag = True for request in request_iterator: if isinstance(request, Request): if request.enqueue_strategy != strategy: request.enqueue_strategy = strategy target_url = request.url else: 
target_url = request parsed_target_url = urlparse(target_url) if warning_flag and strategy != 'all' and not parsed_target_url.hostname: self.log.warning(f'Skipping enqueue url: Missing hostname in target_url = {target_url}.') warning_flag = False if self._check_enqueue_strategy( strategy, target_url=parsed_target_url, origin_url=parsed_origin_url ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')): yield request if limit is not None: limit -= 1 if limit <= 0: break def _check_enqueue_strategy( self, strategy: EnqueueStrategy, *, target_url: ParseResult, origin_url: ParseResult, ) -> bool: """Check if a URL matches the enqueue_strategy.""" if strategy == 'all': return True if origin_url.hostname is None or target_url.hostname is None: self.log.debug( f'Skipping enqueue: Missing hostname in origin_url = {origin_url.geturl()} or ' f'target_url = {target_url.geturl()}' ) return False if strategy == 'same-hostname': return target_url.hostname == origin_url.hostname if strategy == 'same-domain': origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix return origin_domain == target_domain if strategy == 'same-origin': return ( target_url.hostname == origin_url.hostname and target_url.scheme == origin_url.scheme and target_url.port == origin_url.port ) assert_never(strategy) def _check_url_patterns( self, target_url: str, include: Sequence[re.Pattern[Any] | Glob] | None, exclude: Sequence[re.Pattern[Any] | Glob] | None, ) -> bool: """Check if a URL matches configured include/exclude patterns.""" # If the URL matches any `exclude` pattern, reject it for pattern in exclude or (): if isinstance(pattern, Glob): pattern = pattern.regexp # noqa: PLW2901 if pattern.match(target_url) is not None: return False # If there are no `include` patterns and the URL passed all `exclude` patterns, accept the URL 
if include is None: return True # If the URL matches any `include` pattern, accept it for pattern in include: if isinstance(pattern, Glob): pattern = pattern.regexp # noqa: PLW2901 if pattern.match(target_url) is not None: return True # The URL does not match any `include` pattern - reject it return False async def _handle_request_retries( self, context: TCrawlingContext | BasicCrawlingContext, error: Exception, ) -> None: request_manager = await self.get_request_manager() request = context.request if self._abort_on_error: self._logger.exception('Aborting crawler run due to error (abort_on_error=True)', exc_info=error) self._failed = True if self._should_retry_request(context, error): request.retry_count += 1 reduced_error = str(error).split('\n')[0] self.log.warning( f'Retrying request to {context.request.url} due to: {reduced_error}. ' f'{get_one_line_error_summary_if_possible(error)}' ) await self._statistics.error_tracker.add(error=error, context=context) if self._error_handler: try: new_request = await self._error_handler(context, error) except Exception as e: raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e else: if new_request is not None and new_request != request: await request_manager.add_request(new_request) await self._mark_request_as_handled(request) return await request_manager.reclaim_request(request) else: request.state = RequestState.ERROR await self._mark_request_as_handled(request) await self._handle_failed_request(context, error) self._statistics.record_request_processing_failure(request.unique_key) async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None: try: context.request.state = RequestState.ERROR_HANDLER await wait_for( partial(self._handle_request_retries, context, error), timeout=self._internal_timeout, timeout_message='Handling request failure timed out after ' f'{self._internal_timeout.total_seconds()} seconds', 
logger=self._logger, ) except UserDefinedErrorHandlerError: context.request.state = RequestState.ERROR raise except Exception as secondary_error: self._logger.exception( 'An exception occurred during handling of failed request. This places the crawler ' 'and its underlying storages into an unknown state and crawling will be terminated.', exc_info=secondary_error, ) context.request.state = RequestState.ERROR raise if context.session: context.session.mark_bad() async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None: self._logger.error( f'Request to {context.request.url} failed and reached maximum retries\n ' f'{self._get_message_from_error(error)}' ) await self._statistics.error_tracker.add(error=error, context=context) if self._failed_request_handler: try: await self._failed_request_handler(context, error) except Exception as e: raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e async def _handle_skipped_request( self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False ) -> None: if need_mark and isinstance(request, Request): request.state = RequestState.SKIPPED await self._mark_request_as_handled(request) url = request.url if isinstance(request, Request) else request if self._on_skipped_request: try: await self._on_skipped_request(url, reason) except Exception as e: raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e def _get_message_from_error(self, error: Exception) -> str: """Get error message summary from exception. Custom processing to reduce the irrelevant traceback clutter in some cases. 
""" traceback_parts = traceback.format_exception(type(error), value=error, tb=error.__traceback__, chain=True) used_traceback_parts = traceback_parts if ( isinstance(error, asyncio.exceptions.TimeoutError) and traceback_parts and self._request_handler_timeout_text in traceback_parts[-1] ) or isinstance(error, UserHandlerTimeoutError): used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error) used_traceback_parts.extend(traceback_parts[-1:]) return ''.join(used_traceback_parts).strip('\n') def _get_only_inner_most_exception(self, error: BaseException) -> BaseException: """Get innermost exception by following __cause__ and __context__ attributes of exception.""" if error.__cause__: return self._get_only_inner_most_exception(error.__cause__) if error.__context__: return self._get_only_inner_most_exception(error.__context__) # No __cause__ and no __context__, this is as deep as it can get. return error def _prepare_send_request_function( self, session: Session | None, proxy_info: ProxyInfo | None, ) -> SendRequestFunction: async def send_request( url: str, *, method: HttpMethod = 'GET', payload: HttpPayload | None = None, headers: HttpHeaders | dict[str, str] | None = None, ) -> HttpResponse: return await self._http_client.send_request( url=url, method=method, payload=payload, headers=headers, session=session, proxy_info=proxy_info, ) return send_request def _convert_url_to_request_iterator(self, urls: Sequence[str | Request], base_url: str) -> Iterator[Request]: """Convert a sequence of URLs or Request objects to an iterator of Request objects.""" for url in urls: # If the request is a Request object, keep it as it is if isinstance(url, Request): yield url # If the request is a string, convert it to Request object with absolute_url. 
elif isinstance(url, str) and not is_url_absolute(url): absolute_url = convert_to_absolute_url(base_url, url) yield Request.from_url(absolute_url) else: yield Request.from_url(url) async def _add_requests( self, context: BasicCrawlingContext, requests: Sequence[str | Request], rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> None: """Add requests method aware of the crawling context.""" if rq_id or rq_name or rq_alias: request_manager: RequestManager = await RequestQueue.open( id=rq_id, name=rq_name, alias=rq_alias, storage_client=self._service_locator.get_storage_client(), configuration=self._service_locator.get_configuration(), ) else: request_manager = await self.get_request_manager() context_aware_requests = list[Request]() base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url requests_iterator = self._convert_url_to_request_iterator(requests, base_url) filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs) for dst_request in filter_requests_iterator: # Update the crawl depth of the request. dst_request.crawl_depth = context.request.crawl_depth + 1 if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth: context_aware_requests.append(dst_request) return await request_manager.add_requests(context_aware_requests) async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None: """Commit request handler result for the input `context`. 
Result is taken from `_context_result_map`.""" result = self._context_result_map[context] for add_requests_call in result.add_requests_calls: await self._add_requests(context, **add_requests_call) for push_data_call in result.push_data_calls: await self._push_data(**push_data_call) await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store) result.apply_request_changes(target=context.request) @staticmethod async def _commit_key_value_store_changes( result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction ) -> None: """Store key value store changes recorded in result.""" for (id, name, alias), changes in result.key_value_store_changes.items(): store = await get_kvs(id=id, name=name, alias=alias) for key, value in changes.updates.items(): await store.set_value(key, value.content, value.content_type) async def __is_finished_function(self) -> bool: self._stop_if_max_requests_count_exceeded() if self._unexpected_stop: self._logger.info('The crawler will finish any remaining ongoing requests and shut down.') return True if self._abort_on_error and self._failed: self._failed = False return True if self._keep_alive: return False request_manager = await self.get_request_manager() return await request_manager.is_finished() async def __is_task_ready_function(self) -> bool: self._stop_if_max_requests_count_exceeded() if self._unexpected_stop: self._logger.info( 'No new requests are allowed because crawler `stop` method was called. ' 'Ongoing requests will be allowed to complete.' 
) return False request_manager = await self.get_request_manager() return not await request_manager.is_empty() async def __run_task_function(self) -> None: request_manager = await self.get_request_manager() request = await wait_for( request_manager.fetch_next_request, timeout=self._internal_timeout, timeout_message=f'Fetching next request failed after {self._internal_timeout.total_seconds()} seconds', logger=self._logger, max_retries=3, ) if request is None: return if not (await self._is_allowed_based_on_robots_txt_file(request.url)): self._logger.warning( f'Skipping request {request.url} ({request.unique_key}) because it is disallowed based on robots.txt' ) await self._handle_skipped_request(request, 'robots_txt', need_mark=True) return if request.session_id: session = await self._get_session_by_id(request.session_id) else: session = await self._get_session() proxy_info = await self._get_proxy_info(request, session) result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request) context = BasicCrawlingContext( request=result.request, session=session, proxy_info=proxy_info, send_request=self._prepare_send_request_function(session, proxy_info), add_requests=result.add_requests, push_data=result.push_data, get_key_value_store=result.get_key_value_store, use_state=self.use_state, log=self._logger, ) self._context_result_map[context] = result self._statistics.record_request_processing_start(request.unique_key) try: request.state = RequestState.REQUEST_HANDLER try: with swapped_context(context, request): self._check_request_collision(request, session) await self._run_request_handler(context=context) except asyncio.TimeoutError as e: raise RequestHandlerError(e, context) from e await self._commit_request_handler_result(context) request.state = RequestState.DONE await self._mark_request_as_handled(request) if session and session.is_usable: session.mark_good() self._statistics.record_request_processing_finish(request.unique_key) except 
RequestCollisionError as request_error: request.no_retry = True await self._handle_request_error(context, request_error) except RequestHandlerError as primary_error: primary_error = cast( 'RequestHandlerError[TCrawlingContext]', primary_error ) # valid thanks to ContextPipeline self._logger.debug( 'An exception occurred in the user-defined request handler', exc_info=primary_error.wrapped_exception, ) await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception) except SessionError as session_error: if not session: raise RuntimeError('SessionError raised in a crawling context without a session') from session_error if self._error_handler: await self._error_handler(context, session_error) if self._should_retry_request(context, session_error): exc_only = ''.join(traceback.format_exception_only(session_error)).strip() self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only) if session: session.retire() # Increment session rotation count. request.session_rotation_count = (request.session_rotation_count or 0) + 1 await request_manager.reclaim_request(request) await self._statistics.error_tracker_retry.add(error=session_error, context=context) else: await self._mark_request_as_handled(request) await self._handle_failed_request(context, session_error) self._statistics.record_request_processing_failure(request.unique_key) except ContextPipelineInterruptedError as interrupted_error: self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error) await self._mark_request_as_handled(request) except ContextPipelineInitializationError as initialization_error: self._logger.debug( 'An exception occurred during the initialization of crawling context', exc_info=initialization_error, ) await self._handle_request_error(context, initialization_error.wrapped_exception) except Exception as internal_error: self._logger.exception( 'An exception occurred during handling of a request. 
This places the crawler ' 'and its underlying storages into an unknown state and crawling will be terminated.', exc_info=internal_error, ) raise async def _run_request_handler(self, context: BasicCrawlingContext) -> None: context.request.state = RequestState.BEFORE_NAV await self._context_pipeline( context, lambda final_context: wait_for( lambda: self.router(final_context), timeout=self._request_handler_timeout, timeout_message=f'{self._request_handler_timeout_text}' f' {self._request_handler_timeout.total_seconds()} seconds', logger=self._logger, ), ) def _raise_for_error_status_code(self, status_code: int) -> None: """Raise an exception if the given status code is considered an error. Args: status_code: The HTTP status code to check. Raises: HttpStatusCodeError: If the status code represents a server error or is explicitly configured as an error. HttpClientStatusCodeError: If the status code represents a client error. """ is_ignored_status = status_code in self._ignore_http_error_status_codes is_explicit_error = status_code in self._additional_http_error_status_codes if is_explicit_error: raise HttpStatusCodeError('Error status code (user-configured) returned.', status_code) if is_status_code_client_error(status_code) and not is_ignored_status: raise HttpClientStatusCodeError('Client error status code returned', status_code) if is_status_code_server_error(status_code) and not is_ignored_status: raise HttpStatusCodeError('Error status code returned', status_code) def _raise_for_session_blocked_status_code(self, session: Session | None, status_code: int) -> None: """Raise an exception if the given status code indicates the session is blocked. Args: session: The session used for the request. If None, no check is performed. status_code: The HTTP status code to check. Raises: SessionError: If the status code indicates the session is blocked. 
""" if session is not None and session.is_blocked_status_code( status_code=status_code, ignore_http_error_status_codes=self._ignore_http_error_status_codes, ): raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') def _check_request_collision(self, request: Request, session: Session | None) -> None: """Raise an exception if a request cannot access required resources. Args: request: The `Request` that might require specific resources (like a session). session: The `Session` that was retrieved for the request, or `None` if not available. Raises: RequestCollisionError: If the `Session` referenced by the `Request` is not available. """ if self._use_session_pool and request.session_id and not session: raise RequestCollisionError( f'The Session (id: {request.session_id}) bound to the Request is no longer available in SessionPool' ) async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool: """Check if the URL is allowed based on the robots.txt file. Args: url: The URL to verify against robots.txt rules. Returns True if crawling this URL is permitted. """ if not self._respect_robots_txt_file: return True robots_txt_file = await self._get_robots_txt_file_for_url(url) return not robots_txt_file or robots_txt_file.is_allowed(url) async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: """Get the RobotsTxtFile for a given URL. Args: url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file. 
""" if not self._respect_robots_txt_file: return None origin_url = str(URL(url).origin()) robots_txt_file = self._robots_txt_file_cache.get(origin_url) if robots_txt_file: return robots_txt_file async with self._robots_txt_lock: # Check again if the robots.txt file is already cached after acquiring the lock robots_txt_file = self._robots_txt_file_cache.get(origin_url) if robots_txt_file: return robots_txt_file # If not cached, fetch the robots.txt file robots_txt_file = await self._find_txt_file_for_url(url) self._robots_txt_file_cache[origin_url] = robots_txt_file return robots_txt_file async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: """Find the robots.txt file for a given URL. Args: url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file. """ return await RobotsTxtFile.find(url, self._http_client) def _log_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None: """Log a status message for the crawler. Args: message: The status message to log. level: The logging level for the message. """ log_level = string_to_log_level(level) self.log.log(log_level, message) async def _crawler_state_task(self) -> None: """Emit a persist state event with the given migration status.""" event_manager = self._service_locator.get_event_manager() current_state = self.statistics.state if ( failed_requests := ( current_state.requests_failed - (self._previous_crawler_state or current_state).requests_failed ) > 0 ): message = f'Experiencing problems, {failed_requests} failed requests since last status update.' 
else: request_manager = await self.get_request_manager() total_count = await request_manager.get_total_count() if total_count is not None and total_count > 0: pages_info = f'{self._statistics.state.requests_finished}/{total_count}' else: pages_info = str(self._statistics.state.requests_finished) message = ( f'Crawled {pages_info} pages, {self._statistics.state.requests_failed} failed requests, ' f'desired concurrency {self._autoscaled_pool.desired_concurrency}.' ) if self._status_message_callback: new_message = await self._status_message_callback(current_state, self._previous_crawler_state, message) if new_message: message = new_message self._log_status_message(message, level='INFO') else: self._log_status_message(message, level='INFO') event_manager.emit( event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message, crawler_id=id(self)) ) self._previous_crawler_state = current_state async def _mark_request_as_handled(self, request: Request) -> None: request_manager = await self.get_request_manager() await wait_for( lambda: request_manager.mark_request_as_handled(request), timeout=self._internal_timeout, timeout_message='Marking request as handled timed out after ' f'{self._internal_timeout.total_seconds()} seconds', logger=self._logger, max_retries=3, ) ================================================ FILE: src/crawlee/crawlers/_basic/_basic_crawling_context.py ================================================ from __future__ import annotations # Do just the re-export because of the circular imports. 
from crawlee._types import BasicCrawlingContext # noqa: F401 ================================================ FILE: src/crawlee/crawlers/_basic/_context_pipeline.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Any, Generic, cast from typing_extensions import TypeVar from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group from crawlee.errors import ( ContextPipelineFinalizationError, ContextPipelineInitializationError, ContextPipelineInterruptedError, RequestHandlerError, SessionError, ) if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Callable, Generator TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext) TMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext) class _Middleware(Generic[TMiddlewareCrawlingContext, TCrawlingContext]): """Helper wrapper class to make the middleware easily observable by open telemetry instrumentation.""" def __init__( self, middleware: Callable[ [TCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None], ], input_context: TCrawlingContext, ) -> None: self.generator = middleware(input_context) self.input_context = input_context self.output_context: TMiddlewareCrawlingContext | None = None async def action(self) -> TMiddlewareCrawlingContext: self.output_context = await self.generator.__anext__() return self.output_context async def cleanup(self, final_consumer_exception: Exception | None) -> None: try: await self.generator.asend(final_consumer_exception) except StopAsyncIteration: pass except ContextPipelineInterruptedError as e: raise RuntimeError('Invalid state - pipeline interrupted in the finalization step') from e except Exception as e: raise ContextPipelineFinalizationError(e, self.output_context or self.input_context) from e else: raise RuntimeError('The middleware yielded more than 
once') @docs_group('Other') class ContextPipeline(Generic[TCrawlingContext]): """Encapsulates the logic of gradually enhancing the crawling context with additional information and utilities. The enhancement is done by a chain of middlewares that are added to the pipeline after it's creation. """ def __init__( self, *, _middleware: Callable[ [TCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None], ] | None = None, _parent: ContextPipeline[BasicCrawlingContext] | None = None, ) -> None: self._middleware = _middleware self._parent = _parent def _middleware_chain(self) -> Generator[ContextPipeline[Any], None, None]: yield self if self._parent is not None: yield from self._parent._middleware_chain() # noqa: SLF001 async def __call__( self, crawling_context: BasicCrawlingContext, final_context_consumer: Callable[[TCrawlingContext], Awaitable[None]], ) -> None: """Run a crawling context through the middleware chain and pipe it into a consumer function. Exceptions from the consumer function are wrapped together with the final crawling context. 
""" chain = list(self._middleware_chain()) cleanup_stack: list[_Middleware[Any]] = [] final_consumer_exception: Exception | None = None try: for member in reversed(chain): if member._middleware: # noqa: SLF001 middleware_instance = _Middleware(middleware=member._middleware, input_context=crawling_context) # noqa: SLF001 try: result = await middleware_instance.action() except SessionError: # Session errors get special treatment raise except StopAsyncIteration as e: raise RuntimeError('The middleware did not yield') from e except ContextPipelineInterruptedError: raise except Exception as e: raise ContextPipelineInitializationError(e, crawling_context) from e crawling_context = result cleanup_stack.append(middleware_instance) try: await final_context_consumer(cast('TCrawlingContext', crawling_context)) except SessionError as e: # Session errors get special treatment final_consumer_exception = e raise except Exception as e: final_consumer_exception = e raise RequestHandlerError(e, crawling_context) from e finally: for middleware_instance in reversed(cleanup_stack): await middleware_instance.cleanup(final_consumer_exception) def compose( self, middleware: Callable[ [TCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, None], ], ) -> ContextPipeline[TMiddlewareCrawlingContext]: """Add a middleware to the pipeline. The middleware should yield exactly once, and it should yield an (optionally) extended crawling context object. The part before the yield can be used for initialization and the part after it for cleanup. 
Returns: The extended pipeline instance, providing a fluent interface """ return ContextPipeline[TMiddlewareCrawlingContext]( _middleware=cast( 'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None]]', middleware, ), _parent=cast('ContextPipeline[BasicCrawlingContext]', self), ) ================================================ FILE: src/crawlee/crawlers/_basic/_context_utils.py ================================================ from __future__ import annotations from contextlib import contextmanager from typing import TYPE_CHECKING if TYPE_CHECKING: from collections.abc import Iterator from crawlee._request import Request from ._basic_crawling_context import BasicCrawlingContext @contextmanager def swapped_context( context: BasicCrawlingContext, request: Request, ) -> Iterator[None]: """Replace context's isolated copies with originals after handler execution.""" try: yield finally: # Restore original context state to avoid side effects between different handlers. object.__setattr__(context, 'request', request) ================================================ FILE: src/crawlee/crawlers/_basic/_logging_utils.py ================================================ import asyncio import re import traceback import crawlee.errors def _get_only_innermost_exception(error: BaseException) -> BaseException: """Get innermost exception by following __cause__ and __context__ attributes of exception. If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible. """ if type(error) is crawlee.errors.UserHandlerTimeoutError: if error.__cause__: return error.__cause__ if error.__context__: return error.__context__ return error if error.__cause__: return _get_only_innermost_exception(error.__cause__) if error.__context__: return _get_only_innermost_exception(error.__context__) # No __cause__ and no __context__, this is as deep as it can get. 
return error def _get_filtered_traceback_parts_for_asyncio_timeout_error(traceback_parts: list[str]) -> list[str]: """Extract only the most relevant traceback parts from stack trace.""" ignore_pattern = ( r'([\\/]{1}asyncio[\\/]{1})|' # internal asyncio parts r'(Traceback \(most recent call last\))|' # common part of the stack trace formatting r'(asyncio\.exceptions\.CancelledError)' # internal asyncio exception ) return [ _strip_pep657_highlighting(traceback_part) for traceback_part in traceback_parts if not re.findall(ignore_pattern, traceback_part) ] def _strip_pep657_highlighting(traceback_part: str) -> str: """Remove PEP 657 highlighting from the traceback.""" highlight_pattern = r'(\n\s*~*\^+~*\n)$' return re.sub(highlight_pattern, '\n', traceback_part) def reduce_asyncio_timeout_error_to_relevant_traceback_parts( timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError, ) -> list[str]: innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error) return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts) def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]: innermost_error = _get_only_innermost_exception(error) return traceback.format_exception( type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False ) def get_one_line_error_summary_if_possible(error: Exception) -> str: if isinstance(error, asyncio.exceptions.TimeoutError): relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error) most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else '' elif isinstance(error, crawlee.errors.UserHandlerTimeoutError): # Error is user defined handler. 
First two lines should be location of the `UserHandlerTimeoutError` in crawlee # code and third line the topmost user error traceback_parts = _get_traceback_parts_for_innermost_exception(error) relevant_index_from_start = 3 most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else '' elif 'playwright._impl._errors.Error' in str(error.__class__): # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway # point to deep internals. return '' else: traceback_parts = _get_traceback_parts_for_innermost_exception(error) # Commonly last traceback part is type of the error, and the second last part is the relevant file. # If there are not enough traceback parts, then we are not sure how to summarize the error. relevant_traceback_part_index_from_end = 2 most_relevant_part = _strip_pep657_highlighting( _get_traceback_parts_for_innermost_exception(error)[-relevant_traceback_part_index_from_end] if len(traceback_parts) >= relevant_traceback_part_index_from_end else '' ) return most_relevant_part.strip('\n ').replace('\n', ', ') ================================================ FILE: src/crawlee/crawlers/_basic/py.typed ================================================ ================================================ FILE: src/crawlee/crawlers/_beautifulsoup/__init__.py ================================================ from crawlee._utils.try_import import install_import_hook as _install_import_hook from crawlee._utils.try_import import try_import as _try_import _install_import_hook(__name__) # The following imports are wrapped in try_import to handle optional dependencies, # ensuring the module can still function even if these dependencies are missing. 
with _try_import(__name__, 'BeautifulSoupCrawler'): from ._beautifulsoup_crawler import BeautifulSoupCrawler with _try_import(__name__, 'BeautifulSoupCrawlingContext'): from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext with _try_import(__name__, 'BeautifulSoupParserType'): from ._beautifulsoup_parser import BeautifulSoupParserType __all__ = [ 'BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType', ] ================================================ FILE: src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING from bs4 import BeautifulSoup, Tag from crawlee._utils.docs import docs_group from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType if TYPE_CHECKING: from collections.abc import AsyncGenerator from typing_extensions import Unpack from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext @docs_group('Crawlers') class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup, Tag]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. It specifies its own parser `BeautifulSoupParser` which is used to parse `HttpResponse`. `BeautifulSoupParser` uses following library for parsing: https://pypi.org/project/beautifulsoup4/ The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However, if you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`. 
### Usage ```python from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext crawler = BeautifulSoupCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, } # Push the extracted data to the default dataset. await context.push_data(data) await crawler.run(['https://crawlee.dev/']) ``` """ def __init__( self, *, parser: BeautifulSoupParserType = 'lxml', **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]], ) -> None: """Initialize a new instance. Args: parser: The type of parser that should be used by `BeautifulSoup`. kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`. """ async def final_step( context: ParsedHttpCrawlingContext[BeautifulSoup], ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]: """Enhance `ParsedHttpCrawlingContext[BeautifulSoup]` with `soup` property.""" yield BeautifulSoupCrawlingContext.from_parsed_http_crawling_context(context) kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step) super().__init__( parser=BeautifulSoupParser(parser=parser), **kwargs, ) ================================================ FILE: src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py ================================================ from dataclasses import dataclass, fields from bs4 import BeautifulSoup from typing_extensions import Self from crawlee._utils.docs import docs_group from crawlee.crawlers import ParsedHttpCrawlingContext from ._utils import html_to_text @dataclass(frozen=True) @docs_group('Crawling contexts') class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup]): """The 
crawling context used by the `BeautifulSoupCrawler`. It provides access to key objects as well as utility functions for handling crawling tasks. """ @property def soup(self) -> BeautifulSoup: """Convenience alias.""" return self.parsed_content @classmethod def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self: """Initialize a new instance from an existing `ParsedHttpCrawlingContext`.""" return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) def html_to_text(self) -> str: """Convert the parsed HTML content to newline-separated plain text without tags.""" return html_to_text(self.parsed_content) ================================================ FILE: src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Literal from bs4 import BeautifulSoup, Tag from typing_extensions import override from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import AbstractHttpParser if TYPE_CHECKING: from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse @docs_group('HTTP parsers') class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup, Tag]): """Parser for parsing HTTP response using `BeautifulSoup`.""" def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None: self._parser = parser @override async def parse(self, response: HttpResponse) -> BeautifulSoup: return BeautifulSoup(await response.read(), features=self._parser) @override async def parse_text(self, text: str) -> BeautifulSoup: return BeautifulSoup(text, features=self._parser) @override def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool: return parsed_content.select_one(selector) is not None @override async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]: return tuple(match for match in 
parsed_content.select(selector)) @override def find_links(self, parsed_content: Tag, selector: str, attribute: str) -> Iterable[str]: link: Tag urls: list[str] = [] for link in parsed_content.select(selector): url = link.attrs.get(attribute) if url: urls.append(url.strip()) return urls BeautifulSoupParserType = Literal['html.parser', 'lxml', 'xml', 'html5lib'] ================================================ FILE: src/crawlee/crawlers/_beautifulsoup/_utils.py ================================================ from __future__ import annotations import re from typing import TYPE_CHECKING from bs4 import BeautifulSoup, NavigableString, PageElement, Tag from crawlee._utils.html_to_text import ( _ANY_CONSECUTIVE_WHITE_SPACES, _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, _EMPTY_OR_ENDS_WITH_NEW_LINE, BLOCK_TAGS, SKIP_TAGS, ) if TYPE_CHECKING: from collections.abc import Iterable def html_to_text(source: str | Tag) -> str: """Convert markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup. Args: source: Input markup string or `BeautifulSoup` object. Returns: Newline separated plain text without tags. """ if isinstance(source, str): soup = BeautifulSoup(source, features='lxml') elif isinstance(source, BeautifulSoup): soup = source else: raise TypeError('Source must be either a string or a `BeautifulSoup` object.') text = '' def _page_element_to_text(page_elements: Iterable[PageElement]) -> None: """Extract and process text content from a collection of HTML elements. Convert page elements into plain text while preserving structure. Handle whitespace compression, skip unwanted elements, and format block elements correctly. 
""" nonlocal text for page_element in page_elements: if isinstance(page_element, (Tag, NavigableString)): if isinstance(page_element, NavigableString): compr: str if isinstance(page_element.parent, Tag) and page_element.parent.name.lower() == 'pre': compr = page_element.get_text() else: # Compress white spaces outside of pre block compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', page_element.get_text()) # If text is empty or ends with a whitespace, don't add the leading whitespace or new line if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text): compr = compr[1:] text += compr elif page_element.name.lower() in SKIP_TAGS: # Skip comments and special elements pass elif page_element.name.lower() == 'br': text += '\n' elif page_element.name.lower() == 'td': _page_element_to_text(page_element.children) text += '\t' else: # Block elements must be surrounded by newlines(unless beginning of text) is_block_tag = page_element.name.lower() in BLOCK_TAGS if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text): text += '\n' _page_element_to_text(page_element.children) if is_block_tag and not text.endswith('\n'): text += '\n' _page_element_to_text(soup.children) return text.strip() ================================================ FILE: src/crawlee/crawlers/_beautifulsoup/py.typed ================================================ ================================================ FILE: src/crawlee/crawlers/_http/__init__.py ================================================ from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext from crawlee.http_clients import HttpCrawlingResult from ._http_crawler import HttpCrawler __all__ = [ 'HttpCrawler', 'HttpCrawlingContext', 'HttpCrawlingResult', ] ================================================ FILE: src/crawlee/crawlers/_http/_http_crawler.py ================================================ from __future__ import annotations from typing import 
TYPE_CHECKING from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import AbstractHttpCrawler, ParsedHttpCrawlingContext from ._http_parser import NoParser if TYPE_CHECKING: from typing_extensions import Unpack from crawlee.crawlers import BasicCrawlerOptions @docs_group('Crawlers') class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes, bytes]): """Specific version of generic `AbstractHttpCrawler`. It uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are doing. In most cases, using an HTML parser would be more beneficial. For such scenarios, consider using `BeautifulSoupCrawler`, `ParselCrawler`, or writing your own subclass of `AbstractHttpCrawler`. ### Usage ```python from crawlee.crawlers import HttpCrawler, HttpCrawlingContext crawler = HttpCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'response': (await context.http_response.read()).decode()[:100], } # Push the extracted data to the default dataset. await context.push_data(data) await crawler.run(['https://crawlee.dev/']) ``` """ def __init__( self, **kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[bytes]]], ) -> None: """Initialize a new instance. Args: kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`. 
""" kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline() super().__init__( parser=NoParser(), **kwargs, ) ================================================ FILE: src/crawlee/crawlers/_http/_http_parser.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING from typing_extensions import override from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import AbstractHttpParser from crawlee.crawlers._types import BlockedInfo if TYPE_CHECKING: from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse @docs_group('HTTP parsers') class NoParser(AbstractHttpParser[bytes, bytes]): """A no-op parser that returns raw response content without any processing. This is useful when you only need the raw response data and don't require HTML parsing, link extraction, or content selection functionality. """ @override async def parse(self, response: HttpResponse) -> bytes: return await response.read() @override async def parse_text(self, text: str) -> bytes: raise NotImplementedError @override async def select(self, parsed_content: bytes, selector: str) -> Sequence[bytes]: raise NotImplementedError @override def is_blocked(self, parsed_content: bytes) -> BlockedInfo: # Intentional unused argument. return BlockedInfo(reason='') @override def is_matching_selector(self, parsed_content: bytes, selector: str) -> bool: # Intentional unused argument. return False @override def find_links( self, parsed_content: bytes, selector: str, attribute: str ) -> Iterable[str]: # Intentional unused argument. 
return [] ================================================ FILE: src/crawlee/crawlers/_parsel/__init__.py ================================================ from crawlee._utils.try_import import install_import_hook as _install_import_hook from crawlee._utils.try_import import try_import as _try_import _install_import_hook(__name__) # The following imports are wrapped in try_import to handle optional dependencies, # ensuring the module can still function even if these dependencies are missing. with _try_import(__name__, 'ParselCrawler'): from ._parsel_crawler import ParselCrawler with _try_import(__name__, 'ParselCrawlingContext'): from ._parsel_crawling_context import ParselCrawlingContext __all__ = [ 'ParselCrawler', 'ParselCrawlingContext', ] ================================================ FILE: src/crawlee/crawlers/_parsel/_parsel_crawler.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING from parsel import Selector from crawlee._utils.docs import docs_group from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions from ._parsel_crawling_context import ParselCrawlingContext from ._parsel_parser import ParselParser if TYPE_CHECKING: from collections.abc import AsyncGenerator from typing_extensions import Unpack from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext @docs_group('Crawlers') class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selector]): """A web crawler for performing HTTP requests and parsing HTML/XML content. The `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. It specifies its own parser `ParselParser` which is used to parse `HttpResponse`. `ParselParser` uses following library for parsing: https://pypi.org/project/parsel/ The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However, if you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`. ### Usage ```python from crawlee.crawlers import ParselCrawler, ParselCrawlingContext crawler = ParselCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page. data = { 'url': context.request.url, 'title': context.selector.css('title').get(), } # Push the extracted data to the default dataset. await context.push_data(data) await crawler.run(['https://crawlee.dev/']) ``` """ def __init__( self, **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]], ) -> None: """Initialize a new instance. Args: kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`. """ async def final_step( context: ParsedHttpCrawlingContext[Selector], ) -> AsyncGenerator[ParselCrawlingContext, None]: """Enhance `ParsedHttpCrawlingContext[Selector]` with a `selector` property.""" yield ParselCrawlingContext.from_parsed_http_crawling_context(context) kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step) super().__init__( parser=ParselParser(), **kwargs, ) ================================================ FILE: src/crawlee/crawlers/_parsel/_parsel_crawling_context.py ================================================ from dataclasses import dataclass, fields from parsel import Selector from typing_extensions import Self from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext from ._utils import html_to_text @dataclass(frozen=True) @docs_group('Crawling contexts') class ParselCrawlingContext(ParsedHttpCrawlingContext[Selector]): """The crawling context used by the `ParselCrawler`. 
It provides access to key objects as well as utility functions for handling crawling tasks. """ @property def selector(self) -> Selector: """Convenience alias.""" return self.parsed_content @classmethod def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self: """Create a new context from an existing `ParsedHttpCrawlingContext[Selector]`.""" return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) def html_to_text(self) -> str: """Convert the parsed HTML content to newline-separated plain text without tags.""" return html_to_text(self.parsed_content) ================================================ FILE: src/crawlee/crawlers/_parsel/_parsel_parser.py ================================================ from __future__ import annotations import asyncio from typing import TYPE_CHECKING from parsel import Selector from typing_extensions import override from crawlee._utils.docs import docs_group from crawlee.crawlers._abstract_http import AbstractHttpParser if TYPE_CHECKING: from collections.abc import Iterable, Sequence from crawlee.http_clients import HttpResponse @docs_group('HTTP parsers') class ParselParser(AbstractHttpParser[Selector, Selector]): """Parser for parsing HTTP response using Parsel.""" @override async def parse(self, response: HttpResponse) -> Selector: response_body = await response.read() return await asyncio.to_thread(Selector, body=response_body) @override async def parse_text(self, text: str) -> Selector: return Selector(text=text) @override async def select(self, parsed_content: Selector, selector: str) -> Sequence[Selector]: return tuple(match for match in parsed_content.css(selector)) @override def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None @override def find_links(self, parsed_content: Selector, selector: str, attribute: str) -> Iterable[str]: link: 
Selector urls: list[str] = [] for link in parsed_content.css(selector): url = link.xpath(f'@{attribute}').get() if url: urls.append(url.strip()) return urls ================================================ FILE: src/crawlee/crawlers/_parsel/_utils.py ================================================ from __future__ import annotations import re from parsel import Selector from crawlee._utils.html_to_text import ( _ANY_CONSECUTIVE_WHITE_SPACES, _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, _EMPTY_OR_ENDS_WITH_NEW_LINE, BLOCK_TAGS, SKIP_TAGS, ) def html_to_text(source: str | Selector) -> str: """Convert markup string or `Selector` to newline-separated plain text without tags using Parsel. Args: source: Input markup string or `Selector` object. Returns: Newline separated plain text without tags. """ if isinstance(source, str): selector = Selector(text=source) elif isinstance(source, Selector): selector = source else: raise TypeError('Source must be either a string or a `Selector` object.') text = '' def _extract_text(elements: list[Selector], *, compress: bool = True) -> None: """Extract text content from HTML elements while preserving formatting. Perform custom HTML parsing to match the behavior of the JavaScript version of Crawlee. Handles whitespace compression and block-level tag formatting. Args: elements: A list of selectors representing the HTML elements. compress: Whether to compress consecutive whitespace outside of `<pre>` blocks. 
""" nonlocal text for element in elements: tag = element.root.tag if hasattr(element.root, 'tag') else None if tag is None: # Compress white spaces outside of pre block compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', element.root) if compress else element.root # If text is empty or ends with a whitespace, don't add the leading whitespace or new line if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text): compr = compr[1:] text += compr if tag in SKIP_TAGS or not isinstance(tag, str): continue if tag == 'br': text += '\n' elif tag == 'td': _extract_text(element.xpath('./node()')) text += '\t' else: is_block_tag = tag in BLOCK_TAGS if tag else False if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text): text += '\n' _extract_text(element.xpath('./node()'), compress=tag != 'pre') if is_block_tag and not text.endswith('\n'): text += '\n' # Start processing the root elements _extract_text(selector.xpath('/*')) return text.strip() ================================================ FILE: src/crawlee/crawlers/_playwright/__init__.py ================================================ from crawlee._utils.try_import import install_import_hook as _install_import_hook from crawlee._utils.try_import import try_import as _try_import _install_import_hook(__name__) # The following imports are wrapped in try_import to handle optional dependencies, # ensuring the module can still function even if these dependencies are missing. 
with _try_import(__name__, 'PlaywrightCrawler'): from ._playwright_crawler import PlaywrightCrawler with _try_import(__name__, 'PlaywrightCrawlingContext'): from ._playwright_crawling_context import PlaywrightCrawlingContext with _try_import(__name__, 'PlaywrightPreNavCrawlingContext'): from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext with _try_import(__name__, 'PlaywrightPostNavCrawlingContext'): from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext __all__ = [ 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPostNavCrawlingContext', 'PlaywrightPreNavCrawlingContext', ] ================================================ FILE: src/crawlee/crawlers/_playwright/_playwright_crawler.py ================================================ from __future__ import annotations import asyncio import logging import warnings from datetime import timedelta from functools import partial from typing import TYPE_CHECKING, Any, Generic, Literal import playwright.async_api from more_itertools import partition from pydantic import ValidationError from typing_extensions import NotRequired, TypedDict, TypeVar from crawlee._request import Request, RequestOptions, RequestState from crawlee._types import BasicCrawlingContext, ConcurrencySettings from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee._utils.docs import docs_group from crawlee._utils.robots import RobotsTxtFile from crawlee._utils.time import SharedTimeout from crawlee._utils.urls import to_absolute_url_iterator from crawlee.browsers import BrowserPool from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline from crawlee.errors import SessionError from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type from crawlee.http_clients import ImpitHttpClient 
from crawlee.sessions._cookies import PlaywrightCookieParam from crawlee.statistics import StatisticsState from ._playwright_crawling_context import PlaywrightCrawlingContext from ._playwright_http_client import PlaywrightHttpClient, browser_page_context from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext from ._types import GotoOptions from ._utils import block_requests, infinite_scroll TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext) TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator, Mapping from pathlib import Path from playwright.async_api import Page, Route from playwright.async_api import Request as PlaywrightRequest from typing_extensions import Unpack from crawlee import RequestTransformAction from crawlee._types import ( EnqueueLinksKwargs, ExtractLinksFunction, HttpHeaders, HttpMethod, HttpPayload, ) from crawlee.browsers._types import BrowserType @docs_group('Crawlers') class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]): """A web crawler that leverages the `Playwright` browser automation library. The `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. On top of that it provides a high level web crawling interface on top of the `Playwright` library. To be more specific, it uses the Crawlee's `BrowserPool` to manage the Playwright's browser instances and the pages they open. You can create your own `BrowserPool` instance and pass it to the `PlaywrightCrawler` constructor, or let the crawler create a new instance with the default settings. This crawler is ideal for crawling websites that require JavaScript execution, as it uses real browsers to download web pages and extract data. 
def __init__(
    self,
    *,
    browser_pool: BrowserPool | None = None,
    browser_type: BrowserType | None = None,
    user_data_dir: str | Path | None = None,
    browser_launch_options: Mapping[str, Any] | None = None,
    browser_new_context_options: Mapping[str, Any] | None = None,
    goto_options: GotoOptions | None = None,
    fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
    headless: bool | None = None,
    use_incognito_pages: bool | None = None,
    navigation_timeout: timedelta | None = None,
    **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
) -> None:
    """Initialize a new instance.

    Either hand over a ready-made `browser_pool`, or leave it unset and describe the browser with the other
    browser-related arguments. `goto_options` and `navigation_timeout` may be combined with either variant.

    Args:
        browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
        browser_type: The type of browser to launch:
            - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
            - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
                the system.
            This option should not be used if `browser_pool` is provided.
        user_data_dir: Path to a user data directory, which stores browser session data like cookies
            and local storage. This option should not be used if `browser_pool` is provided.
        browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
            directly to Playwright's `browser_type.launch` method. For more details, refer to the
            [Playwright documentation](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch).
            This option should not be used if `browser_pool` is provided.
        browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
            are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
            [Playwright documentation](https://playwright.dev/python/docs/api/class-browser#browser-new-context).
            This option should not be used if `browser_pool` is provided.
        goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
            not supported, use `navigation_timeout` instead.
        fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
            to generate browser fingerprints together with consistent headers. The value `'default'` makes the
            crawler build a `DefaultFingerprintGenerator` itself. This option should not be used if
            `browser_pool` is provided.
        headless: Whether to run the browser in headless mode.
            This option should not be used if `browser_pool` is provided.
        use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
            own context that is destroyed once the page is closed or crashes.
            This option should not be used if `browser_pool` is provided.
        navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
            the request handler). Defaults to one minute.
        kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.

    Raises:
        ValueError: If `browser_pool` is combined with any of the browser-describing arguments.
    """
    # Navigation time budgets of in-flight requests, keyed by `id()` of their pre-navigation context.
    self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

    if not browser_pool:
        # No pool was supplied, so assemble one from the browser-related arguments.
        if fingerprint_generator == 'default':
            # Restrict the generated headers to the launched browser when its type is known.
            generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = None
            if browser_type:
                generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]

            fingerprint_generator = DefaultFingerprintGenerator(
                header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
            )

        browser_pool = BrowserPool.with_default_plugin(
            headless=headless,
            browser_type=browser_type,
            user_data_dir=user_data_dir,
            browser_launch_options=browser_launch_options,
            browser_new_context_options=browser_new_context_options,
            use_incognito_pages=use_incognito_pages,
            fingerprint_generator=fingerprint_generator,
        )
    else:
        # A ready-made pool already fixes the browser setup; reject arguments that would be silently ignored.
        pool_owned_arguments = (
            user_data_dir,
            use_incognito_pages,
            headless,
            browser_type,
            browser_launch_options,
            browser_new_context_options,
            fingerprint_generator,
        )
        for argument in pool_owned_arguments:
            if argument not in [None, 'default']:
                raise ValueError(
                    'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                    '`fingerprint_generator` arguments when `browser_pool` is provided.'
                )

    self._browser_pool = browser_pool
    self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = []
    self._post_navigation_hooks: list[Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]] = []
    self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
    self._goto_options = goto_options or GotoOptions()

    # Compose the context pipeline with the Playwright-specific context enhancer.
    playwright_pipeline = (
        ContextPipeline()
        .compose(self._open_page)
        .compose(self._navigate)
        .compose(self._execute_post_navigation_hooks)
        .compose(self._handle_status_code_response)
        .compose(self._handle_blocked_request_by_content)
        .compose(self._create_crawling_context)
    )
    kwargs['_context_pipeline'] = playwright_pipeline
    kwargs['_additional_context_managers'] = [self._browser_pool]
    kwargs.setdefault('_logger', logging.getLogger(__name__))

    # Fall back to the browser-backed HTTP client when the caller did not bring one.
    if not kwargs.get('http_client'):
        kwargs['http_client'] = PlaywrightHttpClient()

    # Set default concurrency settings for browser crawlers if not provided
    if kwargs.get('concurrency_settings') is None:
        kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

    super().__init__(**kwargs)
try: # Only use the page context manager here — it sets the current page in a context variable, # making it accessible to PlaywrightHttpClient in subsequent pipeline steps. async with browser_page_context(crawlee_page.page): for hook in self._pre_navigation_hooks: async with self._shared_navigation_timeouts[context_id]: await hook(pre_navigation_context) # Yield should be inside the browser_page_context. yield pre_navigation_context finally: self._shared_navigation_timeouts.pop(context_id, None) def _prepare_request_interceptor( self, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None, payload: HttpPayload | None = None, ) -> Callable: """Create a request interceptor for Playwright to support non-GET methods with custom parameters. The interceptor modifies requests by adding custom headers and payload before they are sent. Args: method: HTTP method to use for the request. headers: Custom HTTP headers to send with the request. payload: Request body data for POST/PUT requests. """ async def route_handler(route: Route, _: PlaywrightRequest) -> None: await route.continue_(method=method, headers=dict(headers) if headers else None, post_data=payload) return route_handler async def _navigate( self, context: PlaywrightPreNavCrawlingContext, ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, Exception | None]: """Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library. Args: context: The basic crawling context to be enhanced. Raises: ValueError: If the browser pool is not initialized. SessionError: If the URL cannot be loaded by the browser. TimeoutError: If navigation does not succeed within the navigation timeout. Yields: The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links, infinite_scroll and block_requests). 
""" async with context.page: if context.session: session_cookies = context.session.cookies.get_cookies_as_playwright_format() await self._update_cookies(context.page, session_cookies) if context.request.headers: await context.page.set_extra_http_headers(context.request.headers.model_dump()) # Navigate to the URL and get response. if context.request.method != 'GET': # Call the notification only once warnings.warn( 'Using other request methods than GET or adding payloads has a high impact on performance' ' in recent versions of Playwright. Use only when necessary.', category=UserWarning, stacklevel=2, ) route_handler = self._prepare_request_interceptor( method=context.request.method, headers=context.request.headers, payload=context.request.payload, ) # Set route_handler only for current request await context.page.route(context.request.url, route_handler) try: async with self._shared_navigation_timeouts[id(context)] as remaining_timeout: response = await context.page.goto( context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options ) context.request.state = RequestState.AFTER_NAV except playwright.async_api.TimeoutError as exc: raise asyncio.TimeoutError from exc if response is None: raise SessionError(f'Failed to load the URL: {context.request.url}') # Set the loaded URL to the actual URL after redirection. 
def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction:
    """Build the link-extraction callback bound to the page of `context`.

    Args:
        context: The current crawling context.

    Returns:
        An async callable that collects links from the page and returns them as `Request` objects.
    """

    async def extract_links(
        *,
        selector: str = 'a',
        attribute: str = 'href',
        label: str | None = None,
        user_data: dict | None = None,
        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
        | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> list[Request]:
        """Extract links from the current page.

        The `PlaywrightCrawler` implementation of the `ExtractLinksFunction` function.
        """
        shared_user_data = user_data or {}

        robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

        # Default the enqueue strategy; the value also stays in `kwargs` for the link filter below.
        strategy = kwargs.setdefault('strategy', 'same-hostname')

        # Read the requested attribute from every matching element, dropping elements that lack it.
        raw_links: list[str] = []
        for element in await context.page.query_selector_all(selector):
            attribute_value = await element.get_attribute(attribute)
            if attribute_value is not None:
                raw_links.append(attribute_value)

        # Get base URL from <base> tag if present
        document_base_url = await context.page.evaluate('document.baseURI')
        base_url: str = document_base_url or context.request.loaded_url or context.request.url

        links_iterator: Iterator[str] = to_absolute_url_iterator(base_url, iter(raw_links), logger=context.log)

        skipped: Iterator[str] = iter([])
        if robots_txt_file:
            # `partition` returns the items failing the predicate first, then the ones passing it.
            skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)

        requests: list[Request] = []
        for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
            request_options = RequestOptions(
                url=url, user_data={**shared_user_data}, label=label, enqueue_strategy=strategy
            )

            if transform_request_function:
                transformed = transform_request_function(request_options)
                if transformed == 'skip':
                    continue
                if transformed != 'unchanged':
                    request_options = transformed

            try:
                request = Request.from_url(**request_options)
            except ValidationError as exc:
                context.log.debug(
                    f'Skipping URL "{url}" due to invalid format: {exc}. '
                    'This may be caused by a malformed URL or unsupported URL scheme. '
                    'Please ensure the URL is correct and retry.'
                )
            else:
                requests.append(request)

        # Report every URL disallowed by robots.txt; the reports run concurrently.
        skipped_tasks = []
        for skipped_url in skipped:
            skipped_tasks.append(asyncio.create_task(self._handle_skipped_request(skipped_url, 'robots_txt')))
        await asyncio.gather(*skipped_tasks)

        return requests

    return extract_links
async def _create_crawling_context(
    self, context: PlaywrightPostNavCrawlingContext
) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]:
    """Turn the post-navigation context into the full `PlaywrightCrawlingContext`.

    After the yielded context has been consumed, the cookies of the page's browser context are copied back into
    the session (when there is one), and a truthy value sent back into the generator is recorded as an error.

    Args:
        context: The post-navigation context to enhance.

    Yields:
        The crawling context with `infinite_scroll`, `extract_links`, `enqueue_links` and `block_requests` added.
    """
    page = context.page
    extract_links = self._create_extract_links_function(context)

    crawling_context = PlaywrightCrawlingContext(
        request=context.request,
        session=context.session,
        add_requests=context.add_requests,
        send_request=context.send_request,
        push_data=context.push_data,
        use_state=context.use_state,
        proxy_info=context.proxy_info,
        get_key_value_store=context.get_key_value_store,
        log=context.log,
        page=page,
        goto_options=context.goto_options,
        response=context.response,
        infinite_scroll=lambda: infinite_scroll(page),
        extract_links=extract_links,
        enqueue_links=self._create_enqueue_links_function(context, extract_links),
        block_requests=partial(block_requests, page=page),
    )

    # Whatever is sent back into the generator is treated as the error to report (or nothing).
    error = yield crawling_context

    # Persist cookies set by the browser so later requests of the same session reuse them.
    session = context.session
    if session:
        browser_cookies = await self._get_cookies(page)
        session.cookies.set_cookies_from_playwright_format(browser_cookies)

    # Collect data in case of errors, before the page object is closed.
    if error:
        await self.statistics.error_tracker.add(error=error, context=context, early=True)
async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
    """Find the robots.txt file for a given URL.

    Args:
        url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.

    Returns:
        The `RobotsTxtFile` found for `url`.
    """
    # `PlaywrightHttpClient` can only send requests from inside a browser page context, so a standalone
    # `ImpitHttpClient` takes its place here; any other configured client is used unchanged.
    if not isinstance(self._http_client, PlaywrightHttpClient):
        robots_http_client = self._http_client
    else:
        robots_http_client = ImpitHttpClient()

    return await RobotsTxtFile.find(url, http_client=robots_http_client)
class PlaywrightCrawlerOptions(
    _PlaywrightCrawlerAdditionalOptions,
    BasicCrawlerOptions[TCrawlingContext, StatisticsState],
    Generic[TCrawlingContext, TStatisticsState],
):
    """Arguments for the `PlaywrightCrawler` constructor.

    Combines the Playwright-specific keys of `_PlaywrightCrawlerAdditionalOptions` with `BasicCrawlerOptions`.
    It is intended for typing forwarded `__init__` arguments in the subclasses.
    """

    # NOTE(review): `TStatisticsState` is declared in `Generic[...]`, but `BasicCrawlerOptions` is parametrized
    # with the concrete `StatisticsState` - confirm whether the type variable was meant to be forwarded.
""" enqueue_links: EnqueueLinksFunction """The Playwright `EnqueueLinksFunction` implementation.""" extract_links: ExtractLinksFunction """The Playwright `ExtractLinksFunction` implementation.""" infinite_scroll: Callable[[], Awaitable[None]] """A function to perform infinite scrolling on the page. This scrolls to the bottom, triggering the loading of additional content if present.""" ================================================ FILE: src/crawlee/crawlers/_playwright/_playwright_http_client.py ================================================ from __future__ import annotations import contextvars from contextlib import AbstractAsyncContextManager, asynccontextmanager from typing import TYPE_CHECKING from typing_extensions import override from crawlee._types import HttpHeaders from crawlee.crawlers._playwright._types import PlaywrightHttpResponse from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse if TYPE_CHECKING: from collections.abc import AsyncGenerator from datetime import timedelta from playwright.async_api import Page from crawlee import Request from crawlee._types import HttpMethod, HttpPayload from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session from crawlee.statistics import Statistics _browser_page_context_var: contextvars.ContextVar[Page | None] = contextvars.ContextVar('browser_context', default=None) @asynccontextmanager async def browser_page_context(page: Page) -> AsyncGenerator[None, None]: """Asynchronous context manager for setting the current Playwright page in the context variable.""" token = _browser_page_context_var.set(page) try: yield finally: _browser_page_context_var.reset(token) class PlaywrightHttpClient(HttpClient): """HTTP client based on the Playwright library. This client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses) and to manage sessions, proxies, and error handling. 
@override
async def send_request(
    self,
    url: str,
    *,
    method: HttpMethod = 'GET',
    headers: HttpHeaders | dict[str, str] | None = None,
    payload: HttpPayload | None = None,
    session: Session | None = None,
    proxy_info: ProxyInfo | None = None,
    timeout: timedelta | None = None,
) -> HttpResponse:
    """Send an HTTP request through the `APIRequestContext` of the current Playwright page.

    Args:
        url: The URL to send the request to.
        method: The HTTP method to use.
        headers: The headers to include in the request.
        payload: The data to be sent as the request body.
        session: Accepted for interface compatibility; currently unused (see the TODO below).
        proxy_info: Accepted for interface compatibility; unused because the browser context's proxy applies.
        timeout: Maximum time allowed for the request. When falsy, Playwright's own default applies.

    Returns:
        The response wrapped in a `PlaywrightHttpResponse`.

    Raises:
        RuntimeError: If no page was made current via `browser_page_context`.
    """
    # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
    # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
    # https://github.com/apify/crawlee-python/issues/1055

    if isinstance(headers, dict) or headers is None:
        headers = HttpHeaders(headers or {})

    # The context variable holds the page made current by `browser_page_context`.
    page = _browser_page_context_var.get()

    if page is None:
        raise RuntimeError('Unable to create an `APIRequestContext` outside the browser context')

    # Playwright expects the timeout in milliseconds, so convert from the `timedelta` explicitly.
    timeout_ms = timeout.total_seconds() * 1000 if timeout else None

    # Proxies appropriate to the browser context are used
    response = await page.request.fetch(
        url_or_request=url,
        method=method.lower(),
        headers=dict(headers) if headers else None,
        data=payload,
        timeout=timeout_ms,
    )

    return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
@dataclass(frozen=True)
@docs_group('Crawling contexts')
class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
    """The pre navigation crawling context used by the `PlaywrightCrawler`.

    It extends the basic crawling context with the Playwright `Page` and page-level helpers, and is available
    before the navigation to the URL is performed.
    """

    page: Page
    """The Playwright `Page` object for the current page."""

    block_requests: BlockRequestsFunction
    """Blocks network requests matching specified URL patterns."""

    goto_options: GotoOptions
    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""

    async def get_snapshot(self) -> PageSnapshot:
        """Capture the HTML and a full-page JPEG screenshot of the crawled page.

        Each part is captured on a best-effort basis: a failure is logged with its traceback and the
        corresponding field of the snapshot is left as `None`.

        Returns:
            A `PageSnapshot` holding whichever parts could be captured.
        """
        html: str | None
        screenshot: bytes | None

        try:
            html = await self.page.content()
        except Exception:
            self.log.exception(f'Failed to get html snapshot for {self.request.url}.')
            html = None

        try:
            screenshot = await self.page.screenshot(full_page=True, type='jpeg')
        except Exception:
            self.log.exception(f'Failed to get page screenshot for {self.request.url}.')
            screenshot = None

        return PageSnapshot(html=html, screenshot=screenshot)
async def infinite_scroll(page: Page) -> None:
    """Scroll to the bottom of a page, handling loading of additional items.

    The page is scrolled with the mouse wheel in a loop while a background task counts the data requests
    (`xhr`, `fetch`, `websocket` and `other` resource types) the page issues. Scrolling stops once that
    count has stayed unchanged for `match_count_threshold` consecutive checks, performed one second apart.

    Args:
        page: Playwright `Page` to scroll.
    """
    finished = False

    match_count = 0
    match_count_threshold = 4

    old_request_count = 0
    new_request_count = 0

    def track_request(request: PlaywrightRequest) -> None:
        nonlocal new_request_count
        if request.resource_type in ['xhr', 'fetch', 'websocket', 'other']:
            new_request_count += 1

    async def scroll() -> None:
        body_scroll_height = await page.evaluate('() => document.body.scrollHeight')
        # Fall back to a large fixed distance when the page reports no scroll height.
        delta = body_scroll_height or 10000
        await page.mouse.wheel(delta_x=0, delta_y=delta)

    async def check_finished() -> None:
        nonlocal old_request_count, new_request_count, match_count, finished
        while True:
            if old_request_count == new_request_count:
                match_count += 1
                if match_count >= match_count_threshold:
                    finished = True
                    return
            else:
                match_count = 0
            old_request_count = new_request_count
            await asyncio.sleep(1)

    page.on('request', track_request)
    try:
        check_task = asyncio.create_task(check_finished(), name='infinite_scroll_check_finished_task')
        try:
            while not finished:
                await scroll()
                await page.wait_for_timeout(250)
        finally:
            if not check_task.done():
                check_task.cancel()
            with suppress(asyncio.CancelledError):
                await check_task
    finally:
        # Unregister the tracker so repeated calls do not pile up dead listeners on the page.
        page.remove_listener('request', track_request)
@dataclass(frozen=True)
class BlockedInfo:
    """Describe whether, and why, crawling is considered blocked.

    An empty `reason` means that the crawling is not blocked.
    """

    reason: str

    def __bool__(self) -> bool:
        """Evaluate to `True` only when a blocking reason is present."""
        return True if self.reason else False
@docs_group('Errors')
class HttpStatusCodeError(Exception):
    """Raised when the response status code indicates an error.

    The original `message` and the `status_code` are kept as attributes; the exception text combines both.
    """

    def __init__(self, message: str, status_code: int) -> None:
        self.message = message
        self.status_code = status_code
        full_message = f'{message} (status code: {status_code}).'
        super().__init__(full_message)
""" def __init__(self, wrapped_exception: Exception, crawling_context: BasicCrawlingContext) -> None: super().__init__() self.wrapped_exception = wrapped_exception self.crawling_context = crawling_context @docs_group('Errors') class ContextPipelineInterruptedError(Exception): """May be thrown in the initialization phase of a middleware to signal that the request should not be processed.""" @docs_group('Errors') class RequestCollisionError(Exception): """Raised when a request cannot be processed due to a conflict with required resources.""" ================================================ FILE: src/crawlee/events/__init__.py ================================================ from ._event_manager import EventManager from ._local_event_manager import LocalEventManager from ._types import ( Event, EventAbortingData, EventCrawlerStatusData, EventData, EventExitData, EventListener, EventMigratingData, EventPersistStateData, EventSystemInfoData, ) __all__ = [ 'Event', 'EventAbortingData', 'EventCrawlerStatusData', 'EventData', 'EventExitData', 'EventListener', 'EventManager', 'EventMigratingData', 'EventPersistStateData', 'EventSystemInfoData', 'LocalEventManager', ] ================================================ FILE: src/crawlee/events/_event_manager.py ================================================ # Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/events/event_manager.ts from __future__ import annotations import asyncio import inspect from collections import defaultdict from datetime import timedelta from functools import wraps from logging import getLogger from typing import TYPE_CHECKING, Any, Literal, TypedDict, cast, overload from pyee.asyncio import AsyncIOEventEmitter from crawlee._utils.context import ensure_context from crawlee._utils.docs import docs_group from crawlee._utils.recurring_task import RecurringTask from crawlee._utils.wait import wait_for_all_tasks_for_finish from crawlee.events._types import ( Event, 
def __init__(
    self,
    *,
    persist_state_interval: timedelta = timedelta(minutes=1),
    close_timeout: timedelta | None = None,
) -> None:
    """Initialize a new instance.

    Args:
        persist_state_interval: Interval between emitted `PersistState` events to maintain state persistence.
        close_timeout: Optional timeout for canceling pending event listeners if they exceed this duration.
    """

    def new_listener_mapping() -> defaultdict[EventListener[Any], list[WrappedListener]]:
        return defaultdict(list)

    # The manager is inactive until its async context is entered.
    self._active = False

    self._close_timeout = close_timeout
    self._persist_state_interval = persist_state_interval

    # Emitter that dispatches events to the wrapped listeners.
    self._event_emitter = AsyncIOEventEmitter()

    # References to the running listener tasks, kept so that their completion can be awaited.
    self._listener_tasks: set[asyncio.Task] = set()

    # event -> listener -> [wrapped_listener_1, wrapped_listener_2, ...]
    self._listeners_to_wrappers: dict[Event, dict[EventListener[Any], list[WrappedListener]]] = defaultdict(
        new_listener_mapping,
    )

    # Periodically emits the persist state event.
    self._emit_persist_state_event_rec_task = RecurringTask(
        func=self._emit_persist_state_event,
        delay=self._persist_state_interval,
    )
def on(self, *, event: Event, listener: EventListener[Any]) -> None:
    """Register an event listener for a specific event.

    The listener is wrapped in a coroutine that runs it as an `asyncio.Task`, tracks that task in
    `_listener_tasks` while it runs, and logs (instead of propagating) any `Exception` it raises.

    Args:
        event: The event for which to listen to.
        listener: The function (sync or async) which is to be called when the event is emitted.
    """
    signature = inspect.signature(listener)
    # Resolved once; used for both the task name and the error log record.
    listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__

    @wraps(cast('Callable[..., None | Awaitable[None]]', listener))
    async def listener_wrapper(event_data: EventData) -> None:
        try:
            bound_args = signature.bind(event_data)
        except TypeError:  # Parameterless listener
            bound_args = signature.bind()

        # If the listener is a coroutine function, just call it, otherwise, run it in a separate thread
        # to avoid blocking the event loop
        coro = (
            listener(*bound_args.args, **bound_args.kwargs)
            if inspect.iscoroutinefunction(listener)
            else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
        )

        listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')
        self._listener_tasks.add(listener_task)

        try:
            logger.debug('EventManager.on.listener_wrapper(): Awaiting listener task...')
            await listener_task
            logger.debug('EventManager.on.listener_wrapper(): Listener task completed.')
        except Exception:
            # We need to swallow the exception and just log it here, otherwise it could break the event emitter
            logger.exception(
                'Exception in the event listener',
                extra={
                    'event_name': event.value,
                    'listener_name': listener_name,
                },
            )
        finally:
            logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
            # `discard` rather than `remove`: the set may already have been cleared (e.g. while the manager
            # is closing), and a `KeyError` raised here would mask the real outcome of the listener.
            self._listener_tasks.discard(listener_task)

    self._listeners_to_wrappers[event][listener].append(listener_wrapper)
    self._event_emitter.add_listener(event.value, listener_wrapper)
@ensure_context
async def wait_for_all_listeners_to_complete(self, *, timeout: timedelta | None = None) -> None:
    """Wait for all currently executing event listeners to complete.

    Args:
        timeout: The maximum time to wait for the event listeners to finish. If they do not complete within
            the specified timeout, they will be canceled.
    """

    async def wait_for_listeners() -> None:
        """Await every tracked listener task and log each exception collected from them."""
        outcomes = await asyncio.gather(*self._listener_tasks, return_exceptions=True)
        failures = [outcome for outcome in outcomes if isinstance(outcome, Exception)]
        for failure in failures:
            logger.exception('Event listener raised an exception.', exc_info=failure)

    waiter = asyncio.create_task(wait_for_listeners(), name=f'Task-{wait_for_listeners.__name__}')
    await wait_for_all_tasks_for_finish(tasks=[waiter], logger=logger, timeout=timeout)
@classmethod
def from_config(cls, config: Configuration | None = None) -> LocalEventManager:
    """Create an instance whose intervals are taken from a `Configuration`.

    Args:
        config: The `Configuration` instance. Uses the global (default) one if not provided.
    """
    effective_config = config if config else Configuration.get_global_configuration()
    system_info_interval = effective_config.system_info_interval
    persist_state_interval = effective_config.persist_state_interval

    return cls(
        system_info_interval=system_info_interval,
        persist_state_interval=persist_state_interval,
    )
async def _emit_system_info_event(self) -> None:
    """Collect CPU and memory info in worker threads and emit them as a `SYSTEM_INFO` event."""
    current_cpu = await asyncio.to_thread(get_cpu_info)
    current_memory = await asyncio.to_thread(get_memory_info)

    self.emit(
        event=Event.SYSTEM_INFO,
        event_data=EventSystemInfoData(cpu_info=current_cpu, memory_info=current_memory),
    )
MemoryUsageInfo, Field(alias='memoryInfo'), ] @docs_group('Event data') class EventMigratingData(BaseModel): """Data for the migrating event.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) # The remaining time in seconds before the migration is forced and the process is killed # Optional because it's not present when the event handler is called manually time_remaining: Annotated[timedelta_secs | None, Field(alias='timeRemainingSecs')] = None @docs_group('Event data') class EventAbortingData(BaseModel): """Data for the aborting event.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) @docs_group('Event data') class EventExitData(BaseModel): """Data for the exit event.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) @docs_group('Event data') class EventCrawlerStatusData(BaseModel): """Data for the crawler status event.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) message: str """A message describing the current status of the crawler.""" crawler_id: int """The ID of the crawler that emitted the event.""" EventData = ( EventPersistStateData | EventSystemInfoData | EventMigratingData | EventAbortingData | EventExitData | EventCrawlerStatusData ) """A helper type for all possible event payloads""" WrappedListener = Callable[..., Coroutine[Any, Any, None]] TEvent = TypeVar('TEvent') EventListener = ( Callable[ [TEvent], None | Coroutine[Any, Any, None], ] | Callable[ [], None | Coroutine[Any, Any, None], ] ) """An event listener function - it can be both sync and async and may accept zero or one argument.""" ================================================ FILE: src/crawlee/events/py.typed ================================================ ================================================ FILE: src/crawlee/fingerprint_suite/__init__.py ================================================ from ._browserforge_adapter import BrowserforgeFingerprintGenerator as 
DefaultFingerprintGenerator from ._fingerprint_generator import FingerprintGenerator from ._header_generator import HeaderGenerator from ._types import HeaderGeneratorOptions, ScreenOptions __all__ = [ 'DefaultFingerprintGenerator', 'FingerprintGenerator', 'HeaderGenerator', 'HeaderGeneratorOptions', 'ScreenOptions', ] ================================================ FILE: src/crawlee/fingerprint_suite/_browserforge_adapter.py ================================================ from __future__ import annotations import random from collections.abc import Iterable from copy import deepcopy from functools import reduce from operator import or_ from typing import TYPE_CHECKING, Any, Literal import apify_fingerprint_datapoints from browserforge.bayesian_network import extract_json from browserforge.fingerprints import Fingerprint as bf_Fingerprint from browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator from browserforge.fingerprints import Screen from browserforge.headers.generator import HeaderGenerator as bf_HeaderGenerator from browserforge.headers.generator import ListOrString from typing_extensions import override from crawlee._utils.docs import docs_group from ._consts import BROWSER_TYPE_HEADER_KEYWORD from ._fingerprint_generator import FingerprintGenerator if TYPE_CHECKING: from browserforge.headers import Browser from ._types import HeaderGeneratorOptions, ScreenOptions, SupportedBrowserType class PatchedHeaderGenerator(bf_HeaderGenerator): """Browserforge `HeaderGenerator` that contains patches specific for our usage of the generator.""" def _get_accept_language_header(self, locales: tuple[str, ...] | list[str] | str) -> str: """Generate the Accept-Language header based on the given locales. Patched version due to PR of upstream repo not being merged: https://github.com/daijro/browserforge/pull/24 Args: locales: Locale(s). Returns: Accept-Language header string. """ # Convert to tuple if needed for consistent handling. 
if isinstance(locales, str): locales_tuple: tuple[str, ...] = (locales,) elif isinstance(locales, list): locales_tuple = tuple(locales) else: locales_tuple = locales # First locale does not include quality factor, q=1 is considered as implicit. additional_locales = [f'{locale};q={0.9 - index * 0.1:.1f}' for index, locale in enumerate(locales_tuple[1:])] return ','.join((locales_tuple[0], *additional_locales)) def generate( self, *, browser: Iterable[str | Browser] | None = None, os: ListOrString | None = None, device: ListOrString | None = None, locale: ListOrString | None = None, http_version: Literal[1, 2] | None = None, user_agent: ListOrString | None = None, strict: bool | None = None, request_dependent_headers: dict[str, str] | None = None, ) -> dict[str, str]: """Generate HTTP headers based on the specified parameters. For detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate` This patched version of the method adds additional quality checks on the output of the original method. It tries to generate headers several times until they match the requirements. Returns: A generated headers. """ # browserforge header generation can be flaky. Enforce basic QA on generated headers max_attempts = 10 single_browser = self._get_single_browser_type(browser) if single_browser == 'chrome': # `BrowserForge` header generator considers `chrome` in general sense and therefore will generate also # other `chrome` based browser headers. This adapter desires only specific subset of `chrome` headers # that contain all 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform' headers. # Increase max attempts as from `BrowserForge` header generator perspective even `chromium` # headers without `sec-...` headers are valid. max_attempts += 50 # Use browserforge to generate headers until it satisfies our additional requirements. 
for _attempt in range(max_attempts): generated_header: dict[str, str] = super().generate( browser=single_browser, os=os, device=device, locale=locale, http_version=http_version, user_agent=user_agent, strict=strict, request_dependent_headers=request_dependent_headers, ) if ('headless' in generated_header.get('User-Agent', '').lower()) or ( 'headless' in generated_header.get('sec-ch-ua', '').lower() ): # It can be a valid header, but we never want to leak "headless". Get a different one. continue if any( keyword in generated_header['User-Agent'] for keyword in self._get_expected_browser_keywords(single_browser) ): if single_browser == 'chrome' and not self._contains_all_sec_headers(generated_header): # Accept chromium header only with all sec headers. continue return generated_header raise RuntimeError('Failed to generate header.') def _contains_all_sec_headers(self, headers: dict[str, str]) -> bool: return all(header_name in headers for header_name in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform')) def _get_expected_browser_keywords(self, browser: str | None) -> set[str]: if not browser: # Allow all possible keywords when there is no preference for specific browser type. return reduce(or_, BROWSER_TYPE_HEADER_KEYWORD.values()) return BROWSER_TYPE_HEADER_KEYWORD[browser] def _get_single_browser_type(self, browser: Iterable[str | Browser] | None) -> str | None: """Get single browser type. Browserforge header generator accepts wider range of possible types. Narrow it to single optional string as that is how we use it. Handling the original multitype would be pointlessly complex. """ # In our case we never pass more than one browser type. In general case more browsers are just bigger pool to # select from, so narrowing it to any of them is still a valid action as we are going to pick just one anyway. 
if isinstance(browser, str): return browser if isinstance(browser, Iterable): choice = random.choice( [ single_browser if isinstance(single_browser, str) else single_browser.name for single_browser in browser ] ) if choice in {'chrome', 'firefox', 'safari', 'edge'}: return choice raise ValueError('Invalid browser type.') return None class PatchedFingerprintGenerator(bf_FingerprintGenerator): """Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo.""" def __init__( self, *, screen: Screen | None = None, strict: bool = False, mock_webrtc: bool = False, slim: bool = False, **header_kwargs, # noqa:ANN003 # Upstream repo types missing. ) -> None: """Initialize a new instance. Args: screen: Screen constraints for the generated fingerprint. strict: Whether to raise an exception if the constraints are too strict. mock_webrtc: Whether to mock WebRTC when injecting the fingerprint. slim: Disables performance-heavy evasions when injecting the fingerprint. **header_kwargs: Header generation options for `HeaderGenerator`. """ super().__init__(screen=screen, strict=strict, mock_webrtc=mock_webrtc, slim=slim) # Replace `self.header_generator` To make sure that we consistently use `PatchedHeaderGenerator` self.header_generator = PatchedHeaderGenerator(**header_kwargs) @docs_group('Other') class BrowserforgeFingerprintGenerator(FingerprintGenerator): """`FingerprintGenerator` adapter for fingerprint generator from `browserforge`. `browserforge` is a browser header and fingerprint generator: https://github.com/daijro/browserforge """ def __init__( self, *, header_options: HeaderGeneratorOptions | None = None, screen_options: ScreenOptions | None = None, mock_web_rtc: bool | None = None, slim: bool | None = None, ) -> None: """Initialize a new instance. All generator options are optional. If any value is not specified, then `None` is set in the options. Default values for options set to `None` are implementation detail of used fingerprint generator. 
Specific default values should not be relied upon. Use explicit values if it matters for your use case. Args: header_options: Collection of header related attributes that can be used by the fingerprint generator. screen_options: Defines the screen constrains for the fingerprint generator. mock_web_rtc: Whether to mock WebRTC when injecting the fingerprint. slim: Disables performance-heavy evasions when injecting the fingerprint. """ bf_options: dict[str, Any] = {'mock_webrtc': mock_web_rtc, 'slim': slim} if header_options is None: bf_header_options = {} else: bf_header_options = deepcopy(header_options.model_dump()) bf_header_options['browser'] = bf_header_options.pop('browsers', None) bf_header_options['os'] = bf_header_options.pop('operating_systems', None) bf_header_options['device'] = bf_header_options.pop('devices', None) bf_header_options['locale'] = bf_header_options.pop('locales', None) if screen_options is None: bf_options['screen'] = Screen() else: bf_options['screen'] = Screen(**screen_options.model_dump()) self._options = {**bf_options, **bf_header_options} self._generator = PatchedFingerprintGenerator() @override def generate(self) -> bf_Fingerprint: # browserforge fingerprint generation can be flaky # https://github.com/daijro/browserforge/issues/22" # During test runs around 10 % flakiness was detected. # Max attempt set to 10 as (0.1)^10 is considered sufficiently low probability. 
max_attempts = 10 for attempt in range(max_attempts): try: return self._generator.generate(**self._options) except ValueError: # noqa:PERF203 if attempt == max_attempts: raise raise RuntimeError('Failed to generate fingerprint.') class BrowserforgeHeaderGenerator: """`HeaderGenerator` adapter for fingerprint generator from `browserforge`.""" def __init__(self) -> None: self._generator = PatchedHeaderGenerator(locale=['en-US', 'en']) def generate(self, browser_type: SupportedBrowserType = 'chrome') -> dict[str, str]: """Generate headers.""" return self._generator.generate(browser=[browser_type]) def get_available_header_network() -> dict: """Get header network that contains possible header values.""" return extract_json(apify_fingerprint_datapoints.get_header_network()) def get_available_header_values(header_network: dict, node_name: str | set[str]) -> set[str]: """Get set of possible header values from available header network.""" node_names = {node_name} if isinstance(node_name, str) else node_name for node in header_network['nodes']: if node['name'] in node_names: return set(node['possibleValues']) return set() ================================================ FILE: src/crawlee/fingerprint_suite/_consts.py ================================================ from __future__ import annotations COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9' BROWSER_TYPE_HEADER_KEYWORD = { 'chrome': {'Chrome', 'CriOS'}, 'firefox': {'Firefox', 'FxiOS'}, 'edge': {'Edg', 'Edge', 'EdgA', 'EdgiOS'}, 'safari': {'Safari'}, } ================================================ FILE: src/crawlee/fingerprint_suite/_fingerprint_generator.py ================================================ from __future__ import annotations from abc import ABC, abstractmethod from typing import TYPE_CHECKING from crawlee._utils.docs import docs_group if TYPE_CHECKING: from browserforge.fingerprints import Fingerprint @docs_group('Other') class FingerprintGenerator(ABC): """A class for creating browser fingerprints that 
mimic browser fingerprints of real users.""" @abstractmethod def generate(self) -> Fingerprint: """Generate browser fingerprints. This is experimental feature. Return type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely it will change to custom `Fingerprint` class defined in this repo later. """ ================================================ FILE: src/crawlee/fingerprint_suite/_header_generator.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Literal from crawlee._types import HttpHeaders from crawlee._utils.docs import docs_group from crawlee.fingerprint_suite._browserforge_adapter import BrowserforgeHeaderGenerator if TYPE_CHECKING: from crawlee.fingerprint_suite._types import SupportedBrowserType def fingerprint_browser_type_from_playwright_browser_type( playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'], ) -> SupportedBrowserType: if playwright_browser_type in {'chromium', 'chrome'}: return 'chrome' if playwright_browser_type == 'firefox': return 'firefox' if playwright_browser_type == 'webkit': return 'safari' raise ValueError(f'Unsupported browser type: {playwright_browser_type}') @docs_group('Other') class HeaderGenerator: """Generate realistic looking or browser-like HTTP headers.""" def __init__(self) -> None: self._generator = BrowserforgeHeaderGenerator() def _select_specific_headers(self, all_headers: dict[str, str], header_names: set[str]) -> HttpHeaders: return HttpHeaders({key: value for key, value in all_headers.items() if key in header_names}) def get_specific_headers( self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chrome' ) -> HttpHeaders: """Return subset of headers based on the selected `header_names`. If no `header_names` are specified, full unfiltered headers are returned. 
""" all_headers = self._generator.generate(browser_type=browser_type) if not header_names: return HttpHeaders(all_headers) return self._select_specific_headers(all_headers, header_names) def get_common_headers(self) -> HttpHeaders: """Get common HTTP headers ("Accept", "Accept-Language"). We do not modify the "Accept-Encoding", "Connection" and other headers. They should be included and handled by the HTTP client or browser. """ all_headers = self._generator.generate() return self._select_specific_headers(all_headers, header_names={'Accept', 'Accept-Language'}) def get_random_user_agent_header(self) -> HttpHeaders: """Get a random User-Agent header.""" all_headers = self._generator.generate() return self._select_specific_headers(all_headers, header_names={'User-Agent'}) def get_user_agent_header( self, *, browser_type: SupportedBrowserType = 'chrome', ) -> HttpHeaders: """Get the User-Agent header based on the browser type.""" if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}: raise ValueError(f'Unsupported browser type: {browser_type}') all_headers = self._generator.generate(browser_type=browser_type) return self._select_specific_headers(all_headers, header_names={'User-Agent'}) def get_sec_ch_ua_headers( self, *, browser_type: SupportedBrowserType = 'chrome', ) -> HttpHeaders: """Get the sec-ch-ua headers based on the browser type.""" if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}: raise ValueError(f'Unsupported browser type: {browser_type}') all_headers = self._generator.generate(browser_type=browser_type) return self._select_specific_headers( all_headers, header_names={'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'} ) ================================================ FILE: src/crawlee/fingerprint_suite/_types.py ================================================ from __future__ import annotations from typing import Annotated, Literal from pydantic import BaseModel, ConfigDict, Field SupportedOperatingSystems = Literal['windows', 
'macos', 'linux', 'android', 'ios'] SupportedDevices = Literal['desktop', 'mobile'] SupportedHttpVersion = Literal['1', '2'] SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge'] class ScreenOptions(BaseModel): model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True) """Defines the screen constrains for the fingerprint generator.""" min_width: Annotated[float | None, Field(alias='minWidth')] = None """Minimal screen width constraint for the fingerprint generator.""" max_width: Annotated[float | None, Field(alias='maxWidth')] = None """Maximal screen width constraint for the fingerprint generator.""" min_height: Annotated[float | None, Field(alias='minHeight')] = None """Minimal screen height constraint for the fingerprint generator.""" max_height: Annotated[float | None, Field(alias='maxHeight')] = None """Maximal screen height constraint for the fingerprint generator.""" class HeaderGeneratorOptions(BaseModel): """Collection of header related attributes that can be used by the fingerprint generator.""" model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True) browsers: list[SupportedBrowserType] | None = None """List of BrowserSpecifications to generate the headers for.""" operating_systems: Annotated[list[SupportedOperatingSystems] | None, Field(alias='operatingSystems')] = None """List of operating systems to generate the headers for.""" devices: list[SupportedDevices] | None = None """List of devices to generate the headers for.""" locales: list[str] | None = None """List of at most 10 languages to include in the [Accept-Language] (https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header in the language format accepted by that header, for example `en`, `en-US` or `de`.""" http_version: Annotated[SupportedHttpVersion | None, Field(alias='httpVersion')] = None """HTTP version to be used for header generation (the headers differ depending on the 
version).""" strict: bool | None = None """If true, the generator will throw an error if it cannot generate headers based on the input.""" ================================================ FILE: src/crawlee/fingerprint_suite/py.typed ================================================ ================================================ FILE: src/crawlee/http_clients/__init__.py ================================================ from crawlee._utils.try_import import install_import_hook as _install_import_hook from crawlee._utils.try_import import try_import as _try_import # These imports have only mandatory dependencies, so they are imported directly. from ._base import HttpClient, HttpCrawlingResult, HttpResponse from ._impit import ImpitHttpClient _install_import_hook(__name__) # The following imports are wrapped in try_import to handle optional dependencies, # ensuring the module can still function even if these dependencies are missing. with _try_import(__name__, 'CurlImpersonateHttpClient'): from ._curl_impersonate import CurlImpersonateHttpClient with _try_import(__name__, 'HttpxHttpClient'): from ._httpx import HttpxHttpClient __all__ = [ 'CurlImpersonateHttpClient', 'HttpClient', 'HttpCrawlingResult', 'HttpResponse', 'HttpxHttpClient', 'ImpitHttpClient', ] ================================================ FILE: src/crawlee/http_clients/_base.py ================================================ from __future__ import annotations from abc import ABC, abstractmethod from dataclasses import dataclass from typing import TYPE_CHECKING, Protocol from crawlee._utils.docs import docs_group if TYPE_CHECKING: from collections.abc import AsyncIterator from contextlib import AbstractAsyncContextManager from datetime import timedelta from types import TracebackType from crawlee import Request from crawlee._types import HttpHeaders, HttpMethod, HttpPayload from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session from crawlee.statistics import Statistics 
@docs_group('Other') class HttpResponse(Protocol): """Define the interface that any HTTP response object must implement.""" @property def http_version(self) -> str: """The HTTP version used in the response.""" @property def status_code(self) -> int: """The HTTP status code received from the server.""" @property def headers(self) -> HttpHeaders: """The HTTP headers received in the response.""" async def read(self) -> bytes: """Read the entire content of the response body. This method loads the complete response body into memory at once. It should be used for responses received from regular HTTP requests (via `send_request` or `crawl` methods). Raises: RuntimeError: If called on a response received from the `stream` method. """ def read_stream(self) -> AsyncIterator[bytes]: """Iterate over the content of the response body in chunks. This method should be used for responses received from the `stream` method to process large response bodies without loading them entirely into memory. It allows for efficient processing of potentially large data by yielding chunks sequentially. Raises: RuntimeError: If the stream has already been consumed or if the response was not obtained from the `stream` method. """ @dataclass(frozen=True) @docs_group('Crawling contexts') class HttpCrawlingResult: """Result of an HTTP-only crawl. Mainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`, `ParselCrawlingContext`, ...). """ http_response: HttpResponse """The HTTP response received from the server.""" @docs_group('HTTP clients') class HttpClient(ABC): """An abstract base class for HTTP clients used in crawlers (`BasicCrawler` subclasses).""" @abstractmethod def __init__( self, *, persist_cookies_per_session: bool = True, ) -> None: """Initialize a new instance. Args: persist_cookies_per_session: Whether to persist cookies per HTTP session. """ self._persist_cookies_per_session = persist_cookies_per_session # Flag to indicate the context state. 
self._active = False @property def active(self) -> bool: """Indicate whether the context is active.""" return self._active @abstractmethod async def crawl( self, request: Request, *, session: Session | None = None, proxy_info: ProxyInfo | None = None, statistics: Statistics | None = None, timeout: timedelta | None = None, ) -> HttpCrawlingResult: """Perform the crawling for a given request. This method is called from `crawler.run()`. Args: request: The request to be crawled. session: The session associated with the request. proxy_info: The information about the proxy to be used. statistics: The statistics object to register status codes. timeout: Maximum time allowed to process the request. Raises: ProxyError: Raised if a proxy-related error occurs. Returns: The result of the crawling. """ @abstractmethod async def send_request( self, url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> HttpResponse: """Send an HTTP request via the client. This method is called from `context.send_request()` helper. Args: url: The URL to send the request to. method: The HTTP method to use. headers: The headers to include in the request. payload: The data to be sent as the request body. session: The session associated with the request. proxy_info: The information about the proxy to be used. timeout: Maximum time allowed to process the request. Raises: ProxyError: Raised if a proxy-related error occurs. Returns: The HTTP response received from the server.
""" @abstractmethod def stream( self, url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> AbstractAsyncContextManager[HttpResponse]: """Stream an HTTP request via the client. This method should be used for downloading potentially large data where you need to process the response body in chunks rather than loading it entirely into memory. Args: url: The URL to send the request to. method: The HTTP method to use. headers: The headers to include in the request. payload: The data to be sent as the request body. session: The session associated with the request. proxy_info: The information about the proxy to be used. timeout: The maximum time to wait for establishing the connection. Raises: ProxyError: Raised if a proxy-related error occurs. Returns: An async context manager yielding the HTTP response with streaming capabilities. """ @abstractmethod async def cleanup(self) -> None: """Clean up resources used by the client. This method is called when the client is no longer needed and should be overridden in subclasses to perform any necessary cleanup such as closing connections, releasing file handles, or other resource deallocation. """ async def __aenter__(self) -> HttpClient: """Initialize the client when entering the context manager. Raises: RuntimeError: If the context manager is already active. """ if self._active: raise RuntimeError(f'The {self.__class__.__name__} is already active.') self._active = True return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        """Deinitialize the client and clean up resources when exiting the context manager.

        Args:
            exc_type: The class of the exception that caused the context to be exited, if any.
            exc_value: The exception instance that caused the context to be exited, if any.
            traceback: The traceback associated with the exception, if any.

        Raises:
            RuntimeError: If the context manager is not active.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        try:
            await self.cleanup()
        finally:
            # Reset the flag even when `cleanup` raises; otherwise `__aenter__` would reject every later use
            # of this instance as "already active".
            self._active = False

================================================ FILE: src/crawlee/http_clients/_curl_impersonate.py ================================================ from __future__ import annotations import asyncio from contextlib import asynccontextmanager from http.cookiejar import Cookie from typing import TYPE_CHECKING, Any, cast from curl_cffi import CurlInfo from curl_cffi.const import CurlHttpVersion from curl_cffi.requests import AsyncSession from curl_cffi.requests.cookies import Cookies as CurlCookies from curl_cffi.requests.cookies import CurlMorsel from curl_cffi.requests.exceptions import ProxyError as CurlProxyError from curl_cffi.requests.exceptions import RequestException as CurlRequestError from curl_cffi.requests.exceptions import Timeout from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME from typing_extensions import override from crawlee._types import HttpHeaders, HttpMethod, HttpPayload from crawlee._utils.blocked import ROTATE_PROXY_ERRORS from crawlee._utils.docs import docs_group from crawlee.errors import ProxyError from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse if TYPE_CHECKING: from collections.abc import AsyncGenerator from datetime import timedelta from curl_cffi import Curl from curl_cffi.requests import Request as CurlRequest from curl_cffi.requests import Response from curl_cffi.requests.session import HttpMethod as CurlHttpMethod from crawlee import Request from crawlee._types import HttpMethod from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session from crawlee.statistics import Statistics class _EmptyCookies(CurlCookies): @override def get_cookies_for_curl(self, request: CurlRequest) -> list[CurlMorsel]: return [] @override def update_cookies_from_curl(self, morsels: list[CurlMorsel]) -> None: return None class
_AsyncSession(AsyncSession): @override def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) self._cookies = _EmptyCookies() class _CurlImpersonateResponse: """Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol.""" def __init__(self, response: Response) -> None: self._response = response @property def http_version(self) -> str: if self._response.http_version == CurlHttpVersion.NONE: return 'NONE' if self._response.http_version == CurlHttpVersion.V1_0: return 'HTTP/1.0' if self._response.http_version == CurlHttpVersion.V1_1: return 'HTTP/1.1' if self._response.http_version in { CurlHttpVersion.V2_0, CurlHttpVersion.V2TLS, CurlHttpVersion.V2_PRIOR_KNOWLEDGE, }: return 'HTTP/2' if self._response.http_version == CurlHttpVersion.V3: return 'HTTP/3' raise ValueError(f'Unknown HTTP version: {self._response.http_version}') @property def status_code(self) -> int: return self._response.status_code @property def headers(self) -> HttpHeaders: return HttpHeaders({key: value for key, value in self._response.headers.items() if value}) async def read(self) -> bytes: if self._response.astream_task: raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method') return self._response.content async def read_stream(self) -> AsyncGenerator[bytes, None]: if not self._response.astream_task: raise RuntimeError('Cannot read stream, Response not obtained from `stream` method.') if isinstance(self._response.astream_task, asyncio.Future) and self._response.astream_task.done(): raise RuntimeError('Cannot read stream, it was already consumed.') async for chunk in self._response.aiter_content(): yield chunk @docs_group('HTTP clients') class CurlImpersonateHttpClient(HttpClient): """HTTP client based on the `curl-cffi` library. 
This client uses the `curl-cffi` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses) and to manage sessions, proxies, and error handling. See the `HttpClient` class for more common information about HTTP clients. ### Usage ```python from crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler from crawlee.http_clients import CurlImpersonateHttpClient http_client = CurlImpersonateHttpClient() crawler = HttpCrawler(http_client=http_client) ``` """ def __init__( self, *, persist_cookies_per_session: bool = True, **async_session_kwargs: Any, ) -> None: """Initialize a new instance. Args: persist_cookies_per_session: Whether to persist cookies per HTTP session. async_session_kwargs: Additional keyword arguments for `curl_cffi.requests.AsyncSession`. """ super().__init__( persist_cookies_per_session=persist_cookies_per_session, ) self._async_session_kwargs = async_session_kwargs self._client_by_proxy_url = dict[str | None, AsyncSession]() @override async def crawl( self, request: Request, *, session: Session | None = None, proxy_info: ProxyInfo | None = None, statistics: Statistics | None = None, timeout: timedelta | None = None, ) -> HttpCrawlingResult: client = self._get_client(proxy_info.url if proxy_info else None) try: response = await client.request( url=request.url, method=self._convert_method(request.method), headers=request.headers, data=request.payload, cookies=session.cookies.jar if session else None, timeout=timeout.total_seconds() if timeout else None, ) except Timeout as exc: raise asyncio.TimeoutError from exc except CurlRequestError as exc: if self._is_proxy_error(exc): raise ProxyError from exc raise if statistics: statistics.register_status_code(response.status_code) if self._persist_cookies_per_session and session and response.curl: response_cookies = self._get_cookies(response.curl) session.cookies.store_cookies(response_cookies) request.loaded_url = response.url return HttpCrawlingResult( 
http_response=_CurlImpersonateResponse(response), ) @override async def send_request( self, url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> HttpResponse: if isinstance(headers, dict) or headers is None: headers = HttpHeaders(headers or {}) proxy_url = proxy_info.url if proxy_info else None client = self._get_client(proxy_url) try: response = await client.request( url=url, method=self._convert_method(method), headers=dict(headers) if headers else None, data=payload, cookies=session.cookies.jar if session else None, timeout=timeout.total_seconds() if timeout else None, ) except Timeout as exc: raise asyncio.TimeoutError from exc except CurlRequestError as exc: if self._is_proxy_error(exc): raise ProxyError from exc raise if self._persist_cookies_per_session and session and response.curl: response_cookies = self._get_cookies(response.curl) session.cookies.store_cookies(response_cookies) return _CurlImpersonateResponse(response) @asynccontextmanager @override async def stream( self, url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> AsyncGenerator[HttpResponse]: if isinstance(headers, dict) or headers is None: headers = HttpHeaders(headers or {}) proxy_url = proxy_info.url if proxy_info else None client = self._get_client(proxy_url) try: response = await client.request( url=url, method=self._convert_method(method), headers=dict(headers) if headers else None, data=payload, cookies=session.cookies.jar if session else None, stream=True, timeout=timeout.total_seconds() if timeout else None, ) except Timeout as exc: raise asyncio.TimeoutError from exc except CurlRequestError as exc: if 
self._is_proxy_error(exc): raise ProxyError from exc raise if self._persist_cookies_per_session and session and response.curl: response_cookies = self._get_cookies(response.curl) session.cookies.store_cookies(response_cookies) try: yield _CurlImpersonateResponse(response) finally: await response.aclose() def _get_client(self, proxy_url: str | None) -> AsyncSession: """Retrieve or create an asynchronous HTTP session for the given proxy URL. Check if an `AsyncSession` already exists for the specified proxy URL. If no session is found, create a new one with the provided proxy settings and additional session options. Store the new session for future use. """ # Check if a session for the given proxy URL has already been created. if proxy_url not in self._client_by_proxy_url: # Prepare a default kwargs for the new session. A provided proxy URL and a chrome for impersonation # are set as default options. kwargs: dict[str, Any] = { 'proxy': proxy_url, 'impersonate': CURL_DEFAULT_CHROME, } # Update the default kwargs with any additional user-provided kwargs. kwargs.update(self._async_session_kwargs) # Create and store the new session with the specified kwargs. self._client_by_proxy_url[proxy_url] = _AsyncSession(**kwargs) return self._client_by_proxy_url[proxy_url] def _convert_method(self, method: HttpMethod) -> CurlHttpMethod: """Convert from Crawlee HTTP method to curl-cffi HTTP method. Args: method: Crawlee HTTP method. Returns: Corresponding curl-cffi HTTP method. Raises: ValueError: If the provided HTTP method is not supported. 
""" method_upper = method.upper() # curl-cffi requires uppercase methods match method_upper: case 'GET': return 'GET' case 'POST': return 'POST' case 'PUT': return 'PUT' case 'DELETE': return 'DELETE' case 'OPTIONS': return 'OPTIONS' case 'HEAD': return 'HEAD' case 'TRACE': return 'TRACE' case 'PATCH': return 'PATCH' case _: raise ValueError(f'HTTP method {method} is not supported in {self.__class__.__name__}.') @staticmethod def _is_proxy_error(error: CurlRequestError) -> bool: """Determine whether the given error is related to a proxy issue. Check if the error message contains known proxy-related error keywords or if it is an instance of `CurlProxyError`. """ if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS): return True if isinstance(error, CurlProxyError): # noqa: SIM103 return True return False @staticmethod def _get_cookies(curl: Curl) -> list[Cookie]: cookies = list[Cookie]() # Implementation of getinfo always returns list[bytes] for CurlInfo.COOKIELIST. cookie_list = cast('list[bytes]', curl.getinfo(CurlInfo.COOKIELIST)) for curl_cookie in cookie_list: curl_morsel = CurlMorsel.from_curl_format(curl_cookie) cookie = curl_morsel.to_cookiejar_cookie() cookies.append(cookie) return cookies async def cleanup(self) -> None: for client in self._client_by_proxy_url.values(): await client.close() self._client_by_proxy_url.clear() ================================================ FILE: src/crawlee/http_clients/_httpx.py ================================================ from __future__ import annotations import asyncio from contextlib import asynccontextmanager from logging import getLogger from typing import TYPE_CHECKING, Any, cast import httpx from typing_extensions import override from crawlee._types import HttpHeaders from crawlee._utils.blocked import ROTATE_PROXY_ERRORS from crawlee._utils.docs import docs_group from crawlee.errors import ProxyError from crawlee.fingerprint_suite import HeaderGenerator from crawlee.http_clients import HttpClient, 
HttpCrawlingResult, HttpResponse if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator from datetime import timedelta from ssl import SSLContext from crawlee import Request from crawlee._types import HttpMethod, HttpPayload from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session from crawlee.statistics import Statistics logger = getLogger(__name__) class _HttpxResponse: """Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol.""" def __init__(self, response: httpx.Response) -> None: self._response = response @property def http_version(self) -> str: return self._response.http_version @property def status_code(self) -> int: return self._response.status_code @property def headers(self) -> HttpHeaders: return HttpHeaders(dict(self._response.headers)) async def read(self) -> bytes: if not self._response.is_closed: raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method') return await self._response.aread() async def read_stream(self) -> AsyncIterator[bytes]: if self._response.is_stream_consumed: raise RuntimeError('Stream is already consumed.') else: async for chunk in self._response.aiter_bytes(): yield chunk class _HttpxTransport(httpx.AsyncHTTPTransport): """HTTP transport adapter that stores response cookies in a `Session`. This transport adapter modifies the handling of HTTP requests to update the session cookies based on the response cookies, ensuring that the cookies are stored in the session object rather than the `HTTPX` client itself. 
""" @override async def handle_async_request(self, request: httpx.Request) -> httpx.Response: response = await super().handle_async_request(request) response.request = request if session := cast('Session', request.extensions.get('crawlee_session')): session.cookies.store_cookies(list(response.cookies.jar)) if 'Set-Cookie' in response.headers: del response.headers['Set-Cookie'] return response @docs_group('HTTP clients') class HttpxHttpClient(HttpClient): """HTTP client based on the `HTTPX` library. This client uses the `HTTPX` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses) and to manage sessions, proxies, and error handling. See the `HttpClient` class for more common information about HTTP clients. ### Usage ```python from crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler from crawlee.http_clients import HttpxHttpClient http_client = HttpxHttpClient() crawler = HttpCrawler(http_client=http_client) ``` """ _DEFAULT_HEADER_GENERATOR = HeaderGenerator() def __init__( self, *, persist_cookies_per_session: bool = True, http1: bool = True, http2: bool = True, verify: str | bool | SSLContext = True, header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR, **async_client_kwargs: Any, ) -> None: """Initialize a new instance. Args: persist_cookies_per_session: Whether to persist cookies per HTTP session. http1: Whether to enable HTTP/1.1 support. http2: Whether to enable HTTP/2 support. verify: SSL certificates used to verify the identity of requested hosts. header_generator: Header generator instance to use for generating common headers. async_client_kwargs: Additional keyword arguments for `httpx.AsyncClient`. 
""" super().__init__( persist_cookies_per_session=persist_cookies_per_session, ) self._http1 = http1 self._http2 = http2 self._async_client_kwargs = async_client_kwargs self._header_generator = header_generator self._ssl_context = httpx.create_ssl_context(verify=verify) self._transport: _HttpxTransport | None = None self._client_by_proxy_url = dict[str | None, httpx.AsyncClient]() @override async def crawl( self, request: Request, *, session: Session | None = None, proxy_info: ProxyInfo | None = None, statistics: Statistics | None = None, timeout: timedelta | None = None, ) -> HttpCrawlingResult: client = self._get_client(proxy_info.url if proxy_info else None) headers = self._combine_headers(request.headers) http_request = client.build_request( url=request.url, method=request.method, headers=headers, content=request.payload, cookies=session.cookies.jar if session else None, extensions={'crawlee_session': session if self._persist_cookies_per_session else None}, timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT, ) try: response = await client.send(http_request) except httpx.TimeoutException as exc: raise asyncio.TimeoutError from exc except httpx.TransportError as exc: if self._is_proxy_error(exc): raise ProxyError from exc raise if statistics: statistics.register_status_code(response.status_code) request.loaded_url = str(response.url) return HttpCrawlingResult( http_response=_HttpxResponse(response), ) @override async def send_request( self, url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> HttpResponse: client = self._get_client(proxy_info.url if proxy_info else None) http_request = self._build_request( client=client, url=url, method=method, headers=headers, payload=payload, session=session, timeout=httpx.Timeout(timeout.total_seconds()) if 
timeout is not None else None, ) try: response = await client.send(http_request) except httpx.TimeoutException as exc: raise asyncio.TimeoutError from exc except httpx.TransportError as exc: if self._is_proxy_error(exc): raise ProxyError from exc raise return _HttpxResponse(response) @asynccontextmanager @override async def stream( self, url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> AsyncGenerator[HttpResponse]: client = self._get_client(proxy_info.url if proxy_info else None) http_request = self._build_request( client=client, url=url, method=method, headers=headers, payload=payload, session=session, timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None, ) try: response = await client.send(http_request, stream=True) except httpx.TimeoutException as exc: raise asyncio.TimeoutError from exc try: yield _HttpxResponse(response) finally: await response.aclose() def _build_request( self, client: httpx.AsyncClient, url: str, method: HttpMethod, headers: HttpHeaders | dict[str, str] | None, payload: HttpPayload | None, session: Session | None = None, timeout: httpx.Timeout | None = None, ) -> httpx.Request: """Build an `httpx.Request` using the provided parameters.""" if isinstance(headers, dict) or headers is None: headers = HttpHeaders(headers or {}) headers = self._combine_headers(headers) return client.build_request( url=url, method=method, headers=dict(headers) if headers else None, content=payload, extensions={'crawlee_session': session if self._persist_cookies_per_session else None}, timeout=timeout or httpx.USE_CLIENT_DEFAULT, ) def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient: """Retrieve or create an HTTP client for the given proxy URL. If a client for the specified proxy URL does not exist, create and store a new one. 
""" if not self._transport: # Configure connection pool limits and keep-alive connections for transport limits = self._async_client_kwargs.get( 'limits', httpx.Limits(max_connections=1000, max_keepalive_connections=200) ) self._transport = _HttpxTransport( http1=self._http1, http2=self._http2, verify=self._ssl_context, limits=limits, ) if proxy_url not in self._client_by_proxy_url: # Prepare a default kwargs for the new client. kwargs: dict[str, Any] = { 'proxy': proxy_url, 'http1': self._http1, 'http2': self._http2, 'follow_redirects': True, } # Update the default kwargs with any additional user-provided kwargs. kwargs.update(self._async_client_kwargs) kwargs.update( { 'transport': self._transport, 'verify': self._ssl_context, } ) client = httpx.AsyncClient(**kwargs) self._client_by_proxy_url[proxy_url] = client return self._client_by_proxy_url[proxy_url] def _combine_headers(self, explicit_headers: HttpHeaders | None) -> HttpHeaders | None: """Merge default headers with explicit headers for an HTTP request. Generate a final set of request headers by combining default headers, a random User-Agent header, and any explicitly provided headers. """ common_headers = self._header_generator.get_common_headers() if self._header_generator else HttpHeaders() user_agent_header = ( self._header_generator.get_random_user_agent_header() if self._header_generator else HttpHeaders() ) explicit_headers = explicit_headers or HttpHeaders() headers = common_headers | user_agent_header | explicit_headers return headers or None @staticmethod def _is_proxy_error(error: httpx.TransportError) -> bool: """Determine whether the given error is related to a proxy issue. Check if the error is an instance of `httpx.ProxyError` or if its message contains known proxy-related error keywords. 
""" if isinstance(error, httpx.ProxyError): return True if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS): # noqa: SIM103 return True return False async def cleanup(self) -> None: for client in self._client_by_proxy_url.values(): await client.aclose() self._client_by_proxy_url.clear() if self._transport: await self._transport.aclose() self._transport = None ================================================ FILE: src/crawlee/http_clients/_impit.py ================================================ from __future__ import annotations import asyncio from contextlib import asynccontextmanager from logging import getLogger from typing import TYPE_CHECKING, Any, TypedDict from cachetools import LRUCache from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError from impit import ProxyError as ImpitProxyError from typing_extensions import override from crawlee._types import HttpHeaders from crawlee._utils.blocked import ROTATE_PROXY_ERRORS from crawlee._utils.docs import docs_group from crawlee.errors import ProxyError from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse if TYPE_CHECKING: from collections.abc import AsyncGenerator, AsyncIterator from datetime import timedelta from http.cookiejar import CookieJar from crawlee import Request from crawlee._types import HttpMethod, HttpPayload from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session from crawlee.statistics import Statistics logger = getLogger(__name__) class _ClientCacheEntry(TypedDict): """Type definition for client cache entries.""" client: AsyncClient cookie_jar: CookieJar | None class _ImpitResponse: """Adapter class for `impit.Response` to conform to the `HttpResponse` protocol.""" def __init__(self, response: Response) -> None: self._response = response @property def http_version(self) -> str: return str(self._response.http_version) @property def status_code(self) -> int: return 
int(self._response.status_code) @property def headers(self) -> HttpHeaders: return HttpHeaders(dict(self._response.headers)) async def read(self) -> bytes: if not self._response.is_closed: raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method') return self._response.content async def read_stream(self) -> AsyncIterator[bytes]: if self._response.is_stream_consumed: raise RuntimeError('Stream is already consumed.') else: async for chunk in self._response.aiter_bytes(): yield chunk @docs_group('HTTP clients') class ImpitHttpClient(HttpClient): """HTTP client based on the `impit` library. This client uses the `impit` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses) and to manage sessions, proxies, and error handling. See the `HttpClient` class for more common information about HTTP clients. ### Usage ```python from crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler from crawlee.http_clients import ImpitHttpClient http_client = ImpitHttpClient() crawler = HttpCrawler(http_client=http_client) ``` """ def __init__( self, *, persist_cookies_per_session: bool = True, http3: bool = False, verify: bool = True, browser: Browser | None = 'firefox', **async_client_kwargs: Any, ) -> None: """Initialize a new instance. Args: persist_cookies_per_session: Whether to persist cookies per HTTP session. http3: Whether to enable HTTP/3 support. verify: SSL certificates used to verify the identity of requested hosts. browser: Browser to impersonate. async_client_kwargs: Additional keyword arguments for `impit.AsyncClient`. 
""" super().__init__( persist_cookies_per_session=persist_cookies_per_session, ) self._http3 = http3 self._verify = verify self._browser = browser self._async_client_kwargs = async_client_kwargs self._client_by_proxy_url = LRUCache[str | None, _ClientCacheEntry](maxsize=10) @override async def crawl( self, request: Request, *, session: Session | None = None, proxy_info: ProxyInfo | None = None, statistics: Statistics | None = None, timeout: timedelta | None = None, ) -> HttpCrawlingResult: client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None) try: response = await client.request( url=request.url, method=request.method, content=request.payload, headers=dict(request.headers) if request.headers else None, timeout=timeout.total_seconds() if timeout else None, ) except TimeoutException as exc: raise asyncio.TimeoutError from exc except (TransportError, HTTPError) as exc: if self._is_proxy_error(exc): raise ProxyError from exc raise if statistics: statistics.register_status_code(response.status_code) request.loaded_url = str(response.url) return HttpCrawlingResult(http_response=_ImpitResponse(response)) @override async def send_request( self, url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> HttpResponse: if isinstance(headers, dict) or headers is None: headers = HttpHeaders(headers or {}) client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None) try: response = await client.request( method=method, url=url, content=payload, headers=dict(headers) if headers else None, timeout=timeout.total_seconds() if timeout else None, ) except TimeoutException as exc: raise asyncio.TimeoutError from exc except (TransportError, HTTPError) as exc: if self._is_proxy_error(exc): raise ProxyError from 
exc raise return _ImpitResponse(response) @asynccontextmanager @override async def stream( self, url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> AsyncGenerator[HttpResponse]: client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None) try: response = await client.request( method=method, url=url, content=payload, headers=dict(headers) if headers else None, timeout=timeout.total_seconds() if timeout else None, stream=True, ) except TimeoutException as exc: raise asyncio.TimeoutError from exc try: yield _ImpitResponse(response) finally: # TODO: https://github.com/apify/impit/issues/242 # Quickly closing Response while reading the response body causes an error in the Rust generator in `impit`. # With a short sleep and sync closing, the error does not occur. # Replace with `response.aclose` when this is resolved in impit. await asyncio.sleep(0.01) response.close() def _get_client(self, proxy_url: str | None, cookie_jar: CookieJar | None) -> AsyncClient: """Retrieve or create an HTTP client for the given proxy URL. If a client for the specified proxy URL does not exist, create and store a new one. """ cached_data = self._client_by_proxy_url.get(proxy_url) if cached_data: client = cached_data['client'] client_cookie_jar = cached_data['cookie_jar'] if client_cookie_jar is cookie_jar: # If the cookie jar matches, return the existing client. return client # Prepare a default kwargs for the new client. kwargs: dict[str, Any] = { 'proxy': proxy_url, 'http3': self._http3, 'verify': self._verify, 'follow_redirects': True, 'browser': self._browser, } # Update the default kwargs with any additional user-provided kwargs. 
kwargs.update(self._async_client_kwargs) client = AsyncClient(**kwargs, cookie_jar=cookie_jar) self._client_by_proxy_url[proxy_url] = _ClientCacheEntry(client=client, cookie_jar=cookie_jar) return client @staticmethod def _is_proxy_error(error: HTTPError) -> bool: """Determine whether the given error is related to a proxy issue. Check if the error message contains known proxy-related error keywords. """ if isinstance(error, ImpitProxyError): return True if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS): # noqa: SIM103 return True return False @override async def cleanup(self) -> None: """Clean up resources used by the HTTP client.""" self._client_by_proxy_url.clear() ================================================ FILE: src/crawlee/otel/__init__.py ================================================ from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor __all__ = [ 'CrawlerInstrumentor', ] ================================================ FILE: src/crawlee/otel/crawler_instrumentor.py ================================================ from __future__ import annotations import inspect from typing import TYPE_CHECKING, Any from opentelemetry.instrumentation.instrumentor import BaseInstrumentor from opentelemetry.instrumentation.utils import unwrap from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD from opentelemetry.semconv.attributes.url_attributes import URL_FULL from opentelemetry.trace import get_tracer from wrapt import wrap_function_wrapper from crawlee._utils.docs import docs_group from crawlee.crawlers import BasicCrawler, ContextPipeline from crawlee.crawlers._basic._context_pipeline import _Middleware if TYPE_CHECKING: from collections.abc import Callable from crawlee.crawlers import BasicCrawlingContext @docs_group('Other') class CrawlerInstrumentor(BaseInstrumentor): """Helper class for instrumenting crawlers with 
OpenTelemetry."""

    def __init__(
        self, *, instrument_classes: list[type] | None = None, request_handling_instrumentation: bool = True
    ) -> None:
        """Initialize the instrumentor.

        Args:
            instrument_classes: List of classes to be instrumented - all their public methods and coroutines will be
                wrapped by generic instrumentation wrapper that will create spans for them.
            request_handling_instrumentation: When `True`, the most relevant methods in the request handling pipeline
                will be instrumented. When `False`, no request handling instrumentation will be done.
        """
        self._tracer = get_tracer(__name__)

        # Generic wrapper for coroutine functions: runs the awaited call inside a span named after the function.
        async def _simple_async_wrapper(wrapped: Any, _: Any, args: Any, kwargs: Any) -> Any:
            with self._tracer.start_as_current_span(
                name=wrapped.__name__, attributes={CODE_FUNCTION_NAME: wrapped.__qualname__}
            ):
                return await wrapped(*args, **kwargs)

        # Generic wrapper for plain functions: runs the call inside a span named after the function.
        def _simple_wrapper(wrapped: Any, _: Any, args: Any, kwargs: Any) -> Any:
            with self._tracer.start_as_current_span(
                name=wrapped.__name__, attributes={CODE_FUNCTION_NAME: wrapped.__qualname__}
            ):
                return wrapped(*args, **kwargs)

        # Wrapper used for `__init__`: same span, but the wrapped call's result is not returned.
        def _init_wrapper(wrapped: Any, _: Any, args: Any, kwargs: Any) -> None:
            with self._tracer.start_as_current_span(
                name=wrapped.__name__, attributes={CODE_FUNCTION_NAME: wrapped.__qualname__}
            ):
                wrapped(*args, **kwargs)

        # Registry of (class, attribute name, wrapper) triples applied in `_instrument` and reverted in `_uninstrument`.
        self._instrumented: list[tuple[Any, str, Callable]] = []
        self._simple_wrapper = _simple_wrapper
        self._simple_async_wrapper = _simple_async_wrapper
        self._init_wrapper = _init_wrapper

        if instrument_classes:
            for _class in instrument_classes:
                self._instrument_all_public_methods(on_class=_class)

        if request_handling_instrumentation:

            # Span per middleware step, named after the middleware generator and the wrapped method.
            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                with self._tracer.start_as_current_span(
                    name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined]  # valid in our context
                    attributes={
                        URL_FULL: instance.input_context.request.url,
                        CODE_FUNCTION_NAME: instance.generator.__qualname__,  # type:ignore[attr-defined]  # valid in our context
                    },
                ):
                    return await wrapped(*args, **kwargs)

            # Span around the whole pipeline call; the final consumer gets its own nested `request_handler` span.
            async def context_pipeline_wrapper(
                wrapped: Any, _: ContextPipeline[BasicCrawlingContext], args: Any, kwargs: Any
            ) -> Any:
                # Both values are read positionally from `args`.
                context = args[0]
                final_context_consumer = args[1]

                async def wrapped_final_consumer(*args: Any, **kwargs: Any) -> Any:
                    with self._tracer.start_as_current_span(
                        name='request_handler',
                        attributes={URL_FULL: context.request.url, HTTP_REQUEST_METHOD: context.request.method},
                    ):
                        return await final_context_consumer(*args, **kwargs)

                with self._tracer.start_as_current_span(
                    name='ContextPipeline',
                    attributes={URL_FULL: context.request.url, HTTP_REQUEST_METHOD: context.request.method},
                ):
                    # The original consumer is replaced by the span-producing one.
                    return await wrapped(context, wrapped_final_consumer, **kwargs)

            # Span around committing the request handler result; the context is the first positional argument.
            async def _commit_request_handler_result_wrapper(
                wrapped: Callable[[Any], Any], _: BasicCrawler, args: Any, kwargs: Any
            ) -> Any:
                context = args[0]
                with self._tracer.start_as_current_span(
                    name='Commit results',
                    attributes={URL_FULL: context.request.url, HTTP_REQUEST_METHOD: context.request.method},
                ):
                    return await wrapped(*args, **kwargs)

            # Handpicked interesting methods to instrument
            self._instrumented.extend(
                [
                    (_Middleware, 'action', middleware_wrapper),
                    (_Middleware, 'cleanup', middleware_wrapper),
                    (ContextPipeline, '__call__', context_pipeline_wrapper),
                    # Name-mangled form of the private `__run_task_function` attribute of `BasicCrawler`.
                    (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                    (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
                ]
            )

    def instrumentation_dependencies(self) -> list[str]:
        """Return a list of python packages with versions that will be instrumented."""
        return ['crawlee']

    def _instrument_all_public_methods(self, on_class: type) -> None:
        """Register generic span wrappers for every public function and coroutine of `on_class`.

        Members whose name starts with an underscore are skipped; `__init__` is registered
        separately with the init wrapper.
        """
        public_coroutines = {
            name
            for name, member in inspect.getmembers(on_class, predicate=inspect.iscoroutinefunction)
            if not name.startswith('_')
        }
        # Coroutine functions also satisfy `inspect.isfunction`, so subtract them to avoid double registration.
        public_methods = {
            name
            for name, member in inspect.getmembers(on_class, predicate=inspect.isfunction)
            if not name.startswith('_')
        } - public_coroutines

        for coroutine in public_coroutines:
            self._instrumented.append((on_class, coroutine, self._simple_async_wrapper))

        for method in public_methods:
            self._instrumented.append((on_class, method, self._simple_wrapper))

        self._instrumented.append((on_class, '__init__', self._init_wrapper))

    def _instrument(self, **_: Any) -> None:
        """Apply every registered wrapper with `wrap_function_wrapper`."""
        for _class, method, wrapper in self._instrumented:
            wrap_function_wrapper(_class, method, wrapper)

    def _uninstrument(self, **_: Any) -> None:
        """Remove every registered wrapper with `unwrap`."""
        for _class, method, __ in self._instrumented:
            unwrap(_class, method)


================================================ FILE: src/crawlee/project_template/cookiecutter.json ================================================ { "project_name": "crawlee-python-project", "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"], "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}", "http_client": ["impit", "httpx", "curl-impersonate"], "package_manager": ["poetry", "pip", "uv"], "enable_apify_integration": false, "install_project": true, "start_url": "https://crawlee.dev", "_jinja2_env_vars": { "line_statement_prefix": "# %" }, "_extensions": ["jinja2.ext.do"] } ================================================ FILE: src/crawlee/project_template/hooks/post_gen_project.py ================================================ import platform import subprocess from pathlib import Path # % if cookiecutter.package_manager in ['poetry', 'uv'] Path('requirements.txt').unlink() # % if cookiecutter.install_project == True # % if cookiecutter.package_manager == 'poetry' subprocess.check_call(['poetry', 'install']) # % elif cookiecutter.package_manager == 'uv' subprocess.check_call(['uv', 'sync']) # % endif # % if cookiecutter.crawler_type == 'playwright' manager = "{{ cookiecutter.package_manager }}"
subprocess.check_call([manager, 'run', 'playwright', 'install']) # % endif # % endif # % elif cookiecutter.package_manager == 'pip' import venv # noqa: E402 # Create a virtual environment venv_root = Path('.venv') venv.main([str(venv_root)]) # % if cookiecutter.install_project == True if platform.system() == 'Windows': # noqa: SIM108 path = venv_root / 'Scripts' else: path = venv_root / 'bin' # Install requirements and generate requirements.txt as an impromptu lockfile subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt']) Path('requirements.txt').write_text( subprocess.check_output([str(path / 'pip'), 'freeze']).decode() ) # % if cookiecutter.crawler_type == 'playwright' subprocess.check_call([str(path / 'playwright'), 'install']) # % endif # % endif # % endif ================================================ FILE: src/crawlee/project_template/hooks/pre_gen_project.py ================================================ # % if cookiecutter.package_manager in ['poetry', 'uv'] import subprocess import shutil import re import sys manager = "{{cookiecutter.package_manager}}" manager_text = manager.title() # % if cookiecutter.package_manager == 'poetry' version_regex = r'Poetry \(version 2\..*\)' r_version = '2.x' # % elif cookiecutter.package_manager == 'uv' version_regex = r'uv (0\..*)' r_version = '0.x' # % endif # Check if package manager is available in PATH if not shutil.which(manager): sys.stderr.write(f'\nError: You selected {manager_text} as your package manager, but it is not installed. 
Please install it and try again.\n') sys.exit(1) # Check if the package manager is executable try: version = subprocess.check_output([manager, '--version']).decode().strip() except OSError: sys.stderr.write(f'\nError: Your selected package manager {manager_text} was found but failed to execute.\n') sys.exit(1) # Check if the version matches the required regex if not re.match(version_regex, version): sys.stderr.write(f'\nError: Your selected package manager {manager_text} requires version {r_version}, but {version} is installed.\n') sys.exit(1) # % endif ================================================ FILE: src/crawlee/project_template/templates/main.py ================================================ # % if cookiecutter.enable_apify_integration from apify import Actor # % endif # % block import required # % endblock # % if cookiecutter.http_client == 'curl-impersonate' from crawlee.http_clients import CurlImpersonateHttpClient # % elif cookiecutter.http_client == 'httpx' from crawlee.http_clients import HttpxHttpClient # % elif cookiecutter.http_client == 'impit' from crawlee.http_clients import ImpitHttpClient # % endif from .routes import router # % filter truncate(0, end='') # % block http_client_instantiation # % if cookiecutter.http_client == 'curl-impersonate' http_client=CurlImpersonateHttpClient(), # % elif cookiecutter.http_client == 'httpx' http_client=HttpxHttpClient(), # % elif cookiecutter.http_client == 'impit' http_client=ImpitHttpClient(), # % endif # % endblock # % endfilter # % if self.pre_main is defined {{self.pre_main()}} # % endif async def main() -> None: """The crawler entry point.""" # % filter truncate(0, end='') # % block instantiation required # % endblock # % endfilter # % if cookiecutter.enable_apify_integration async with Actor: # % set indent_width = 8 # % else # % set indent_width = 4 # % endif # % filter indent(width=indent_width, first=True) {{self.instantiation()}} await crawler.run( [ '{{ cookiecutter.start_url }}', ] ) # % 
endfilter ================================================ FILE: src/crawlee/project_template/templates/main_beautifulsoup.py ================================================ # % extends 'main.py' # % block import from crawlee.crawlers import BeautifulSoupCrawler # % endblock # % block instantiation crawler = BeautifulSoupCrawler( request_handler=router, max_requests_per_crawl=10, {{ self.http_client_instantiation() }}) # % endblock ================================================ FILE: src/crawlee/project_template/templates/main_parsel.py ================================================ # % extends 'main.py' # % block import from crawlee.crawlers import ParselCrawler # % endblock # % block instantiation crawler = ParselCrawler( request_handler=router, max_requests_per_crawl=10, {{ self.http_client_instantiation() }}) # % endblock ================================================ FILE: src/crawlee/project_template/templates/main_playwright.py ================================================ # % extends 'main.py' # % block import from crawlee.crawlers import PlaywrightCrawler # % endblock # % block instantiation crawler = PlaywrightCrawler( request_handler=router, headless=True, max_requests_per_crawl=10, {{ self.http_client_instantiation() }}) # % endblock ================================================ FILE: src/crawlee/project_template/templates/main_playwright_camoufox.py ================================================ # % extends 'main.py' # % block import from camoufox import AsyncNewBrowser from typing_extensions import override from crawlee._utils.context import ensure_context from crawlee.browsers import PlaywrightBrowserPlugin, PlaywrightBrowserController, BrowserPool from crawlee.crawlers import PlaywrightCrawler # % endblock # % block pre_main class CamoufoxPlugin(PlaywrightBrowserPlugin): """Example browser plugin that uses Camoufox Browser, but otherwise keeps the functionality of PlaywrightBrowserPlugin.""" @ensure_context @override async def 
new_browser(self) -> PlaywrightBrowserController: if not self._playwright: raise RuntimeError('Playwright browser plugin is not initialized.') return PlaywrightBrowserController( browser=await AsyncNewBrowser(self._playwright, headless=True), max_open_pages_per_browser=1, # Increase, if camoufox can handle it in your usecase. header_generator=None, # This turns off the crawlee header_generation. Camoufox has its own. ) # % endblock # % block instantiation crawler = PlaywrightCrawler( max_requests_per_crawl=10, request_handler=router, browser_pool=BrowserPool(plugins=[CamoufoxPlugin()]) ) # % endblock ================================================ FILE: src/crawlee/project_template/templates/main_playwright_chrome.py ================================================ # % extends 'main.py' # % block import from crawlee.crawlers import PlaywrightCrawler # % endblock # % block instantiation crawler = PlaywrightCrawler( request_handler=router, headless=True, max_requests_per_crawl=10, browser_type="chrome", {{ self.http_client_instantiation() }} ) # % endblock ================================================ FILE: src/crawlee/project_template/templates/main_playwright_firefox.py ================================================ # % extends 'main.py' # % block import from crawlee.crawlers import PlaywrightCrawler # % endblock # % block instantiation crawler = PlaywrightCrawler( request_handler=router, headless=True, max_requests_per_crawl=10, browser_type="firefox", {{ self.http_client_instantiation() }} ) # % endblock ================================================ FILE: src/crawlee/project_template/templates/main_playwright_webkit.py ================================================ # % extends 'main.py' # % block import from crawlee.crawlers import PlaywrightCrawler # % endblock # % block instantiation crawler = PlaywrightCrawler( request_handler=router, headless=True, max_requests_per_crawl=10, browser_type="webkit", {{ self.http_client_instantiation() }} ) # % 
endblock ================================================ FILE: src/crawlee/project_template/templates/routes_beautifulsoup.py ================================================ from crawlee.crawlers import BeautifulSoupCrawlingContext from crawlee.router import Router router = Router[BeautifulSoupCrawlingContext]() @router.default_handler async def default_handler(context: BeautifulSoupCrawlingContext) -> None: """Default request handler.""" context.log.info(f'Processing {context.request.url} ...') title = context.soup.find('title') await context.push_data( { 'url': context.request.loaded_url, 'title': title.text if title else None, } ) await context.enqueue_links() ================================================ FILE: src/crawlee/project_template/templates/routes_parsel.py ================================================ from crawlee.crawlers import ParselCrawlingContext from crawlee.router import Router router = Router[ParselCrawlingContext]() @router.default_handler async def default_handler(context: ParselCrawlingContext) -> None: """Default request handler.""" context.log.info(f'Processing {context.request.url} ...') title = context.selector.xpath('//title/text()').get() await context.push_data( { 'url': context.request.loaded_url, 'title': title, } ) await context.enqueue_links() ================================================ FILE: src/crawlee/project_template/templates/routes_playwright.py ================================================ from crawlee.crawlers import PlaywrightCrawlingContext from crawlee.router import Router router = Router[PlaywrightCrawlingContext]() @router.default_handler async def default_handler(context: PlaywrightCrawlingContext) -> None: """Default request handler.""" context.log.info(f'Processing {context.request.url} ...') title = await context.page.query_selector('title') await context.push_data( { 'url': context.request.loaded_url, 'title': await title.inner_text() if title else None, } ) await context.enqueue_links() 
================================================ FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore ================================================ .venv ================================================ FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile ================================================ # First, specify the base Docker image. # You can see the Docker images from Apify at https://hub.docker.com/r/apify/. # You can also use any other image from Docker Hub. # % if cookiecutter.crawler_type == 'playwright' FROM apify/actor-python-playwright:3.13 # % elif cookiecutter.crawler_type == 'playwright-camoufox' FROM apify/actor-python-playwright-camoufox:3.13 # % elif cookiecutter.crawler_type == 'playwright-chrome' FROM apify/actor-python-playwright-chrome:3.13 # % elif cookiecutter.crawler_type == 'playwright-firefox' FROM apify/actor-python-playwright-firefox:3.13 # % elif cookiecutter.crawler_type == 'playwright-webkit' FROM apify/actor-python-playwright-webkit:3.13 # % else FROM apify/actor-python:3.13 # % endif RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/* # % if cookiecutter.package_manager == 'poetry' RUN pip install -U pip setuptools \ && pip install 'poetry<3' \ && poetry self add 'poetry-plugin-export' # Second, copy just poetry.lock and pyproject.toml into the Actor image, # since those should be the only files that affect the dependency install in the next step, # in order to speed up the build COPY pyproject.toml poetry.lock ./ # Install the dependencies RUN echo "Python version:" \ && python --version \ && echo "Installing dependencies:" \ # Export packages from poetry.lock && poetry export -f requirements.txt --without-hashes | \ # Replace playwright version so that it matches whatever is pre-installed in the image (the `hash` checks if playwright is installed) sed "s/^playwright==\(.*\)/playwright==$(hash playwright 2>/dev/null && (playwright --version | cut -d ' ' -f
2) || echo '\1')/" | \ # Install everything using pip (ignore dependency checks - the lockfile is correct, period) pip install -r /dev/stdin --no-dependencies \ && echo "All installed Python packages:" \ && pip freeze # % elif cookiecutter.package_manager == 'uv' RUN pip install -U pip setuptools \ && pip install 'uv<1' ENV UV_PROJECT_ENVIRONMENT="/usr/local" COPY pyproject.toml uv.lock ./ RUN echo "Python version:" \ && python --version \ && echo "Installing dependencies:" \ # Check if playwright is already installed && PLAYWRIGHT_INSTALLED=$(pip freeze | grep -q playwright && echo "true" || echo "false") \ && if [ "$PLAYWRIGHT_INSTALLED" = "true" ]; then \ echo "Playwright already installed, excluding from uv sync" \ && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact --no-install-package playwright; \ else \ echo "Playwright not found, installing all dependencies" \ && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact; \ fi \ && echo "All installed Python packages:" \ && pip freeze # % elif cookiecutter.package_manager == 'pip' RUN pip install -U pip setuptools # Second, copy just requirements.txt into the Actor image, # since it should be the only file that affects the dependency install in the next step, # in order to speed up the build COPY requirements.txt ./ # Install the dependencies RUN echo "Python version:" \ && python --version \ && echo "Installing dependencies:" \ # Install everything using pip, set playwright version so that it matches whatever is pre-installed in the image && cat requirements.txt | \ # Replace playwright version so that it matches whatever is pre-installed in the image (the `hash` checks if playwright is installed) sed "s/^playwright==\(.*\)/playwright==$(hash playwright 2>/dev/null && (playwright --version | cut -d ' ' -f 2) || echo '\1')/" | \ # Install everything using pip pip install -r /dev/stdin \ && echo "All installed Python packages:" \ && pip freeze # % elif 
cookiecutter.package_manager == 'manual' # TODO install dependencies # % endif # Next, copy the remaining files and directories with the source code. # Since we do this after installing the dependencies, quick build will be really fast # for most source file changes. COPY . ./ # Use compileall to ensure the runnability of the Actor Python code. RUN python -m compileall -q . # % if cookiecutter.crawler_type == 'playwright-camoufox' # Fetch camoufox files that are always needed when using camoufox. RUN python -m camoufox fetch # % endif # Specify how to launch the source code of your Actor. CMD ["python", "-m", "{{ cookiecutter.__package_name }}"] ================================================ FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/README.md ================================================ # {{cookiecutter.project_name}} Project skeleton generated by Crawlee ({{ cookiecutter.crawler_type | capitalize }} template). ## Usage {% if cookiecutter.package_manager == 'poetry' -%} To get started, ensure you have [Poetry](https://python-poetry.org/), a package and dependency management system, installed on your machine. We recommend installing it with the following command: ```sh pipx install poetry ``` Next, install the project dependencies: ```sh poetry install ``` Finally, launch the crawler with: ```sh poetry run python -m {{cookiecutter.__package_name}} ``` {% elif cookiecutter.package_manager == 'pip' -%} To install dependencies, you can run the following command: ```sh python -m pip install . ``` When the dependencies are installed, you may launch the crawler with: ```sh python -m {{cookiecutter.__package_name}} ``` {% elif cookiecutter.package_manager == 'uv' -%} To get started, ensure you have [UV](https://docs.astral.sh/uv/), a package and dependency management system, installed on your machine.
We recommend installing it with the following command: ```sh pipx install uv ``` Next, install the project dependencies: ```sh uv sync ``` Finally, launch the crawler with: ```sh uv run python -m {{cookiecutter.__package_name}} ``` {% elif cookiecutter.package_manager == 'pip' -%} To install dependencies, you can run the following command: ```sh python -m pip install . ``` When the dependencies are installed, you may launch the crawler with: ```sh python -m {{cookiecutter.__package_name}} ``` {% elif cookiecutter.package_manager == 'manual' -%} You selected the manual dependency installation method, so you're on your own. There is a simple `requirements.txt` file to get you started. {% endif %} ================================================ FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml ================================================ # % if cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] # % else # % set extras = [cookiecutter.crawler_type] # % endif # % if cookiecutter.http_client == 'curl-impersonate' # % do extras.append('curl-impersonate') # % elif cookiecutter.http_client == 'httpx' # % do extras.append('httpx') # % endif [project] name = "{{cookiecutter.project_name}}" version = "0.0.1" description = "" authors = [ {name = "Your Name",email = "you@example.com"} ] readme = "README.md" requires-python = ">=3.10,<4.0" dependencies = [ "crawlee[{{ extras|join(',') }}]", # % if cookiecutter.crawler_type == 'playwright-camoufox' "camoufox[geoip]~=0.4.5", # % endif # % if cookiecutter.enable_apify_integration "apify", # % endif ] # % if cookiecutter.package_manager == 'poetry' [tool.poetry] package-mode = false [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] build-backend = "poetry.core.masonry.api" # % endif ================================================ FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt ================================================ #
% if cookiecutter.crawler_type == 'playwright-camoufox' camoufox[geoip]~=0.4.5 # % endif # % if cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] # % else # % set extras = [cookiecutter.crawler_type] # % endif # % if cookiecutter.enable_apify_integration apify # % endif # % if cookiecutter.http_client == 'curl-impersonate' # % do extras.append('curl-impersonate') # % endif # % if cookiecutter.http_client == 'httpx' # % do extras.append('httpx') # % endif crawlee[{{ extras | join(',') }}] ================================================ FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py ================================================ ================================================ FILE: src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py ================================================ import asyncio # % if cookiecutter.http_client == 'curl-impersonate' import platform # % if 'playwright' in cookiecutter.crawler_type import warnings # % endif # % endif {{ '' }} from .main import main if __name__ == '__main__': # % if cookiecutter.http_client == 'curl-impersonate' if platform.system() == 'Windows': # This mitigates a warning raised by curl-cffi. # % if 'playwright' in cookiecutter.crawler_type warnings.warn( message=('curl-cffi suggests using WindowsSelectorEventLoopPolicy, but this conflicts with Playwright. 
@dataclass
@docs_group('Other')
class ProxyInfo:
    """Provides information about a proxy connection that is used for requests."""

    url: str
    """The URL of the proxy."""

    scheme: str
    """The scheme of the proxy."""

    hostname: str
    """The hostname of the proxy."""

    port: int
    """The proxy port."""

    username: str = ''
    """The username for the proxy."""

    password: str = ''
    """The password for the proxy."""

    session_id: str | None = None
    """The identifier of the used proxy session, if used.

    Using the same session ID guarantees getting the same proxy URL."""

    proxy_tier: int | None = None
    """The tier of the proxy."""


@docs_group('Configuration')
class ProxyConfiguration:
    """Configures connection to a proxy server with the provided options.

    Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or
    blacklists. Setting proxy configuration in your crawlers automatically configures them to use the selected proxies
    for all connections. You can get information about the currently used proxy by inspecting the `ProxyInfo` object
    in your crawler's request handler. There, you can inspect the proxy's URL and other attributes.

    If you want to use your own proxies, pass them through the `proxy_urls` argument. Your list of proxy URLs will
    be rotated by the configuration if this option is provided.
    """

    def __init__(
        self,
        *,
        proxy_urls: list[str | None] | None = None,
        new_url_function: _NewUrlFunction | None = None,
        tiered_proxy_urls: list[list[str | None]] | None = None,
    ) -> None:
        """Initialize a new instance.

        Exactly one of `proxy_urls`, `tiered_proxy_urls` or `new_url_function` must be specified.

        Args:
            proxy_urls: A list of URLs of proxies that will be rotated in a round-robin fashion
            tiered_proxy_urls: A list of URL tiers (where a tier is a list of proxy URLs). Crawlers will automatically
                try to use the lowest tier (smallest index) where blocking does not happen. The proxy URLs in
                the selected tier will be rotated in a round-robin fashion.
            new_url_function: A function that returns a proxy URL for a given Request. This provides full control over
                the proxy selection mechanism.

        Raises:
            ValueError: If none or more than one of the three options is provided (an empty list counts as
                not provided).
        """
        self._next_custom_url_index = 0
        # Remembers which proxy URL was handed out for a given session ID.
        self._used_proxy_urls = dict[str, URL | None]()
        self._url_validator = TypeAdapter(AnyHttpUrl)

        # Validation - tiers are flattened so that a list of empty tiers counts as "not provided".
        if sum(map(bool, (proxy_urls, new_url_function, list(flatten(tiered_proxy_urls or []))))) != 1:
            raise ValueError(
                'Exactly one of `proxy_urls`, `tiered_proxy_urls` and `new_url_function` '
                'must be specified (and non-empty).'
            )

        self._proxy_urls = [self._create_url(url) for url in proxy_urls] if proxy_urls else []
        self._proxy_tier_tracker = (
            _ProxyTierTracker([[self._create_url(url) for url in tier] for tier in tiered_proxy_urls])
            if tiered_proxy_urls
            else None
        )
        self._new_url_function = new_url_function

    def _create_url(self, url: str | None) -> URL | None:
        """Create URL from input string. None means that intentionally no proxy should be used."""
        if url is None:
            return None

        # Validation only - the validated value is discarded and the raw string is parsed below.
        self._url_validator.validate_python(url)
        return URL(url)

    async def new_proxy_info(
        self,
        session_id: str | None = None,
        request: Request | None = None,
        proxy_tier: int | None = None,
    ) -> ProxyInfo | None:
        """Return a new ProxyInfo object based on the configured proxy rotation strategy.

        Args:
            session_id: Session identifier. If provided, same proxy URL will be returned for
                subsequent calls with this ID. Will be auto-generated for tiered proxies if
                not provided.
            request: Request object used for proxy rotation and tier selection. Required for
                tiered proxies to track retries and adjust tier accordingly.
            proxy_tier: Specific proxy tier to use. If not provided, will be automatically
                selected based on configuration.

        Returns:
            The proxy information, or `None` when the selected entry means "no proxy".

        Raises:
            ValueError: If the selected proxy URL has no host or port.
        """
        if self._proxy_tier_tracker is not None and session_id is None:
            session_id = crypto_random_object_id(6)

        url, proxy_tier = await self._pick_url(session_id, request, proxy_tier)

        if url is None:
            return None

        if url.port is None:
            raise ValueError(f'Port is None for URL: {url}')

        if url.host is None:
            raise ValueError(f'Host is None for URL: {url}')

        info = ProxyInfo(
            url=str(url),
            scheme=url.scheme,
            hostname=url.host,
            port=url.port,
            username=url.user or '',
            password=url.password or '',
        )

        if session_id is not None:
            info.session_id = session_id

        if proxy_tier is not None:
            info.proxy_tier = proxy_tier

        return info

    async def new_url(
        self, session_id: str | None = None, request: Request | None = None, proxy_tier: int | None = None
    ) -> str | None:
        """Return a proxy URL string based on the configured proxy rotation strategy.

        Args:
            session_id: Session identifier. If provided, same proxy URL will be returned for
                subsequent calls with this ID. Will be auto-generated for tiered proxies if
                not provided.
            request: Request object used for proxy rotation and tier selection. Required for
                tiered proxies to track retries and adjust tier accordingly.
            proxy_tier: Specific proxy tier to use. If not provided, will be automatically
                selected based on configuration.
        """
        proxy_info = await self.new_proxy_info(session_id, request, proxy_tier)
        return proxy_info.url if proxy_info else None

    def _take_next_url(self, urls: Sequence[URL | None]) -> URL | None:
        """Return the next URL from `urls` in round-robin order and advance the shared rotation index."""
        if not urls:
            # An empty tier passes the constructor validation; fail with a clear message
            # instead of a bare ZeroDivisionError from the modulo below.
            raise ValueError('There are no proxy URLs to choose from (is one of the proxy tiers empty?)')

        url = urls[self._next_custom_url_index % len(urls)]
        self._next_custom_url_index += 1
        return url

    async def _pick_url(
        self, session_id: str | None, request: Request | None, proxy_tier: int | None
    ) -> tuple[URL | None, int | None]:
        """Select the proxy URL (and the tier it came from, if tiers are used) for the given session and request."""
        if self._new_url_function:
            try:
                result = self._new_url_function(session_id, request)
                if inspect.isawaitable(result):
                    result = await result

                return URL(str(result)) if result is not None else None, None
            except Exception as e:
                raise ValueError('The provided "new_url_function" did not return a valid URL') from e

        if self._proxy_tier_tracker:
            if request is not None and proxy_tier is None:
                hostname = URL(request.url).host
                if hostname is None:
                    raise ValueError('The request URL does not have a hostname')

                # A tier recorded on the request means it is being retried - count that as an error of the tier.
                if request.last_proxy_tier is not None:
                    self._proxy_tier_tracker.add_error(hostname, request.last_proxy_tier)

                proxy_tier = self._proxy_tier_tracker.predict_tier(hostname)

                request.last_proxy_tier = proxy_tier
                request.forefront = True

            if proxy_tier is not None:
                urls = self._proxy_tier_tracker.get_tier_urls(proxy_tier)
            else:
                urls = self._proxy_tier_tracker.all_urls
        elif self._proxy_urls:
            urls = self._proxy_urls
        else:
            raise RuntimeError('Invalid state')

        if session_id is None:
            return self._take_next_url(urls), proxy_tier

        # Sessions are sticky: the first URL picked for a session ID is reused for all later calls.
        if session_id not in self._used_proxy_urls:
            self._used_proxy_urls[session_id] = self._take_next_url(urls)

        return self._used_proxy_urls[session_id], proxy_tier
class _ProxyTierTracker:
    """Keeps a per-domain error score for every proxy tier and derives the tier to use next from it."""

    def __init__(self, tiered_proxy_urls: list[list[URL | None]]) -> None:
        self._tiered_proxy_urls = tiered_proxy_urls
        # One error score per tier, created lazily for each domain.
        self._histogram_by_domain = defaultdict[str, list[int]](lambda: [0] * len(tiered_proxy_urls))
        # Every domain starts on the lowest tier.
        self._current_tier_by_domain = defaultdict[str, int](int)

    @property
    def all_urls(self) -> Sequence[URL | None]:
        """All proxy URLs of all tiers as a single flat list."""
        return list(flatten(self._tiered_proxy_urls))

    def get_tier_urls(self, tier_number: int) -> Sequence[URL | None]:
        """Proxy URLs that belong to the tier with the given index."""
        return self._tiered_proxy_urls[tier_number]

    def add_error(self, domain: str, tier: int) -> None:
        """Penalize `tier` for `domain` by adding 10 to its error score."""
        scores = self._histogram_by_domain[domain]
        scores[tier] += 10

    def predict_tier(self, domain: str) -> int:
        """Decay the scores of the unused tiers by one, then return the (possibly changed) tier for `domain`."""
        scores = self._histogram_by_domain[domain]
        tier = self._current_tier_by_domain[domain]

        # Tiers that are not in use slowly recover; the score never drops below zero.
        for other in range(len(scores)):
            if other != tier and scores[other] > 0:
                scores[other] -= 1

        # A missing neighbour is treated as infinitely bad, so it can never be selected.
        no_neighbour = float('inf')
        lower = scores[tier - 1] if tier > 0 else no_neighbour
        higher = scores[tier + 1] if tier < len(scores) - 1 else no_neighbour
        own = scores[tier]

        if own > min(lower, higher):
            # Move to the better neighbour; ties go to the lower tier.
            tier = tier - 1 if lower <= higher else tier + 1
        elif own == lower:
            # The lower tier is just as good - prefer it.
            tier -= 1

        self._current_tier_by_domain[domain] = tier
        return tier


class _NewUrlFunction(Protocol):
    """Signature of a user callback that returns a proxy URL (or `None`), synchronously or as an awaitable."""

    def __call__(
        self,
        session_id: str | None = None,
        request: Request | None = None,
    ) -> str | None | Awaitable[str | None]: ...
class RequestListState(BaseModel):
    """Progress information of a `RequestList`, stored through `RecoverableState`."""

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # Number of requests that were already handed out by `fetch_next_request`.
    next_index: Annotated[int, Field(alias='nextIndex')] = 0
    # Unique key of the request expected at `next_index`; used as a consistency check after a restart.
    next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
    # Unique keys of requests that were fetched but not yet marked as handled.
    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()


class RequestListData(BaseModel):
    """Snapshot of the requests of a `RequestList`, used when request data persistence is enabled."""

    requests: Annotated[list[Request], Field()]


@docs_group('Request loaders')
class RequestList(RequestLoader):
    """Represents a (potentially very large) list of URLs to crawl."""

    def __init__(
        self,
        requests: Iterable[str | Request] | AsyncIterable[str | Request] | None = None,
        name: str | None = None,
        persist_state_key: str | None = None,
        persist_requests_key: str | None = None,
    ) -> None:
        """Initialize a new instance.

        Args:
            requests: The request objects (or their string representations) to be added to the provider.
            name: A name of the request list.
            persist_state_key: A key for persisting the progress information of the RequestList.
                If you do not pass a key but pass a `name`, a key will be derived using the name.
                Otherwise, state will not be persisted.
            persist_requests_key: A key for persisting the request data loaded from the `requests` iterator.
                If specified, the request data will be stored in the KeyValueStore to make sure that they don't change
                over time. This is useful if the `requests` iterator pulls the data dynamically.
        """
        from crawlee._utils.recoverable_state import RecoverableState  # noqa: PLC0415

        self._name = name
        self._handled_count = 0
        self._assumed_total_count = 0

        # Look-ahead buffer: (request returned by the next fetch, the request after it).
        self._next: tuple[Request | None, Request | None] = (None, None)

        if persist_state_key is None and name is not None:
            persist_state_key = f'SDK_REQUEST_LIST_STATE-{name}'

        self._state = RecoverableState(
            default_state=RequestListState(),
            persistence_enabled=bool(persist_state_key),
            persist_state_key=persist_state_key or '',
            logger=logger,
        )

        self._persist_request_data = bool(persist_requests_key)

        self._requests_data = RecoverableState(
            default_state=RequestListData(requests=[]),
            # With request data persistence enabled, a snapshot of the requests will be done on initialization
            persistence_enabled='explicit_only' if self._persist_request_data else False,
            persist_state_key=persist_requests_key or '',
            logger=logger,
        )

        self._requests: AsyncIterator[str | Request]

        if isinstance(requests, AsyncIterable):
            self._requests = requests.__aiter__()  # ty: ignore[invalid-assignment]
        elif requests is None:
            self._requests = self._iterate_in_threadpool([])
        else:
            self._requests = self._iterate_in_threadpool(requests)

        # Created lazily in `_get_state` / `_ensure_next_request`.
        self._requests_lock: asyncio.Lock | None = None

    async def _get_state(self) -> RequestListState:
        """Return the progress state, initializing it (and aligning the request iterator with it) on first use.

        Raises:
            RuntimeError: If the request found at the persisted position does not have the persisted unique key.
        """
        # If state is already initialized, we are done
        if self._state.is_initialized:
            return self._state.current_value

        # Initialize recoverable state
        await self._state.initialize()
        await self._requests_data.initialize()

        # Initialize lock if necessary
        if self._requests_lock is None:
            self._requests_lock = asyncio.Lock()

        # If the RequestList is configured to persist request data, ensure that a copy of request data is used
        if self._persist_request_data:
            async with self._requests_lock:
                if not await self._requests_data.has_persisted_state():
                    self._requests_data.current_value.requests = [
                        request if isinstance(request, Request) else Request.from_url(request)
                        async for request in self._requests
                    ]
                    await self._requests_data.persist_state()

                self._requests = self._iterate_in_threadpool(
                    self._requests_data.current_value.requests[self._state.current_value.next_index :]
                )
        # If not using persistent request data, advance the request iterator
        else:
            async with self._requests_lock:
                for _ in range(self._state.current_value.next_index):
                    with contextlib.suppress(StopAsyncIteration):
                        await self._requests.__anext__()

        # Check consistency of the stored state and the request iterator
        if (unique_key_to_check := self._state.current_value.next_unique_key) is not None:
            await self._ensure_next_request()

            next_unique_key = self._next[0].unique_key if self._next[0] is not None else None
            if next_unique_key != unique_key_to_check:
                raise RuntimeError(
                    f"""Mismatch at index {
                        self._state.current_value.next_index
                    } in persisted requests - Expected unique key `{unique_key_to_check}`, got `{next_unique_key}`"""
                )

        return self._state.current_value

    @property
    def name(self) -> str | None:
        """The name of the request list passed to the constructor, if any."""
        return self._name

    @override
    async def get_handled_count(self) -> int:
        return self._handled_count

    @override
    async def get_total_count(self) -> int:
        # Only the number of requests fetched so far is known.
        return self._assumed_total_count

    @override
    async def is_empty(self) -> bool:
        await self._ensure_next_request()
        return self._next[0] is None

    @override
    async def is_finished(self) -> bool:
        state = await self._get_state()
        return len(state.in_progress) == 0 and await self.is_empty()

    @override
    async def fetch_next_request(self) -> Request | None:
        await self._get_state()
        await self._ensure_next_request()

        next_request = self._next[0]
        if next_request is None:
            return None

        state = await self._get_state()
        state.in_progress.add(next_request.unique_key)
        self._assumed_total_count += 1

        state.next_index += 1
        state.next_unique_key = self._next[1].unique_key if self._next[1] is not None else None

        # Shift the look-ahead buffer and refill the freed slot.
        self._next = (self._next[1], None)
        await self._ensure_next_request()

        return next_request

    @override
    async def mark_request_as_handled(self, request: Request) -> None:
        state = await self._get_state()
        # Remove first: an unknown request raises KeyError and must not be counted as handled.
        state.in_progress.remove(request.unique_key)
        self._handled_count += 1

    async def _ensure_next_request(self) -> None:
        """Fill the empty slots of the look-ahead buffer from the request iterator."""
        await self._get_state()

        if self._requests_lock is None:
            self._requests_lock = asyncio.Lock()

        async with self._requests_lock:
            if None in self._next:
                if self._next[0] is None:
                    to_enqueue = [item async for item in self._dequeue_requests(2)]
                    self._next = (to_enqueue[0], to_enqueue[1])
                else:
                    to_enqueue = [item async for item in self._dequeue_requests(1)]
                    self._next = (self._next[0], to_enqueue[0])

    async def _dequeue_requests(self, count: int) -> AsyncGenerator[Request | None]:
        """Yield exactly `count` items from the request iterator, padding with `None` once it is exhausted."""
        for _ in range(count):
            try:
                yield self._transform_request(await self._requests.__anext__())
            except StopAsyncIteration:  # noqa: PERF203
                yield None

    async def _iterate_in_threadpool(self, iterable: Iterable[str | Request]) -> AsyncIterator[str | Request]:
        """Inspired by a function of the same name from encode/starlette."""
        iterator = iter(iterable)

        class _StopIteration(Exception):  # noqa: N818
            pass

        def _next() -> str | Request:
            # We can't raise `StopIteration` from within the threadpool iterator
            # and catch it outside that context, so we coerce them into a different
            # exception type.
            try:
                return next(iterator)
            except StopIteration:
                raise _StopIteration  # noqa: B904

        try:
            while True:
                yield await asyncio.to_thread(_next)
        except _StopIteration:
            return
@docs_group('Request loaders')
class RequestLoader(ABC):
    """Abstract interface of a read-only source of crawling requests.

    Implementations hand out requests one at a time and keep track of what was already processed.

    Key responsibilities:
        - Fetching the next request to be processed.
        - Marking requests as successfully handled after processing.
        - Managing state information such as the total and handled request counts.
    """

    @abstractmethod
    async def get_handled_count(self) -> int:
        """Return how many requests of the loader have been handled so far."""

    @abstractmethod
    async def get_total_count(self) -> int:
        """Return an offline approximation of the total number of requests in the loader (pending + handled)."""

    @abstractmethod
    async def is_empty(self) -> bool:
        """Return True if the loader has no more requests to hand out (some may still be unfinished)."""

    @abstractmethod
    async def is_finished(self) -> bool:
        """Return True once every request has been handled."""

    @abstractmethod
    async def fetch_next_request(self) -> Request | None:
        """Return the next request to be processed, or `None` if there are no more pending requests.

        The method should return `None` if and only if `is_finished` would return `True`. In other cases, the method
        should wait until a request appears.
        """

    @abstractmethod
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Mark a request as handled, either after successful processing or after giving up on retries."""

    async def to_tandem(self, request_manager: RequestManager | None = None) -> RequestManagerTandem:
        """Pair this loader with a request manager so that requests can also be added and reclaimed.

        Args:
            request_manager: Request manager to combine the loader with. If None is given, the default
                request queue is used.
        """
        # Import here to avoid circular imports.
        from crawlee.request_loaders import RequestManagerTandem  # noqa: PLC0415
        from crawlee.storages import RequestQueue  # noqa: PLC0415

        manager = request_manager if request_manager is not None else await RequestQueue.open()
        return RequestManagerTandem(self, manager)

    def _transform_request(self, request: str | Request) -> Request:
        """Convert a URL string into a `Request`; `Request` instances are passed through untouched."""
        if isinstance(request, str):
            return Request.from_url(request)

        if not isinstance(request, Request):
            raise ValueError(f'Invalid request type: {type(request)}')

        return request

    def _transform_requests(self, requests: Sequence[str | Request]) -> list[Request]:
        """Convert request-like objects into `Request` objects, keeping the first one for each unique key."""
        unique: dict[str, Request] = {}

        for candidate in map(self._transform_request, requests):
            if candidate.unique_key not in unique:
                unique[candidate.unique_key] = candidate

        return [*unique.values()]
@docs_group('Request loaders')
class RequestManager(RequestLoader, ABC):
    """Base class that extends `RequestLoader` with the capability to enqueue new requests and reclaim failed ones."""

    @abstractmethod
    async def drop(self) -> None:
        """Remove persistent state either from the Apify Cloud storage or from the local database."""

    @abstractmethod
    async def add_request(
        self,
        request: str | Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Add a single request to the manager and store it in underlying resource client.

        Args:
            request: The request object (or its string representation) to be added to the manager.
            forefront: Determines whether the request should be added to the beginning (if True) or the end (if False)
                of the manager.

        Returns:
            Information about the request addition to the manager or None if the request was not added.
        """

    async def add_requests(
        self,
        requests: Sequence[str | Request],
        *,
        forefront: bool = False,
        batch_size: int = 1000,  # noqa: ARG002
        wait_time_between_batches: timedelta = timedelta(seconds=1),  # noqa: ARG002
        wait_for_all_requests_to_be_added: bool = False,  # noqa: ARG002
        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,  # noqa: ARG002
    ) -> None:
        """Add requests to the manager in batches.

        This default implementation ignores the batching and waiting options and simply awaits `add_request`
        for every item, one after another. Subclasses may override it with a real batched implementation.

        Args:
            requests: Requests to enqueue.
            forefront: If True, add requests to the beginning of the queue.
            batch_size: The number of requests to add in one batch.
            wait_time_between_batches: Time to wait between adding batches.
            wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.
            wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.
        """
        # Default and dumb implementation. The per-request results are not needed - the method returns nothing.
        for request in requests:
            await self.add_request(request, forefront=forefront)

    @abstractmethod
    async def reclaim_request(self, request: Request, *, forefront: bool = False) -> ProcessedRequest | None:
        """Reclaims a failed request back to the source, so that it can be returned for processing later again.

        It is possible to modify the request data by supplying an updated request as a parameter.
        """
@docs_group('Request loaders')
class RequestManagerTandem(RequestManager):
    """Implements a tandem behaviour for a pair of `RequestLoader` and `RequestManager`.

    In this scenario, the contents of the "loader" get transferred into the "manager", allowing processing the requests
    from both sources and also enqueueing new requests (not possible with plain `RequestLoader`).
    """

    def __init__(self, request_loader: RequestLoader, request_manager: RequestManager) -> None:
        """Initialize a new instance.

        Args:
            request_loader: The read-only source whose requests are moved into the manager one by one.
            request_manager: The manager that stores the transferred requests and all newly added ones.
        """
        self._read_only_loader = request_loader
        self._read_write_manager = request_manager

    @override
    async def get_handled_count(self) -> int:
        # Every request ends up in the manager before it is processed, so only the manager's count is used.
        return await self._read_write_manager.get_handled_count()

    @override
    async def get_total_count(self) -> int:
        return (await self._read_only_loader.get_total_count()) + (await self._read_write_manager.get_total_count())

    @override
    async def is_empty(self) -> bool:
        return (await self._read_only_loader.is_empty()) and (await self._read_write_manager.is_empty())

    @override
    async def is_finished(self) -> bool:
        return (await self._read_only_loader.is_finished()) and (await self._read_write_manager.is_finished())

    @override
    async def add_request(self, request: str | Request, *, forefront: bool = False) -> ProcessedRequest | None:
        return await self._read_write_manager.add_request(request, forefront=forefront)

    @override
    async def add_requests(
        self,
        requests: Sequence[str | Request],
        *,
        forefront: bool = False,
        batch_size: int = 1000,
        wait_time_between_batches: timedelta = timedelta(seconds=1),
        wait_for_all_requests_to_be_added: bool = False,
        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,
    ) -> None:
        return await self._read_write_manager.add_requests(
            requests,
            forefront=forefront,
            batch_size=batch_size,
            wait_time_between_batches=wait_time_between_batches,
            wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,
            wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout,
        )

    @override
    async def fetch_next_request(self) -> Request | None:
        """Move one request from the loader into the manager (if any is left) and fetch from the manager.

        Returns:
            The next request from the manager, or `None` if there is none or if transferring a request from
            the loader to the manager failed (that request is dropped).
        """
        if await self._read_only_loader.is_finished():
            return await self._read_write_manager.fetch_next_request()

        request = await self._read_only_loader.fetch_next_request()

        if not request:
            return await self._read_write_manager.fetch_next_request()

        try:
            await self._read_write_manager.add_request(request, forefront=True)
        except Exception:
            logger.exception(
                'Adding request from the RequestLoader to the RequestManager failed, the request has been dropped',
                extra={'url': request.url, 'unique_key': request.unique_key},
            )
            # The request is dropped for good. Release it in the loader as well, otherwise it would stay
            # "in progress" there and the loader (and therefore the tandem) could never become finished.
            await self._read_only_loader.mark_request_as_handled(request)
            return None

        await self._read_only_loader.mark_request_as_handled(request)

        return await self._read_write_manager.fetch_next_request()

    @override
    async def reclaim_request(self, request: Request, *, forefront: bool = False) -> None:
        await self._read_write_manager.reclaim_request(request, forefront=forefront)

    @override
    async def mark_request_as_handled(self, request: Request) -> None:
        await self._read_write_manager.mark_request_as_handled(request)

    @override
    async def drop(self) -> None:
        await self._read_write_manager.drop()
The current sitemap is stored in `in_progress_sitemap_url`. The `parse_sitemap` function parses
    the sitemap and returns elements as an async iterator. Each element retrieved from the iterator is processed
    based on its type. If the element is a `NestedSitemap`, its URL is added to `pending_sitemap_urls` if it hasn't
    been processed yet (not in `processed_sitemap_urls`). If the element is a `SitemapUrl`, the system checks whether
    it already exists in `current_sitemap_processed_urls`. If it exists, the loader was restarted from a saved state
    and the URL is skipped.

    If the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count`
    is incremented by 1. When all elements from the current sitemap iterator have been processed,
    `in_progress_sitemap_url` is set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and
    `current_sitemap_processed_urls` is cleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping
    any URLs that already exist in `processed_sitemap_urls`. If `pending_sitemap_urls` is empty, `completed` is set
    to `True`.

    When `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.
    When `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and
    `handled_count` is incremented by 1.

    During initial startup or restart after persistence, state validation occurs in `_get_state`. If both
    `pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a fresh
    start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the system is restarting
    from a persisted state. If `in_progress` contains any URLs, they are moved back to `url_queue` and `in_progress`
    is cleared.
    """

    # Fields may be populated either by their Python name or by their camelCase alias.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    # Required field (no default): the loader passes an empty deque when it builds the default state.
    url_queue: Annotated[deque[str], Field(alias='urlQueue')]
    """Queue of URLs extracted from sitemaps and ready for processing."""

    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()
    """Set of request URLs currently being processed."""

    # Required field (no default), same as `url_queue`.
    pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')]
    """Queue of sitemap URLs that need to be fetched and processed."""

    in_progress_sitemap_url: Annotated[str | None, Field(alias='inProgressSitemapUrl')] = None
    """The sitemap URL currently being processed."""

    current_sitemap_processed_urls: Annotated[set[str], Field(alias='currentSitemapProcessedUrls')] = set()
    """URLs from the current sitemap that have been added to the queue."""

    processed_sitemap_urls: Annotated[set[str], Field(alias='processedSitemapUrls')] = set()
    """Set of processed sitemap URLs."""

    # NOTE: the alias is 'sitemapCompleted', which does not mirror the field name like the other aliases do.
    completed: Annotated[bool, Field(alias='sitemapCompleted')] = False
    """Whether all sitemaps have been fully processed."""

    total_count: Annotated[int, Field(alias='totalCount')] = 0
    """Total number of URLs found and added to the queue from all processed sitemaps."""

    handled_count: Annotated[int, Field(alias='handledCount')] = 0
    """Number of URLs that have been successfully handled."""


@docs_group('Request loaders')
class SitemapRequestLoader(RequestLoader):
    """A request loader that reads URLs from sitemap(s).

    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
    and the `enqueue_links` functionality.

    The loader fetches and parses sitemaps in the background, allowing crawling to start
    before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
The loader supports state persistence, allowing it to resume from where it left off after interruption when a `persist_state_key` is provided during initialization. """ def __init__( self, sitemap_urls: list[str], http_client: HttpClient, *, proxy_info: ProxyInfo | None = None, include: list[re.Pattern[Any] | Glob] | None = None, exclude: list[re.Pattern[Any] | Glob] | None = None, max_buffer_size: int = 200, persist_state_key: str | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, ) -> None: """Initialize the sitemap request loader. Args: sitemap_urls: Configuration options for the loader. proxy_info: Optional proxy to use for fetching sitemaps. include: List of glob or regex patterns to include URLs. exclude: List of glob or regex patterns to exclude URLs. max_buffer_size: Maximum number of URLs to buffer in memory. http_client: the instance of `HttpClient` to use for fetching sitemaps. persist_state_key: A key for persisting the loader's state in the KeyValueStore. When provided, allows resuming from where it left off after interruption. If None, no state persistence occurs. transform_request_function: An optional function to transform requests generated by the loader. It receives `RequestOptions` with `url` and should return either modified `RequestOptions` or a `RequestTransformAction`. 
""" self._http_client = http_client self._sitemap_urls = sitemap_urls self._include = include self._exclude = exclude self._proxy_info = proxy_info self._max_buffer_size = max_buffer_size self._transform_request_function = transform_request_function # Synchronization for queue operations self._queue_has_capacity = asyncio.Event() self._queue_has_capacity.set() self._queue_lock = asyncio.Lock() # Initialize recoverable state self._state = RecoverableState( default_state=SitemapRequestLoaderState( url_queue=deque(), pending_sitemap_urls=deque(), ), persistence_enabled=bool(persist_state_key), persist_state_key=persist_state_key or '', logger=logger, ) # Start background loading self._loading_task = asyncio.create_task(self._load_sitemaps()) async def _get_state(self) -> SitemapRequestLoaderState: """Initialize and return the current state.""" async with self._queue_lock: if self._state.is_initialized: return self._state.current_value await self._state.initialize() # Initialize pending sitemaps on first run has_sitemap_for_processing = ( self._state.current_value.pending_sitemap_urls or self._state.current_value.in_progress_sitemap_url ) if not has_sitemap_for_processing and not self._state.current_value.completed: self._state.current_value.pending_sitemap_urls.extend(self._sitemap_urls) if self._state.current_value.in_progress: self._state.current_value.url_queue.extendleft(self._state.current_value.in_progress) self._state.current_value.in_progress.clear() if ( self._state.current_value.url_queue and len(self._state.current_value.url_queue) >= self._max_buffer_size ): # Notify that the queue is full self._queue_has_capacity.clear() return self._state.current_value def _check_url_patterns( self, target_url: str, include: Sequence[re.Pattern[Any] | Glob] | None, exclude: Sequence[re.Pattern[Any] | Glob] | None, ) -> bool: """Check if a URL matches configured include/exclude patterns.""" # If the URL matches any `exclude` pattern, reject it for pattern in exclude or 
(): if isinstance(pattern, Glob): pattern = pattern.regexp # noqa: PLW2901 if pattern.match(target_url) is not None: return False # If there are no `include` patterns and the URL passed all `exclude` patterns, accept the URL if include is None: return True # If the URL matches any `include` pattern, accept it for pattern in include: if isinstance(pattern, Glob): pattern = pattern.regexp # noqa: PLW2901 if pattern.match(target_url) is not None: return True # The URL does not match any `include` pattern - reject it return False async def _load_sitemaps(self) -> None: """Load URLs from sitemaps in the background.""" try: # Get actual state while (state := await self._get_state()) and (state.pending_sitemap_urls or state.in_progress_sitemap_url): # Get sitemap URL for parsing sitemap_url = state.in_progress_sitemap_url if not sitemap_url: sitemap_url = state.pending_sitemap_urls.popleft() # Skip processed urls if sitemap_url in state.processed_sitemap_urls: continue state.in_progress_sitemap_url = sitemap_url parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3) async for item in parse_sitemap( [SitemapSource(type='url', url=sitemap_url)], self._http_client, proxy_info=self._proxy_info, options=parse_options, ): if isinstance(item, NestedSitemap): # Add nested sitemap to queue if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls: state.pending_sitemap_urls.append(item.loc) continue if isinstance(item, SitemapUrl): url = item.loc state = await self._get_state() # Skip if already processed if url in state.current_sitemap_processed_urls: continue # Check if URL should be included if not self._check_url_patterns(url, self._include, self._exclude): continue # Check if we have capacity in the queue await self._queue_has_capacity.wait() state = await self._get_state() async with self._queue_lock: state.url_queue.append(url) state.current_sitemap_processed_urls.add(url) state.total_count += 1 if 
len(state.url_queue) >= self._max_buffer_size: # Notify that the queue is full self._queue_has_capacity.clear() # Clear current sitemap after processing state = await self._get_state() current_sitemap_url = state.in_progress_sitemap_url state.in_progress_sitemap_url = None if current_sitemap_url: state.processed_sitemap_urls.add(current_sitemap_url) state.current_sitemap_processed_urls.clear() # Mark as completed after processing all sitemap urls state.completed = True except Exception: logger.exception('Error loading sitemaps') raise @override async def get_total_count(self) -> int: """Return the total number of URLs found so far.""" state = await self._get_state() return state.total_count @override async def get_handled_count(self) -> int: """Return the number of URLs that have been handled.""" state = await self._get_state() return state.handled_count @override async def is_empty(self) -> bool: """Check if there are no more URLs to process.""" state = await self._get_state() return not state.url_queue @override async def is_finished(self) -> bool: """Check if all URLs have been processed.""" state = await self._get_state() return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done() @override async def fetch_next_request(self) -> Request | None: """Fetch the next request to process.""" while not (await self.is_finished()): state = await self._get_state() if not state.url_queue: await asyncio.sleep(0.1) continue async with self._queue_lock: url = state.url_queue.popleft() request_option = RequestOptions(url=url) if self._transform_request_function: transform_request_option = self._transform_request_function(request_option) if transform_request_option == 'skip': state.total_count -= 1 continue if transform_request_option != 'unchanged': request_option = transform_request_option request = Request.from_url(**request_option) state.in_progress.add(request.url) if len(state.url_queue) < self._max_buffer_size: self._queue_has_capacity.set() 
return request return None @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: """Mark a request as successfully handled.""" state = await self._get_state() if request.url in state.in_progress: state.in_progress.remove(request.url) state.handled_count += 1 return None async def abort_loading(self) -> None: """Abort the sitemap loading process.""" if self._loading_task and not self._loading_task.done(): self._loading_task.cancel() with suppress(asyncio.CancelledError): await self._loading_task async def start(self) -> None: """Start the sitemap loading process.""" if self._loading_task and not self._loading_task.done(): return self._loading_task = asyncio.create_task(self._load_sitemaps()) async def close(self) -> None: """Close the request loader.""" await self.abort_loading() await self._state.teardown() async def __aenter__(self) -> SitemapRequestLoader: """Enter the context manager.""" await self.start() return self async def __aexit__( self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None ) -> None: """Exit the context manager.""" await self.close() ================================================ FILE: src/crawlee/router.py ================================================ from __future__ import annotations import asyncio from collections.abc import Awaitable, Callable from typing import Generic, TypeVar from crawlee._request import RequestState from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group __all__ = ['Router'] from crawlee.errors import UserHandlerTimeoutError TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext) RequestHandler = Callable[[TCrawlingContext], Awaitable[None]] @docs_group('Other') class Router(Generic[TCrawlingContext]): """A request dispatching system that routes requests to registered handlers based on their labels. 
The `Router` allows you to define and register request handlers for specific labels. When a request is received,
    the router invokes the corresponding `request_handler` based on the request's `label`. If no matching handler
    is found, the default handler is used.

    ### Usage

    ```python
    from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
    from crawlee.router import Router

    router = Router[HttpCrawlingContext]()


    # Handler for requests without a matching label handler
    @router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Request without label {context.request.url} ...')


    # Handler for category requests
    @router.handler(label='category')
    async def category_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Category request {context.request.url} ...')


    # Handler for product requests
    @router.handler(label='product')
    async def product_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Product {context.request.url} ...')


    async def main() -> None:
        crawler = HttpCrawler(request_handler=router)
        await crawler.run()
    ```
    """

    def __init__(self) -> None:
        # No default handler until `default_handler` registers one.
        self._default_handler: RequestHandler[TCrawlingContext] | None = None
        # Maps a request label to the handler registered for exactly that label.
        self._handlers_by_label = dict[str, RequestHandler[TCrawlingContext]]()

    def default_handler(self: Router, handler: RequestHandler[TCrawlingContext]) -> RequestHandler[TCrawlingContext]:
        """Register a default request handler.

        The default request handler is invoked for requests that have either no label or a label for which we have
        no matching handler.

        Args:
            handler: The coroutine function to use as the default handler.

        Returns:
            The same `handler`, unchanged, so the method can be used as a decorator.

        Raises:
            RuntimeError: If a default handler has already been registered.
        """
        if self._default_handler is not None:
            raise RuntimeError('A default handler is already configured')

        self._default_handler = handler

        return handler

    def handler(
        self,
        label: str,
    ) -> Callable[[RequestHandler[TCrawlingContext]], Callable[[TCrawlingContext], Awaitable]]:
        """Register a request handler based on a label.

        This decorator registers a request handler for a specific label.
The handler will be invoked only for requests that have the exact same label. """ if label in self._handlers_by_label: raise RuntimeError(f'A handler for label `{label}` is already registered') def wrapper(handler: Callable[[TCrawlingContext], Awaitable]) -> Callable[[TCrawlingContext], Awaitable]: self._handlers_by_label[label] = handler return handler return wrapper async def __call__(self, context: TCrawlingContext) -> None: """Invoke a request handler that matches the request label (or the default).""" context.request.state = RequestState.REQUEST_HANDLER if context.request.label is None or context.request.label not in self._handlers_by_label: if self._default_handler is None: raise RuntimeError( f'No handler matches label `{context.request.label}` and no default handler is configured' ) user_defined_handler = self._default_handler else: user_defined_handler = self._handlers_by_label[context.request.label] try: return await user_defined_handler(context) except asyncio.TimeoutError as e: # Timeout in handler, but not timeout of handler. 
raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e ================================================ FILE: src/crawlee/sessions/__init__.py ================================================ from ._cookies import CookieParam, SessionCookies from ._session import Session from ._session_pool import SessionPool __all__ = ['CookieParam', 'Session', 'SessionCookies', 'SessionPool'] ================================================ FILE: src/crawlee/sessions/_cookies.py ================================================ from __future__ import annotations from copy import deepcopy from http.cookiejar import Cookie, CookieJar from typing import TYPE_CHECKING, Any, Literal from typing_extensions import NotRequired, Required, TypedDict from crawlee._utils.docs import docs_group if TYPE_CHECKING: from collections.abc import Iterator from typing import TypeGuard @docs_group('Session management') class CookieParam(TypedDict, total=False): """Dictionary representation of cookies for `SessionCookies.set` method.""" name: Required[str] """Cookie name.""" value: Required[str] """Cookie value.""" domain: NotRequired[str] """Domain for which the cookie is set.""" path: NotRequired[str] """Path on the specified domain for which the cookie is set.""" secure: NotRequired[bool] """Set the `Secure` flag for the cookie.""" http_only: NotRequired[bool] """Set the `HttpOnly` flag for the cookie.""" expires: NotRequired[int] """Expiration date for the cookie, None for a session cookie.""" same_site: NotRequired[Literal['Lax', 'None', 'Strict']] """Set the `SameSite` attribute for the cookie.""" class PlaywrightCookieParam(TypedDict, total=False): """Cookie parameters in Playwright format with camelCase naming.""" name: NotRequired[str] value: NotRequired[str] domain: NotRequired[str] path: NotRequired[str] secure: NotRequired[bool] httpOnly: NotRequired[bool] expires: NotRequired[float] sameSite: NotRequired[Literal['Lax', 'None', 'Strict']] partitionKey: NotRequired[str 
| None] @docs_group('Session management') class SessionCookies: """Storage cookies for session with browser-compatible serialization and deserialization.""" def __init__(self, cookies: SessionCookies | CookieJar | dict[str, str] | list[CookieParam] | None = None) -> None: if isinstance(cookies, CookieJar): self._jar = cookies return self._jar = CookieJar() if isinstance(cookies, list): for item in cookies: self.set(**item) elif isinstance(cookies, SessionCookies): for cookie in cookies.jar: self._jar.set_cookie(cookie) elif isinstance(cookies, dict): cookies_dict: dict[str, str] = cookies for key, value in cookies_dict.items(): self.set(key, value) @property def jar(self) -> CookieJar: """The cookie jar instance.""" return self._jar def set( self, name: str, value: str, *, domain: str = '', path: str = '/', expires: int | None = None, http_only: bool = False, secure: bool = False, same_site: Literal['Lax', 'None', 'Strict'] | None = None, **_kwargs: Any, # Unknown parameters will be ignored. ) -> None: """Create and store a cookie with modern browser attributes. Args: name: Cookie name. value: Cookie value. domain: Cookie domain. path: Cookie path. expires: Cookie expiration timestamp. http_only: Whether cookie is HTTP-only. secure: Whether cookie requires secure context. same_site: SameSite cookie attribute value. """ cookie = Cookie( version=0, name=name, value=value, port=None, port_specified=False, domain=domain, domain_specified=bool(domain), domain_initial_dot=domain.startswith('.'), path=path, path_specified=bool(path), secure=secure, expires=expires, discard=True, comment=None, comment_url=None, rest={'HttpOnly': ''} if http_only else {}, rfc2109=False, ) if same_site: cookie.set_nonstandard_attr('SameSite', same_site) self.jar.set_cookie(cookie) def _convert_cookie_to_dict(self, cookie: Cookie) -> CookieParam: """Convert `http.cookiejar.Cookie` to dictionary format. Args: cookie: Cookie object to convert. 
""" cookie_dict = CookieParam( name=cookie.name, value=cookie.value or '', domain=cookie.domain, path=cookie.path, secure=cookie.secure, http_only=cookie.has_nonstandard_attr('HttpOnly'), ) if cookie.expires: cookie_dict['expires'] = cookie.expires if (same_site := cookie.get_nonstandard_attr('SameSite')) and self._is_valid_same_site(same_site): cookie_dict['same_site'] = same_site return cookie_dict def _to_playwright(self, cookie_dict: CookieParam) -> PlaywrightCookieParam: """Convert internal cookie to Playwright format.""" result: dict = dict(cookie_dict) if 'http_only' in result: result['httpOnly'] = result.pop('http_only') if 'same_site' in result: result['sameSite'] = result.pop('same_site') if 'expires' in result: result['expires'] = float(result['expires']) return PlaywrightCookieParam(**result) def _from_playwright(self, cookie_dict: PlaywrightCookieParam) -> CookieParam: """Convert Playwright cookie to internal format.""" result: dict = dict(cookie_dict) if 'httpOnly' in result: result['http_only'] = result.pop('httpOnly') if 'sameSite' in result: result['same_site'] = result.pop('sameSite') if 'expires' in result: expires = int(result['expires']) result['expires'] = None if expires == -1 else expires return CookieParam(name=result.pop('name', ''), value=result.pop('value', ''), **result) def get_cookies_as_dicts(self) -> list[CookieParam]: """Convert cookies to a list with `CookieParam` dicts.""" return [self._convert_cookie_to_dict(cookie) for cookie in self.jar] def store_cookie(self, cookie: Cookie) -> None: """Store a Cookie object in the session cookie jar. Args: cookie: The Cookie object to store in the jar. """ self.jar.set_cookie(cookie) def store_cookies(self, cookies: list[Cookie]) -> None: """Store multiple cookie objects in the session cookie jar. Args: cookies: A list of cookie objects to store in the jar. 
""" for cookie in cookies: self.store_cookie(cookie) self._jar.clear_expired_cookies() def set_cookies(self, cookie_dicts: list[CookieParam]) -> None: """Create and store cookies from their dictionary representations. Args: cookie_dicts: List of dictionaries where each dict represents cookie parameters. """ for cookie_dict in cookie_dicts: self.set(**cookie_dict) self._jar.clear_expired_cookies() def get_cookies_as_playwright_format(self) -> list[PlaywrightCookieParam]: """Get cookies in playwright format.""" return [self._to_playwright(cookie) for cookie in self.get_cookies_as_dicts()] def set_cookies_from_playwright_format(self, pw_cookies: list[PlaywrightCookieParam]) -> None: """Set cookies from playwright format.""" for pw_cookie in pw_cookies: cookie_param = self._from_playwright(pw_cookie) self.set(**cookie_param) self._jar.clear_expired_cookies() def __deepcopy__(self, memo: dict[int, Any] | None) -> SessionCookies: # This is necessary because `CookieJar` use `RLock`, which prevents `deepcopy`. 
cookie_dicts = self.get_cookies_as_dicts() return self.__class__(deepcopy(cookie_dicts, memo)) def __len__(self) -> int: return len(self._jar) def __setitem__(self, name: str, value: str) -> None: self.set(name, value) def __getitem__(self, name: str) -> str | None: for cookie in self._jar: if cookie.name == name: return cookie.value raise KeyError(f"Cookie '{name}' not found") def __iter__(self) -> Iterator[CookieParam]: return (self._convert_cookie_to_dict(cookie) for cookie in self._jar) def __repr__(self) -> str: cookies_str: str = ', '.join( [f'<Cookie {cookie.name}={cookie.value} for {cookie.domain}{cookie.path}>' for cookie in self._jar] ) return f'<SessionCookies[{cookies_str}]>' def __bool__(self) -> bool: for _ in self._jar: return True return False def __eq__(self, other: object) -> bool: if not isinstance(other, SessionCookies): return NotImplemented if len(self) != len(other): return False self_keys = {(cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar} other_keys = {(cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in other.jar} return self_keys == other_keys def __hash__(self) -> int: """Return hash based on the cookies key attributes.""" cookie_tuples = frozenset((cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar) return hash(cookie_tuples) def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal['Lax', 'None', 'Strict']]: return value in {'Lax', 'None', 'Strict'} ================================================ FILE: src/crawlee/sessions/_models.py ================================================ from __future__ import annotations from datetime import datetime, timedelta from typing import Annotated, Any from pydantic import ( BaseModel, BeforeValidator, ConfigDict, Field, GetPydanticSchema, PlainSerializer, computed_field, ) from ._cookies import CookieParam from ._session import Session class SessionModel(BaseModel): """Model for a Session object.""" 
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) id: Annotated[str, Field(alias='id')] max_age: Annotated[timedelta, Field(alias='maxAge')] user_data: Annotated[dict, Field(alias='userData')] max_error_score: Annotated[float, Field(alias='maxErrorScore')] error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')] created_at: Annotated[datetime, Field(alias='createdAt')] usage_count: Annotated[int, Field(alias='usageCount')] max_usage_count: Annotated[int, Field(alias='maxUsageCount')] error_score: Annotated[float, Field(alias='errorScore')] cookies: Annotated[list[CookieParam], Field(alias='cookies')] blocked_status_codes: Annotated[list[int], Field(alias='blockedStatusCodes')] class SessionPoolModel(BaseModel): """Model for a SessionPool object.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) max_pool_size: Annotated[int, Field(alias='maxPoolSize')] sessions: Annotated[ dict[ str, Annotated[ Session, GetPydanticSchema(lambda _, handler: handler(Any)) ], # handler(Any) is fine - we validate manually in the BeforeValidator ], Field(alias='sessions'), PlainSerializer( lambda value: [session.get_state().model_dump(by_alias=True) for session in value.values()], return_type=list, ), BeforeValidator( lambda value: { session.id: session for item in value if (session := Session.from_model(SessionModel.model_validate(item, by_alias=True))) } ), ] @computed_field(alias='sessionCount') @property def session_count(self) -> int: """Get the total number of sessions currently maintained in the pool.""" return len(self.sessions) @computed_field(alias='usableSessionCount') @property def usable_session_count(self) -> int: """Get the number of sessions that are currently usable.""" return len([session for _, session in self.sessions.items() if session.is_usable]) @computed_field(alias='retiredSessionCount') @property def retired_session_count(self) -> int: """Get the number of sessions that are no longer usable.""" 
return self.session_count - self.usable_session_count ================================================ FILE: src/crawlee/sessions/_session.py ================================================ # Inspiration: https://github.com/apify/crawlee/blob/v3.9.0/packages/core/src/session_pool/session.ts from __future__ import annotations from datetime import datetime, timedelta, timezone from logging import getLogger from typing import TYPE_CHECKING, ClassVar, Literal, overload from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.docs import docs_group from crawlee.sessions._cookies import CookieParam, SessionCookies if TYPE_CHECKING: from http.cookiejar import CookieJar from crawlee.sessions._models import SessionModel logger = getLogger(__name__) @docs_group('Session management') class Session: """Represent a single user session, managing cookies, error states, and usage limits. A `Session` simulates a specific user with attributes like cookies, IP (via proxy), and potentially a unique browser fingerprint. It maintains its internal state, which can include custom user data (e.g., authorization tokens or headers) and tracks its usability through metrics such as error score, usage count, and expiration. """ _DEFAULT_BLOCKED_STATUS_CODES: ClassVar = [401, 403, 429] """Default status codes that indicate a session is blocked.""" def __init__( self, *, id: str | None = None, max_age: timedelta = timedelta(minutes=50), user_data: dict | None = None, max_error_score: float = 3.0, error_score_decrement: float = 0.5, created_at: datetime | None = None, usage_count: int = 0, max_usage_count: int = 50, error_score: float = 0.0, cookies: SessionCookies | CookieJar | dict[str, str] | list[CookieParam] | None = None, blocked_status_codes: list | None = None, ) -> None: """Initialize a new instance. Args: id: Unique identifier for the session, autogenerated if not provided. max_age: Time duration after which the session expires. 
user_data: Custom user data associated with the session. max_error_score: Threshold score beyond which the session is considered blocked. error_score_decrement: Value by which the error score is decremented on successful operations. created_at: Timestamp when the session was created, defaults to current UTC time if not provided. usage_count: Number of times the session has been used. max_usage_count: Maximum allowable uses of the session before it is considered expired. error_score: Current error score of the session. cookies: Cookies associated with the session. blocked_status_codes: HTTP status codes that indicate a session should be blocked. """ self._id = id or crypto_random_object_id(length=10) self._max_age = max_age self._user_data = user_data or {} self._max_error_score = max_error_score self._error_score_decrement = error_score_decrement self._created_at = created_at or datetime.now(timezone.utc) self._usage_count = usage_count self._max_usage_count = max_usage_count self._error_score = error_score self._cookies = SessionCookies(cookies) or SessionCookies() self._blocked_status_codes = set(blocked_status_codes or self._DEFAULT_BLOCKED_STATUS_CODES) @classmethod def from_model(cls, model: SessionModel) -> Session: """Initialize a new instance from a `SessionModel`.""" cookies = SessionCookies(model.cookies) return cls(**model.model_dump(exclude={'cookies'}), cookies=cookies) def __repr__(self) -> str: """Get a string representation.""" return f'<{self.__class__.__name__} {self.get_state(as_dict=False)}>' def __eq__(self, other: object) -> bool: """Compare two sessions for equality.""" if not isinstance(other, Session): return NotImplemented return self.get_state(as_dict=True) == other.get_state(as_dict=True) def __hash__(self) -> int: """Return hash based on the session state.""" state = self.get_state(as_dict=True) hashable_items = list[tuple[str, int]]() # Convert dict to tuple of sorted items for consistent hashing. 
Exclude non-hashable values like cookies # and convert them to their string representation. for key, value in sorted(state.items()): if key == 'cookies': # Use hash of the cookies object if it has __hash__ method. hashable_items.append((key, hash(self._cookies))) elif isinstance(value, (list, dict)): # Convert collections to tuples for hashing. if isinstance(value, list): hashable_items.append((key, hash(tuple(value)))) else: hashable_items.append((key, hash(tuple(sorted(value.items()))))) else: hashable_items.append((key, hash(value))) return hash(tuple(hashable_items)) @property def id(self) -> str: """Get the session ID.""" return self._id @property def user_data(self) -> dict: """Get the user data.""" return self._user_data @property def cookies(self) -> SessionCookies: """Get the cookies.""" return self._cookies @property def error_score(self) -> float: """Get the current error score.""" return self._error_score @property def usage_count(self) -> float: """Get the current usage count.""" return self._usage_count @property def expires_at(self) -> datetime: """Get the expiration datetime of the session.""" return self._created_at + self._max_age @property def is_blocked(self) -> bool: """Indicate whether the session is blocked based on the error score..""" return self._error_score >= self._max_error_score @property def is_expired(self) -> bool: """Indicate whether the session is expired based on the current time.""" return datetime.now(timezone.utc) >= self.expires_at @property def is_max_usage_count_reached(self) -> bool: """Indicate whether the session has reached its maximum usage limit.""" return self._usage_count >= self._max_usage_count @property def is_usable(self) -> bool: """Determine if the session is usable for next requests.""" return not (self.is_blocked or self.is_expired or self.is_max_usage_count_reached) @overload def get_state(self, *, as_dict: Literal[True]) -> dict: ... 
def mark_good(self) -> None:
    """Record a successful use of the session.

    The usage counter grows by one and a positive error score is lowered by the configured decrement,
    never dropping below zero. If the session is no longer usable afterwards, it is retired.
    """
    self._usage_count += 1

    if self._error_score > 0:
        lowered_score = self._error_score - self._error_score_decrement
        # Clamp at zero so a large decrement cannot produce a negative score.
        self._error_score = lowered_score if lowered_score > 0 else 0

    # Retire the session once it stops being usable.
    if self.is_usable:
        return
    self.retire()
def is_blocked_status_code(
    self,
    *,
    status_code: int,
    ignore_http_error_status_codes: set[int] | None = None,
) -> bool:
    """Tell whether the given HTTP status code marks the session as blocked.

    Args:
        status_code: The HTTP status code received from a server response.
        ignore_http_error_status_codes: Optional status codes that are removed from the session's
            `blocked_status_codes` before the check is made.

    Returns:
        True if the status code is among the remaining blocked codes, False otherwise.
    """
    suppressed_codes = ignore_http_error_status_codes if ignore_http_error_status_codes else set()
    effective_blocked_codes = self._blocked_status_codes - suppressed_codes
    return status_code in effective_blocked_codes
def __init__(
    self,
    *,
    max_pool_size: int = 1000,
    create_session_settings: dict | None = None,
    create_session_function: CreateSessionFunctionType | None = None,
    event_manager: EventManager | None = None,
    persistence_enabled: bool = False,
    persist_state_kvs_name: str | None = None,
    persist_state_key: str = 'CRAWLEE_SESSION_POOL_STATE',
) -> None:
    """Initialize a new instance.

    Args:
        max_pool_size: Maximum number of sessions to maintain in the pool. You can add more sessions to the pool
            by using the `add_session` method.
        create_session_settings: Settings for creating new session instances. If None, default settings will
            be used. Do not set it if you are providing a `create_session_function`.
        create_session_function: A callable to create new session instances. If None, a default session
            settings will be used. Do not set it if you are providing `create_session_settings`.
        event_manager: The event manager to handle events like persist state.
        persistence_enabled: Flag to enable or disable state persistence of the pool.
        persist_state_kvs_name: The name of the `KeyValueStore` used for state persistence.
        persist_state_key: The key under which the session pool's state is stored in the `KeyValueStore`.

    Raises:
        ValueError: If both `create_session_settings` and `create_session_function` are provided.
    """
    session_settings = create_session_settings or {}

    # Reject conflicting arguments before anything else happens, so a failed construction neither
    # replaces the event manager in the service locator nor builds a recoverable state.
    if create_session_function and session_settings:
        raise ValueError('Both `create_session_settings` and `create_session_function` cannot be provided.')

    if event_manager:
        service_locator.set_event_manager(event_manager)

    self._state = RecoverableState(
        default_state=SessionPoolModel(
            max_pool_size=max_pool_size,
            sessions={},
        ),
        logger=logger,
        persistence_enabled=persistence_enabled,
        persist_state_kvs_name=persist_state_kvs_name,
        persist_state_key=persist_state_key or 'CRAWLEE_SESSION_POOL_STATE',
    )

    self._max_pool_size = max_pool_size
    self._session_settings = session_settings
    self._create_session_function = create_session_function
    self._persistence_enabled = persistence_enabled

    # Flag to indicate the context state.
    self._active = False
@ensure_context
async def get_session(self) -> Session:
    """Retrieve a random session from the pool.

    The pool is first topped up to its maximum size and a random session is drawn from it. When the drawn
    session is not usable, all unusable sessions are removed from the pool and a freshly created session
    is returned instead.

    Returns:
        The session object.
    """
    await self._fill_sessions_to_max()

    candidate = self._get_random_session()
    if not candidate.is_usable:
        # The drawn session cannot be used: clean up the pool and hand out a new session.
        self._remove_retired_sessions()
        candidate = await self._create_new_session()

    return candidate
def _get_random_session(self) -> Session:
    """Pick one session from the pool at random.

    Raises:
        ValueError: If the pool holds no sessions.
    """
    pooled_sessions = self._state.current_value.sessions
    if not pooled_sessions:
        raise ValueError('No sessions available in the pool.')

    candidates = list(pooled_sessions.values())
    return random.choice(candidates)
async def capture_snapshot(
    self,
    error_message: str,
    file_and_line: str,
    context: BasicCrawlingContext,
) -> None:
    """Capture an error snapshot and store it in a key value store.

    The snapshot is written straight to a key value store. `context.get_key_value_store` cannot be used for
    this, because it returns `KeyValueStoreChangeRecords`, which are committed to the key value store only
    when the `RequestHandler` returns without an exception, whereas the `ErrorSnapshotter` is active only
    when the `RequestHandler` fails with an exception.

    Args:
        error_message: Used in filename of the snapshot.
        file_and_line: Used in filename of the snapshot.
        context: Context that is used to get the snapshot.
    """
    snapshot = await context.get_snapshot()
    if not snapshot:
        # Nothing to store for this context.
        return

    kvs = await KeyValueStore.open(name=self._kvs_name)
    base_name = self._get_snapshot_base_name(error_message, file_and_line)

    # Each available part of the snapshot is saved in its own task; all tasks are awaited together.
    pending_saves: list[asyncio.Task] = []
    if snapshot.html:
        html_task = asyncio.create_task(self._save_html(kvs, snapshot.html, base_name=base_name))
        pending_saves.append(html_task)
    if snapshot.screenshot:
        screenshot_task = asyncio.create_task(self._save_screenshot(kvs, snapshot.screenshot, base_name=base_name))
        pending_saves.append(screenshot_task)

    await asyncio.gather(*pending_saves)
class ErrorTracker:
    """Track errors and aggregate their counts by similarity.

    Errors are grouped on three nested levels: the file and line of the innermost traceback frame, the error
    class name, and the error message. The first two levels are matched exactly. Messages are merged into one
    group when they are similar enough, with the differing words replaced by `***`.
    """

    def __init__(
        self,
        *,
        snapshot_kvs_name: str | None = None,
        show_error_name: bool = True,
        show_file_and_line_number: bool = True,
        show_error_message: bool = True,
        show_full_message: bool = False,
        save_error_snapshots: bool = False,
    ) -> None:
        """Initialize a new instance.

        Args:
            snapshot_kvs_name: Name passed to the `ErrorSnapshotter` that is created when
                `save_error_snapshots` is enabled.
            show_error_name: Whether the error class name is used as a grouping level.
            show_file_and_line_number: Whether the file name and line number of the innermost traceback
                frame are used as a grouping level.
            show_error_message: Whether the error message is used as a grouping level.
            show_full_message: Whether the whole message is used instead of only its first line.
            save_error_snapshots: Whether an `ErrorSnapshotter` is created to capture snapshots of errors.

        Raises:
            ValueError: If `show_full_message` is enabled while `show_error_message` is disabled.
        """
        self.error_snapshotter = ErrorSnapshotter(snapshot_kvs_name=snapshot_kvs_name) if save_error_snapshots else None
        self.show_error_name = show_error_name
        self.show_file_and_line_number = show_file_and_line_number
        self.show_error_message = show_error_message
        if show_full_message and not show_error_message:
            raise ValueError('`show_error_message` must be `True` if `show_full_message` is set to `True`')
        self.show_full_message = show_full_message
        # Nested mapping: file and line -> error name -> Counter of (possibly generalized) messages.
        self._errors: ErrorFilenameGroups = defaultdict(lambda: defaultdict(Counter))
        # Identities of errors that were already counted through an early report.
        self._early_reported_errors = set[int]()

    async def add(
        self,
        error: Exception,
        *,
        context: BasicCrawlingContext | None = None,
        early: bool = False,
    ) -> None:
        """Add an error in the statistics.

        Args:
            error: Error to be added to statistics.
            context: Context used to collect error snapshot.
            early: Flag indicating that the error is added earlier than usual to have access to resources that
                will be closed before normal error collection. This prevents double reporting during normal error
                collection.
        """
        error_id = id(error)
        if error_id in self._early_reported_errors:
            # Error had to be collected earlier before relevant resources are closed.
            self._early_reported_errors.remove(error_id)
            return

        if early:
            self._early_reported_errors.add(error_id)

        error_group_name = error.__class__.__name__ if self.show_error_name else None
        error_group_message = self._get_error_message(error)
        new_error_group_message = ''  # In case of wildcard similarity match
        error_group_file_and_line = self._get_file_and_line(error)

        # First two levels are grouped only in case of exact match.
        specific_groups = self._errors[error_group_file_and_line][error_group_name]

        # Lowest level group is matched by similarity.
        if error_group_message in specific_groups:
            # Exact match.
            specific_groups.update([error_group_message])
        else:
            for existing_error_group_message in specific_groups:
                # Add to first group with similar text. Modify text with wildcard characters if necessary.
                if new_error_group_message := self._create_generic_message(
                    existing_error_group_message, error_group_message
                ):
                    # Replace old name. The loop is left right away, so changing the mapping here is safe.
                    specific_groups[new_error_group_message] = specific_groups.pop(existing_error_group_message)
                    # Increment.
                    specific_groups.update([new_error_group_message])
                    break
            else:
                # No similar message found. Create new group.
                specific_groups.update([error_group_message])

        counted_message = new_error_group_message or error_group_message
        if specific_groups[counted_message] == 1 and context is not None:
            # Save snapshot only on the first occurrence of the error and only if context and kvs was passed as well.
            await self._capture_error_snapshot(
                error_message=counted_message,
                file_and_line=error_group_file_and_line,
                context=context,
            )

    async def _capture_error_snapshot(
        self, error_message: str, file_and_line: str, context: BasicCrawlingContext
    ) -> None:
        """Capture a snapshot through the snapshotter, if one is configured; failures are logged, not raised."""
        if self.error_snapshotter:
            try:
                await self.error_snapshotter.capture_snapshot(
                    error_message=error_message, file_and_line=file_and_line, context=context
                )
            except Exception:
                logger.exception(f'Error when trying to collect error snapshot for exception: {error_message}')

    def _get_file_and_line(self, error: Exception) -> str:
        """Return `<file name>:<line number>` of the innermost traceback frame of the error.

        An empty string is returned when this grouping level is disabled or when the error carries no
        traceback frames (for example an exception instance that was never raised).
        """
        if not self.show_file_and_line_number:
            return ''

        error_traceback = traceback.extract_tb(error.__traceback__)
        if not error_traceback:
            # Without frames there is no location to report; indexing the empty summary would fail.
            return ''

        # Show only the most specific frame.
        innermost_frame = error_traceback[-1]
        # Keep only the file name, whichever path separator the frame's file path uses.
        file_name = innermost_frame.filename.replace('\\', '/').rsplit('/', 1)[-1]
        return f'{file_name}:{innermost_frame.lineno}'

    def _get_error_message(self, error: Exception) -> str:
        """Return the message used for grouping, or an empty string when message grouping is disabled.

        The message is the first argument of the error, or its `__context__` when it has no arguments.
        When that value is falsy, the error class name is used. Unless `show_full_message` is set, only
        the first line is returned.
        """
        if not self.show_error_message:
            return ''

        error_content = error.args[0] if error.args else error.__context__
        error_content = str(error_content) if error_content else error.__class__.__name__
        if self.show_full_message:
            return error_content
        return error_content.split('\n')[0]

    @property
    def unique_error_count(self) -> int:
        """Number of distinct kinds of errors."""
        return sum(
            len(name_group)
            for file_and_line_group in self._errors.values()
            for name_group in file_and_line_group.values()
        )

    @property
    def total(self) -> int:
        """Total number of errors."""
        return sum(
            sum(name_group.values())
            for file_and_line_group in self._errors.values()
            for name_group in file_and_line_group.values()
        )

    def get_most_common_errors(self, n: int = 3) -> list[tuple[str | None, int]]:
        """Return n most common errors."""
        all_errors: Counter[GroupName] = Counter()
        for file_and_line_group_name, file_and_line_group in self._errors.items():
            for name_group_name, name_group in file_and_line_group.items():
                for message_group_name, count in name_group.items():
                    error_repr = self._get_error_repr(file_and_line_group_name, name_group_name, message_group_name)
                    all_errors[error_repr] = count
        return all_errors.most_common(n)

    def _get_error_repr(self, file_and_line: str | None, name: str | None, message: str | None) -> str:
        """Get the most specific error representation."""
        file_and_line_part = f'{file_and_line}:' if file_and_line else ''
        name_part = f'{name}:' if name else ''
        message_part = f'{message}' if message else ''
        return f'{file_and_line_part}{name_part}{message_part}'

    @staticmethod
    def _create_generic_message(message_1: str | None, message_2: str | None) -> str:
        """Create a generic error message from two messages, if they are similar enough.

        Different parts of similar messages are replaced by `***`. An empty string is returned when either
        message is `None` or when the messages are too different.
        """
        if message_1 is None or message_2 is None:
            return ''

        replacement_string = '***'
        replacement_count = 0

        generic_message_parts = []
        message_1_parts = message_1.split(' ')
        message_2_parts = message_2.split(' ')
        parts_count = min(len(message_1_parts), len(message_2_parts))

        # Words are compared position by position; the shorter message is padded with empty strings.
        for message_1_part, message_2_part in zip_longest(message_1_parts, message_2_parts, fillvalue=''):
            if message_1_part != message_2_part:
                generic_message_parts.append(replacement_string)
                replacement_count += 1
                if replacement_count >= parts_count / 2:
                    # Messages are too different.
                    return ''
            else:
                generic_message_parts.append(message_1_part)
        return ' '.join(generic_message_parts)
ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants') stats_id: Annotated[int | None, Field(alias='statsId')] = None requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0 requests_failed: Annotated[int, Field(alias='requestsFailed')] = 0 requests_retries: Annotated[int, Field(alias='requestsRetries')] = 0 requests_failed_per_minute: Annotated[float, Field(alias='requestsFailedPerMinute')] = 0 requests_finished_per_minute: Annotated[float, Field(alias='requestsFinishedPerMinute')] = 0 request_min_duration: Annotated[timedelta_ms | None, Field(alias='requestMinDurationMillis')] = None request_max_duration: Annotated[timedelta_ms | None, Field(alias='requestMaxDurationMillis')] = None request_total_failed_duration: Annotated[timedelta_ms, Field(alias='requestTotalFailedDurationMillis')] = ( timedelta() ) request_total_finished_duration: Annotated[timedelta_ms, Field(alias='requestTotalFinishedDurationMillis')] = ( timedelta() ) crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: errors: dict[str, Any] = {} retry_errors: dict[str, Any] = {} requests_with_status_code: dict[str, int] = {} else: errors: Annotated[dict[str, Any], Field(default_factory=dict)] retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)] requests_with_status_code: Annotated[ dict[str, int], Field(alias='requestsWithStatusCode', default_factory=dict), ] stats_persisted_at: Annotated[ datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc)) ] = None request_retry_histogram: Annotated[ dict[int, int], Field(alias='requestRetryHistogram'), 
@property
def crawler_runtime(self) -> timedelta:
    """Total crawler runtime: the stored runtime offset plus the time elapsed since the last start.

    When `crawler_last_started_at` is not set, only the runtime offset is returned. Otherwise the elapsed
    time is measured up to `crawler_finished_at`, or up to the current UTC time when that is not set.
    """
    last_started_at = self.crawler_last_started_at
    if not last_started_at:
        return self._runtime_offset

    measured_until = self.crawler_finished_at or datetime.now(timezone.utc)
    return self._runtime_offset + measured_until - last_started_at
@computed_field(alias='crawlerRuntimeMillis')
def crawler_runtime_for_serialization(self) -> timedelta:
    """Crawler runtime exposed as a computed field under the `crawlerRuntimeMillis` alias.

    The value is the stored runtime offset plus, when `crawler_last_started_at` is set, the time elapsed
    from it to `crawler_finished_at` (or to the current UTC time when that is not set).
    """
    last_started_at = self.crawler_last_started_at
    if not last_started_at:
        return self._runtime_offset

    measured_until = self.crawler_finished_at or datetime.now(timezone.utc)
    return self._runtime_offset + measured_until - last_started_at
class RequestProcessingRecord:
    """Bookkeeping for one request: how many times it was started and how long its last run took."""

    def __init__(self) -> None:
        self._last_run_at_ns: int | None = None
        self._runs = 0
        self.duration: timedelta | None = None

    def run(self) -> int:
        """Register the start of a processing attempt and return the number of attempts so far."""
        self._runs += 1
        self._last_run_at_ns = time.perf_counter_ns()
        return self._runs

    def finish(self) -> timedelta:
        """Register the end of the current attempt and return its duration.

        Raises:
            RuntimeError: If `run` has never been called on this record.
        """
        started_at_ns = self._last_run_at_ns
        if started_at_ns is None:
            raise RuntimeError('Invalid state')

        # The elapsed nanoseconds are rounded up to whole microseconds.
        elapsed_ns = time.perf_counter_ns() - started_at_ns
        self.duration = timedelta(microseconds=math.ceil(elapsed_ns / 1000))
        return self.duration

    @property
    def retry_count(self) -> int:
        """Number of attempts beyond the first one (never negative)."""
        return self._runs - 1 if self._runs > 1 else 0
@docs_group('Statistics')
class Statistics(Generic[TStatisticsState]):
    """A class for collecting, tracking, and logging runtime statistics for requests.

    It is designed to record information such as request durations, retries, successes, and failures,
    enabling analysis of crawler performance. The collected statistics are persisted to a `KeyValueStore`,
    ensuring they remain available across crawler migrations, abortions, and restarts. This persistence
    allows for tracking and evaluation of crawler behavior over its lifecycle.
    """

    __next_id = 0

    def __init__(
        self,
        *,
        persistence_enabled: bool | Literal['explicit_only'] = False,
        persist_state_kvs_name: str | None = None,
        persist_state_key: str | None = None,
        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
        log_message: str = 'Statistics',
        periodic_message_logger: Logger | None = None,
        log_interval: timedelta = timedelta(minutes=1),
        state_model: type[TStatisticsState],
        statistics_log_format: Literal['table', 'inline'] = 'table',
        save_error_snapshots: bool = False,
    ) -> None:
        """Initialize a new instance.

        Args:
            persistence_enabled: Forwarded to the underlying `RecoverableState`.
            persist_state_kvs_name: Forwarded to the underlying `RecoverableState`; also passed to the main
                error tracker as its snapshot store name.
            persist_state_key: Key of the persisted state. Defaults to `__CRAWLER_STATISTICS_{id}`.
            persist_state_kvs_factory: Forwarded to the underlying `RecoverableState`.
            log_message: Message emitted with every periodic statistics log.
            periodic_message_logger: Logger used for the periodic message. Defaults to this module's logger.
            log_interval: Interval handed to the periodic logging task.
            state_model: Model class instantiated (with `stats_id`) as the default state.
            statistics_log_format: `'table'` logs the rendered table, `'inline'` passes the values as `extra`.
            save_error_snapshots: Forwarded to the main error tracker.
        """
        self._id = Statistics.__next_id
        Statistics.__next_id += 1

        # Remembered so that `reset` can rebuild error trackers equivalent to the ones created here.
        self._save_error_snapshots = save_error_snapshots
        self._snapshot_kvs_name = persist_state_kvs_name

        self.error_tracker = ErrorTracker(
            save_error_snapshots=save_error_snapshots,
            snapshot_kvs_name=persist_state_kvs_name,
        )
        self.error_tracker_retry = ErrorTracker(save_error_snapshots=False)

        self._requests_in_progress = dict[str, RequestProcessingRecord]()

        self._state = RecoverableState(
            default_state=state_model(stats_id=self._id),
            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
            persistence_enabled=persistence_enabled,
            persist_state_kvs_name=persist_state_kvs_name,
            persist_state_kvs_factory=persist_state_kvs_factory,
            logger=logger,
        )

        self._log_message = log_message
        self._statistics_log_format = statistics_log_format
        self._periodic_message_logger = periodic_message_logger or logger
        # Remembered so that `replace_state_model` can give the copy a task with the same interval.
        self._log_interval = log_interval
        self._periodic_logger = RecurringTask(self._log, log_interval)

        # Flag to indicate the context state.
        self._active = False

    def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
        """Create near copy of the `Statistics` with replaced `state_model`.

        The copy keeps the persistence settings read back from the recoverable state, the log message,
        the periodic logger, the log interval and the log format. It builds its own periodic logging task
        bound to its own `_log`, so periodic messages describe the copy and not the instance it was
        derived from.
        """
        # NOTE(review): `persist_state_kvs_name` and `save_error_snapshots` are not carried over to the
        # copy (only the key and the KVS factory are read back from the state) - confirm this is intended.
        new_statistics: Statistics[TNewStatisticsState] = Statistics(
            persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
            persist_state_key=self._state._persist_state_key,  # noqa: SLF001
            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
            log_message=self._log_message,
            periodic_message_logger=self._periodic_message_logger,
            log_interval=self._log_interval,
            state_model=state_model,
            statistics_log_format=self._statistics_log_format,
        )
        return new_statistics

    @staticmethod
    def with_default_state(
        *,
        persistence_enabled: bool | Literal['explicit_only'] = False,
        persist_state_kvs_name: str | None = None,
        persist_state_key: str | None = None,
        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
        log_message: str = 'Statistics',
        periodic_message_logger: Logger | None = None,
        log_interval: timedelta = timedelta(minutes=1),
        statistics_log_format: Literal['table', 'inline'] = 'table',
        save_error_snapshots: bool = False,
    ) -> Statistics[StatisticsState]:
        """Initialize a new instance with default state model `StatisticsState`.

        All arguments are passed to the constructor unchanged.
        """
        return Statistics[StatisticsState](
            persistence_enabled=persistence_enabled,
            persist_state_kvs_name=persist_state_kvs_name,
            persist_state_key=persist_state_key,
            persist_state_kvs_factory=persist_state_kvs_factory,
            log_message=log_message,
            periodic_message_logger=periodic_message_logger,
            log_interval=log_interval,
            state_model=StatisticsState,
            statistics_log_format=statistics_log_format,
            save_error_snapshots=save_error_snapshots,
        )

    @property
    def active(self) -> bool:
        """Indicate whether the context is active."""
        return self._active

    async def __aenter__(self) -> Self:
        """Subscribe to events and start collecting statistics.

        Raises:
            RuntimeError: If the context manager is already active.
        """
        if self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is already active.')

        await self._state.initialize()
        # Reset `crawler_finished_at` to indicate a new run in progress.
        self.state.crawler_finished_at = None

        # Start periodic logging and let it print initial state before activation.
        self._periodic_logger.start()
        await asyncio.sleep(0.01)
        self._active = True

        self.state.crawler_last_started_at = datetime.now(timezone.utc)
        # The very first start time is kept across runs; only the "last started" timestamp moves.
        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Stop collecting statistics.

        Raises:
            RuntimeError: If the context manager is not active, or if the start timestamp is missing.
        """
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

        if not self.state.crawler_last_started_at:
            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')

        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
        await self._periodic_logger.stop()
        self.state.crawler_finished_at = datetime.now(timezone.utc)
        self._active = False
        await self._state.teardown()

    @property
    def state(self) -> TStatisticsState:
        """The current value of the underlying recoverable state."""
        return self._state.current_value

    @ensure_context
    def register_status_code(self, code: int) -> None:
        """Increment the number of times a status code has been received."""
        state = self._state.current_value
        # Status codes are keyed by their string form.
        state.requests_with_status_code.setdefault(str(code), 0)
        state.requests_with_status_code[str(code)] += 1

    @ensure_context
    def record_request_processing_start(self, request_id_or_key: str) -> None:
        """Mark a request as started.

        A request that is already tracked reuses its record, so repeated starts count as retries.
        """
        record = self._requests_in_progress.get(request_id_or_key, RequestProcessingRecord())
        record.run()
        self._requests_in_progress[request_id_or_key] = record

    @ensure_context
    def record_request_processing_finish(self, request_id_or_key: str) -> None:
        """Mark a request as finished.

        Does nothing when the request is not currently tracked as in progress.
        """
        record = self._requests_in_progress.get(request_id_or_key)
        if record is None:
            return

        state = self._state.current_value
        duration = record.finish()

        state.requests_finished += 1
        state.request_total_finished_duration += duration
        self._save_retry_count_for_request(record)
        # `None` means "no sample yet"; the sentinels make the first duration win both comparisons.
        state.request_min_duration = min(
            state.request_min_duration if state.request_min_duration is not None else timedelta.max, duration
        )
        state.request_max_duration = max(
            state.request_max_duration if state.request_max_duration is not None else timedelta(), duration
        )

        del self._requests_in_progress[request_id_or_key]

    @ensure_context
    def record_request_processing_failure(self, request_id_or_key: str) -> None:
        """Mark a request as failed.

        Does nothing when the request is not currently tracked as in progress.
        """
        record = self._requests_in_progress.get(request_id_or_key)
        if record is None:
            return

        state = self._state.current_value

        state.request_total_failed_duration += record.finish()
        state.requests_failed += 1
        self._save_retry_count_for_request(record)

        del self._requests_in_progress[request_id_or_key]

    def calculate(self) -> FinalStatistics:
        """Calculate the current statistics."""
        total_minutes = self.state.crawler_runtime.total_seconds() / 60
        state = self._state.current_value
        # The retry histogram is taken from the serialized state (dumped by field name, not by alias).
        serialized_state = state.model_dump(by_alias=False)
        return FinalStatistics(
            request_avg_failed_duration=state.request_avg_failed_duration,
            request_avg_finished_duration=state.request_avg_finished_duration,
            # A zero runtime would divide by zero, so the per-minute rates are reported as 0 then.
            requests_finished_per_minute=round(state.requests_finished / total_minutes) if total_minutes else 0,
            requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
            request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
            requests_total=state.requests_failed + state.requests_finished,
            crawler_runtime=state.crawler_runtime,
            requests_finished=state.requests_finished,
            requests_failed=state.requests_failed,
            retry_histogram=serialized_state['request_retry_histogram'],
        )

    async def reset(self) -> None:
        """Reset the statistics to their defaults and remove any persistent state.

        The error trackers are recreated with the same settings the constructor used, so the error
        snapshot configuration survives a reset.
        """
        await self._state.reset()
        self.error_tracker = ErrorTracker(
            save_error_snapshots=self._save_error_snapshots,
            snapshot_kvs_name=self._snapshot_kvs_name,
        )
        self.error_tracker_retry = ErrorTracker(save_error_snapshots=False)
        self._requests_in_progress.clear()

    def _log(self) -> None:
        """Emit the current statistics through the periodic logger in the configured format."""
        stats = self.calculate()
        if self._statistics_log_format == 'table':
            self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}')
        else:
            self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())

    def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
        """Update the retry counter and the retry histogram with the retry count of `record`."""
        retry_count = record.retry_count
        state = self._state.current_value

        if retry_count:
            state.requests_retries += 1

        # Requests without retries are counted as well, in bucket 0.
        state.request_retry_histogram.setdefault(retry_count, 0)
        state.request_retry_histogram[retry_count] += 1
class DatasetClient(ABC):
    """An abstract class for dataset storage clients.

    Dataset clients provide an interface for accessing and manipulating dataset storage. They handle
    operations like adding and getting dataset items across different storage backends.

    Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,
    `RequestQueue`), and can operate with various storage systems including memory, file system,
    databases, and cloud storage solutions.

    This abstract class defines the interface that all specific dataset clients must implement.
    """

    @abstractmethod
    async def get_metadata(self) -> DatasetMetadata:
        """Get the metadata of the dataset."""

    @abstractmethod
    async def drop(self) -> None:
        """Drop the whole dataset and remove all its items.

        The backend method for the `Dataset.drop` call.
        """

    @abstractmethod
    async def purge(self) -> None:
        """Purge all items from the dataset.

        The backend method for the `Dataset.purge` call.
        """

    @abstractmethod
    async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
        """Push data to the dataset.

        The backend method for the `Dataset.push_data` call.

        Args:
            data: Either a single item (a dictionary) or a list of items.
        """

    @abstractmethod
    async def get_data(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> DatasetItemsListPage:
        """Get data from the dataset with various filtering options.

        The backend method for the `Dataset.get_data` call.

        All arguments are keyword-only. Note that `limit` defaults to a very large number here, while
        `iterate_items` defaults to `None`.

        Returns:
            A page of dataset items.
        """

    @abstractmethod
    async def iterate_items(
        self,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
    ) -> AsyncIterator[dict[str, Any]]:
        """Iterate over the dataset items with filtering options.

        The backend method for the `Dataset.iterate_items` call.

        Unlike `get_data`, this method accepts neither `flatten` nor `view`.
        """
        # This syntax is to make type checker properly work with abstract AsyncIterator.
        # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
        # The `yield` below is never reached; its presence alone turns this method into an async
        # generator, which is what implementations are expected to be.
        raise NotImplementedError
        if False:
            yield 0
class KeyValueStoreClient(ABC):
    """An abstract class for key-value store (KVS) storage clients.

    Key-value store clients provide an interface for accessing and manipulating KVS storage. They handle
    operations like getting, setting, deleting KVS values across different storage backends.

    Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,
    `RequestQueue`), and can operate with various storage systems including memory, file system,
    databases, and cloud storage solutions.

    This abstract class defines the interface that all specific KVS clients must implement.
    """

    @abstractmethod
    async def get_metadata(self) -> KeyValueStoreMetadata:
        """Get the metadata of the key-value store."""

    @abstractmethod
    async def drop(self) -> None:
        """Drop the whole key-value store and remove all its values.

        The backend method for the `KeyValueStore.drop` call.
        """

    @abstractmethod
    async def purge(self) -> None:
        """Purge all items from the key-value store.

        The backend method for the `KeyValueStore.purge` call.
        """

    @abstractmethod
    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
        """Retrieve the given record from the key-value store.

        The backend method for the `KeyValueStore.get_value` call.

        Args:
            key: The key of the record to retrieve.

        Returns:
            The record, or `None`.
        """

    @abstractmethod
    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
        """Set a value in the key-value store by its key.

        The backend method for the `KeyValueStore.set_value` call.

        Args:
            key: The key under which the value is stored.
            value: The value to store.
            content_type: Optional content type of the value.
        """

    @abstractmethod
    async def delete_value(self, *, key: str) -> None:
        """Delete a value from the key-value store by its key.

        The backend method for the `KeyValueStore.delete_value` call.

        Args:
            key: The key of the value to delete.
        """

    @abstractmethod
    async def iterate_keys(
        self,
        *,
        exclusive_start_key: str | None = None,
        limit: int | None = None,
    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
        """Iterate over all the existing keys in the key-value store.

        The backend method for the `KeyValueStore.iterate_keys` call.
        """
        # This syntax is to make type checker properly work with abstract AsyncIterator.
        # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
        # The `yield` below is never reached; its presence alone turns this method into an async
        # generator, which is what implementations are expected to be.
        raise NotImplementedError
        if False:
            yield 0

    @abstractmethod
    async def get_public_url(self, *, key: str) -> str:
        """Get the public URL for the given key.

        The backend method for the `KeyValueStore.get_public_url` call.

        Args:
            key: The key to get the public URL for.
        """

    @abstractmethod
    async def record_exists(self, *, key: str) -> bool:
        """Check if a record with the given key exists in the key-value store.

        The backend method for the `KeyValueStore.record_exists` call.

        Args:
            key: The key to check for existence.

        Returns:
            True if a record with the given key exists, False otherwise.
        """
class RequestQueueClient(ABC):
    """An abstract class for request queue resource clients.

    These clients are specific to the type of resource they manage and operate under a designated storage
    client, like a memory storage client.
    """

    @abstractmethod
    async def get_metadata(self) -> RequestQueueMetadata:
        """Get the metadata of the request queue."""

    @abstractmethod
    async def drop(self) -> None:
        """Drop the whole request queue and remove all its values.

        The backend method for the `RequestQueue.drop` call.
        """

    @abstractmethod
    async def purge(self) -> None:
        """Purge all items from the request queue.

        The backend method for the `RequestQueue.purge` call.
        """

    @abstractmethod
    async def add_batch_of_requests(
        self,
        requests: Sequence[Request],
        *,
        forefront: bool = False,
    ) -> AddRequestsResponse:
        """Add batch of requests to the queue.

        This method adds a batch of requests to the queue. Each request is processed based on its uniqueness
        (determined by `unique_key`). Duplicates will be identified but not re-added to the queue.

        Args:
            requests: The collection of requests to add to the queue.
            forefront: Whether to put the added requests at the beginning (True) or the end (False) of the queue.
                When True, the requests will be processed sooner than previously added requests.

        Returns:
            A response object containing information about which requests were successfully
            processed and which failed (if any).
        """

    @abstractmethod
    async def get_request(self, unique_key: str) -> Request | None:
        """Retrieve a request from the queue.

        Args:
            unique_key: Unique key of the request to retrieve.

        Returns:
            The retrieved request, or None, if it did not exist.
        """

    @abstractmethod
    async def fetch_next_request(self) -> Request | None:
        """Return the next request in the queue to be processed.

        Once you successfully finish processing of the request, you need to call
        `RequestQueue.mark_request_as_handled` to mark the request as handled in the queue. If there was
        some error in processing the request, call `RequestQueue.reclaim_request` instead, so that the queue
        will give the request to some other consumer in another call to the `fetch_next_request` method.

        Note that the `None` return value does not mean the queue processing finished, it means there are
        currently no pending requests. To check whether all requests in queue were finished, use
        `RequestQueue.is_finished` instead.

        Returns:
            The request or `None` if there are no more pending requests.
        """

    @abstractmethod
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Mark a request as handled after successful processing.

        Handled requests will never again be returned by the `RequestQueue.fetch_next_request` method.

        Args:
            request: The request to mark as handled.

        Returns:
            Information about the queue operation. `None` if the given request was not in progress.
        """

    @abstractmethod
    async def reclaim_request(
        self,
        request: Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Reclaim a failed request back to the queue.

        The request will be returned for processing later again by another call
        to `RequestQueue.fetch_next_request`.

        Args:
            request: The request to return to the queue.
            forefront: Whether to add the request to the head or the end of the queue.

        Returns:
            Information about the queue operation. `None` if the given request was not in progress.
        """

    @abstractmethod
    async def is_empty(self) -> bool:
        """Check if the request queue is empty.

        Returns:
            True if the request queue is empty, False otherwise.
        """
@docs_group('Storage clients')
class StorageClient(ABC):
    """Base class for storage clients.

    The `StorageClient` is the abstract interface for reaching Crawlee's storage types: datasets,
    key-value stores, and request queues. It declares one factory method per storage type and carries
    the functionality shared by all implementations.

    Storage client implementations can be provided for various backends (file system, memory, databases,
    various cloud providers, etc.) to support different use cases from development to production
    environments.

    Each storage client implementation is responsible for ensuring proper initialization, data persistence
    (where applicable), and consistent access patterns across all storage types it supports.
    """

    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:  # noqa: ARG002
        """Return a cache key that can differentiate between different storages of this and other clients.

        Subclasses may derive the key from the configuration or from the client itself. This default
        ignores the configuration and returns the module and name of the client class.
        """
        client_class = self.__class__
        return f'{client_class.__module__}.{client_class.__name__}'

    @abstractmethod
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> DatasetClient:
        """Create a dataset client."""

    @abstractmethod
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> KeyValueStoreClient:
        """Create a key-value store client."""

    @abstractmethod
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> RequestQueueClient:
        """Create a request queue client."""

    def get_rate_limit_errors(self) -> dict[int, int]:
        """Return statistics about rate limit errors encountered by the HTTP client in storage client.

        This base implementation reports nothing and returns an empty dictionary.
        """
        return {}

    async def _purge_if_needed(
        self,
        client: DatasetClient | KeyValueStoreClient | RequestQueueClient,
        configuration: Configuration,
    ) -> None:
        """Purge the client if needed.

        Purging happens only when the configuration asks for it (`purge_on_start`) and the storage has no
        name. Named storages are considered global and will typically outlive the run, so they are kept.

        Args:
            client: The storage client to potentially purge.
            configuration: Configuration that determines whether purging should occur.
        """
        # The metadata is fetched unconditionally, before the configuration is consulted.
        metadata = await client.get_metadata()

        if not configuration.purge_on_start:
            return
        if metadata.name is not None:
            return

        await client.purge()
FileSystemDatasetClient(DatasetClient): """File system implementation of the dataset client. This client persists dataset items to the file system as individual JSON files within a structured directory hierarchy following the pattern: ``` {STORAGE_DIR}/datasets/{DATASET_ID}/{ITEM_ID}.json ``` Each item is stored as a separate file, which allows for durability and the ability to recover after process termination. Dataset operations like filtering, sorting, and pagination are implemented by processing the stored files according to the requested parameters. This implementation is ideal for long-running crawlers where data persistence is important, and for development environments where you want to easily inspect the collected data between runs. """ _STORAGE_SUBDIR = 'datasets' """The name of the subdirectory where datasets are stored.""" _STORAGE_SUBSUBDIR_DEFAULT = 'default' """The name of the subdirectory for the default dataset.""" _ITEM_FILENAME_DIGITS = 9 """Number of digits used for the dataset item file names (e.g., 000000019.json).""" def __init__( self, *, metadata: DatasetMetadata, path_to_dataset: Path, lock: asyncio.Lock, ) -> None: """Initialize a new instance. Preferably use the `FileSystemDatasetClient.open` class method to create a new instance. 
""" self._metadata = metadata self._path_to_dataset = path_to_dataset """The full path to the dataset directory.""" self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @override async def get_metadata(self) -> DatasetMetadata: return self._metadata @property def path_to_dataset(self) -> Path: """The full path to the dataset directory.""" return self._path_to_dataset @property def path_to_metadata(self) -> Path: """The full path to the dataset metadata file.""" return self.path_to_dataset / METADATA_FILENAME @classmethod async def open( cls, *, id: str | None, name: str | None, alias: str | None, configuration: Configuration, ) -> Self: """Open or create a file system dataset client. This method attempts to open an existing dataset from the file system. If a dataset with the specified ID or name exists, it loads the metadata from the stored files. If no existing dataset is found, a new one is created. Args: id: The ID of the dataset to open. If provided, searches for existing dataset by ID. name: The name of the dataset for named (global scope) storages. alias: The alias of the dataset for unnamed (run scope) storages. configuration: The configuration object containing storage directory settings. Returns: An instance for the opened or created storage client. Raises: ValueError: If a dataset with the specified ID is not found, if metadata is invalid, or if both name and alias are provided. """ # Validate input parameters. raise_if_too_many_kwargs(id=id, name=name, alias=alias) dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR if not dataset_base_path.exists(): await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True) # Get a new instance by ID. 
if id: found = False for dataset_dir in dataset_base_path.iterdir(): if not dataset_dir.is_dir(): continue path_to_metadata = dataset_dir / METADATA_FILENAME if not path_to_metadata.exists(): continue try: file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8') try: file_content = json.load(file) metadata = DatasetMetadata(**file_content) if metadata.id == id: client = cls( metadata=metadata, path_to_dataset=dataset_base_path / dataset_dir, lock=asyncio.Lock(), ) await client._update_metadata(update_accessed_at=True) found = True break finally: await asyncio.to_thread(file.close) except (json.JSONDecodeError, ValidationError): continue if not found: raise ValueError(f'Dataset with ID "{id}" not found') # Get a new instance by name or alias. else: dataset_dir = Path(name) if name else Path(alias) if alias else Path('default') path_to_dataset = dataset_base_path / dataset_dir path_to_metadata = path_to_dataset / METADATA_FILENAME # If the dataset directory exists, reconstruct the client from the metadata file. if path_to_dataset.exists() and path_to_metadata.exists(): file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8') try: file_content = json.load(file) finally: await asyncio.to_thread(file.close) try: metadata = DatasetMetadata(**file_content) except ValidationError as exc: raise ValueError(f'Invalid metadata file for dataset "{name or alias}"') from exc client = cls( metadata=metadata, path_to_dataset=path_to_dataset, lock=asyncio.Lock(), ) await client._update_metadata(update_accessed_at=True) # Otherwise, create a new dataset client. 
else: now = datetime.now(timezone.utc) metadata = DatasetMetadata( id=crypto_random_object_id(), name=name, created_at=now, accessed_at=now, modified_at=now, item_count=0, ) client = cls( metadata=metadata, path_to_dataset=path_to_dataset, lock=asyncio.Lock(), ) await client._update_metadata() return client @override async def drop(self) -> None: async with self._lock: if self.path_to_dataset.exists(): await asyncio.to_thread(shutil.rmtree, self.path_to_dataset) @override async def purge(self) -> None: async with self._lock: for file_path in await self._get_sorted_data_files(): await asyncio.to_thread(file_path.unlink, missing_ok=True) await self._update_metadata( update_accessed_at=True, update_modified_at=True, new_item_count=0, ) @override async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: async with self._lock: new_item_count = self._metadata.item_count if isinstance(data, list): for item in data: new_item_count += 1 await self._push_item(item, new_item_count) else: new_item_count += 1 await self._push_item(data, new_item_count) # now update metadata under the same lock await self._update_metadata( update_accessed_at=True, update_modified_at=True, new_item_count=new_item_count, ) @override async def get_data( self, *, offset: int = 0, limit: int | None = 999_999_999_999, clean: bool = False, desc: bool = False, fields: list[str] | None = None, omit: list[str] | None = None, unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, flatten: list[str] | None = None, view: str | None = None, ) -> DatasetItemsListPage: # Check for unsupported arguments and log a warning if found. 
unsupported_args: dict[str, Any] = { 'clean': clean, 'fields': fields, 'omit': omit, 'unwind': unwind, 'skip_hidden': skip_hidden, 'flatten': flatten, 'view': view, } unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)} if unsupported: logger.warning( f'The arguments {list(unsupported.keys())} of get_data are not supported by the ' f'{self.__class__.__name__} client.' ) # If the dataset directory does not exist, log a warning and return an empty page. if not self.path_to_dataset.exists(): logger.warning(f'Dataset directory not found: {self.path_to_dataset}') return DatasetItemsListPage( count=0, offset=offset, limit=limit or 0, total=0, desc=desc, items=[], ) # Get the list of sorted data files. async with self._lock: try: data_files = await self._get_sorted_data_files() except FileNotFoundError: # directory was dropped mid-check return DatasetItemsListPage(count=0, offset=offset, limit=limit or 0, total=0, desc=desc, items=[]) total = len(data_files) # Reverse the order if descending order is requested. if desc: data_files.reverse() # Apply offset and limit slicing. selected_files = data_files[offset:] if limit is not None: selected_files = selected_files[:limit] # Read and parse each data file. items = list[dict[str, Any]]() for file_path in selected_files: try: file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8') except FileNotFoundError: logger.warning(f'File disappeared during iterate_items(): {file_path}, skipping') continue try: item = json.loads(file_content) except json.JSONDecodeError: logger.exception(f'Corrupt JSON in {file_path}, skipping') continue # Skip empty items if requested. if skip_empty and not item: continue items.append(item) async with self._lock: await self._update_metadata(update_accessed_at=True) # Return a paginated list page of dataset items. 
return DatasetItemsListPage( count=len(items), offset=offset, limit=limit or total - offset, total=total, desc=desc, items=items, ) @override async def iterate_items( self, *, offset: int = 0, limit: int | None = None, clean: bool = False, desc: bool = False, fields: list[str] | None = None, omit: list[str] | None = None, unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, ) -> AsyncIterator[dict[str, Any]]: # Check for unsupported arguments and log a warning if found. unsupported_args: dict[str, Any] = { 'clean': clean, 'fields': fields, 'omit': omit, 'unwind': unwind, 'skip_hidden': skip_hidden, } unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)} if unsupported: logger.warning( f'The arguments {list(unsupported.keys())} of iterate are not supported ' f'by the {self.__class__.__name__} client.' ) # If the dataset directory does not exist, log a warning and return immediately. if not self.path_to_dataset.exists(): logger.warning(f'Dataset directory not found: {self.path_to_dataset}') return # Get the list of sorted data files. async with self._lock: try: data_files = await self._get_sorted_data_files() except FileNotFoundError: return # Reverse the order if descending order is requested. if desc: data_files.reverse() # Apply offset and limit slicing. selected_files = data_files[offset:] if limit is not None: selected_files = selected_files[:limit] # Iterate over each data file, reading and yielding its parsed content. for file_path in selected_files: try: file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8') except FileNotFoundError: logger.warning(f'File disappeared during iterate_items(): {file_path}, skipping') continue try: item = json.loads(file_content) except json.JSONDecodeError: logger.exception(f'Corrupt JSON in {file_path}, skipping') continue # Skip empty items if requested. 
if skip_empty and not item: continue yield item async with self._lock: await self._update_metadata(update_accessed_at=True) async def _update_metadata( self, *, new_item_count: int | None = None, update_accessed_at: bool = False, update_modified_at: bool = False, ) -> None: """Update the dataset metadata file with current information. Args: new_item_count: If provided, update the item count to this value. update_accessed_at: If True, update the `accessed_at` timestamp to the current time. update_modified_at: If True, update the `modified_at` timestamp to the current time. """ now = datetime.now(timezone.utc) if update_accessed_at: self._metadata.accessed_at = now if update_modified_at: self._metadata.modified_at = now if new_item_count is not None: self._metadata.item_count = new_item_count # Ensure the parent directory for the metadata file exists. await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True) # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) await atomic_write(self.path_to_metadata, data) async def _push_item(self, item: dict[str, Any], item_id: int) -> None: """Push a single item to the dataset. This method writes the item as a JSON file with a zero-padded numeric filename that reflects its position in the dataset sequence. Args: item: The data item to add to the dataset. item_id: The sequential ID to use for this item's filename. """ # Generate the filename for the new item using zero-padded numbering. filename = f'{str(item_id).zfill(self._ITEM_FILENAME_DIGITS)}.json' file_path = self.path_to_dataset / filename # Ensure the dataset directory exists. await asyncio.to_thread(self.path_to_dataset.mkdir, parents=True, exist_ok=True) # Dump the serialized item to the file. data = await json_dumps(item) await atomic_write(file_path, data) async def _get_sorted_data_files(self) -> list[Path]: """Retrieve and return a sorted list of data files in the dataset directory. 
class FileSystemKeyValueStoreClient(KeyValueStoreClient):
    """File system implementation of the key-value store client.

    This client persists data to the file system, making it suitable for scenarios where data needs to
    survive process restarts. Keys are mapped to file paths in a directory structure following the pattern:

    ```
    {STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}
    ```

    Binary data is stored as-is, while JSON and text data are stored in human-readable format.
    The implementation automatically handles serialization based on the content type and
    maintains metadata about each record.

    This implementation is ideal for long-running crawlers where persistence is important and
    for development environments where you want to easily inspect the stored data between runs.
    """

    _STORAGE_SUBDIR = 'key_value_stores'
    """The name of the subdirectory where key-value stores are stored."""

    _STORAGE_SUBSUBDIR_DEFAULT = 'default'
    """The name of the subdirectory for the default key-value store."""

    def __init__(
        self,
        *,
        metadata: KeyValueStoreMetadata,
        path_to_kvs: Path,
        lock: asyncio.Lock,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `FileSystemKeyValueStoreClient.open` class method to create a new instance.
        """
        self._metadata = metadata

        self._path_to_kvs = path_to_kvs
        """The full path to the key-value store directory."""

        self._lock = lock
        """A lock to ensure that only one operation is performed at a time."""

    @override
    async def get_metadata(self) -> KeyValueStoreMetadata:
        return self._metadata

    @property
    def path_to_kvs(self) -> Path:
        """The full path to the key-value store directory."""
        return self._path_to_kvs

    @property
    def path_to_metadata(self) -> Path:
        """The full path to the key-value store metadata file."""
        return self.path_to_kvs / METADATA_FILENAME

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        configuration: Configuration,
    ) -> Self:
        """Open or create a file system key-value store client.

        This method attempts to open an existing key-value store from the file system. If a KVS with the
        specified ID or name exists, it loads the metadata from the stored files. If no existing store is
        found, a new one is created.

        Args:
            id: The ID of the key-value store to open. If provided, searches for existing store by ID.
            name: The name of the key-value store for named (global scope) storages.
            alias: The alias of the key-value store for unnamed (run scope) storages.
            configuration: The configuration object containing storage directory settings.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If a store with the specified ID is not found, if metadata is invalid,
                or if both name and alias are provided.
        """
        # Validate input parameters.
        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

        kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR

        if not kvs_base_path.exists():
            await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True)

        # Get a new instance by ID.
        if id:
            for kvs_dir in kvs_base_path.iterdir():
                if not kvs_dir.is_dir():
                    continue

                path_to_metadata = kvs_dir / METADATA_FILENAME
                if not path_to_metadata.exists():
                    continue

                # Read and close the metadata file before the client is built, so the file is
                # not rewritten by `_update_metadata` while this handle is still open.
                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                try:
                    metadata = KeyValueStoreMetadata(**json.load(file))
                except (json.JSONDecodeError, ValidationError):
                    # Unreadable metadata cannot match the requested ID; try the next directory.
                    continue
                finally:
                    await asyncio.to_thread(file.close)

                if metadata.id != id:
                    continue

                # `iterdir()` already yields paths that start with `kvs_base_path`. Joining the
                # base again would duplicate the prefix when the storage dir is relative.
                client = cls(
                    metadata=metadata,
                    path_to_kvs=kvs_dir,
                    lock=asyncio.Lock(),
                )
                await client._update_metadata(update_accessed_at=True)
                return client

            raise ValueError(f'Key-value store with ID "{id}" not found.')

        # Get a new instance by name or alias.
        path_to_kvs = kvs_base_path / (name or alias or cls._STORAGE_SUBSUBDIR_DEFAULT)
        path_to_metadata = path_to_kvs / METADATA_FILENAME

        # If the key-value store directory exists, reconstruct the client from the metadata file.
        if path_to_kvs.exists() and path_to_metadata.exists():
            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
            try:
                file_content = json.load(file)
            finally:
                await asyncio.to_thread(file.close)

            try:
                metadata = KeyValueStoreMetadata(**file_content)
            except ValidationError as exc:
                raise ValueError(f'Invalid metadata file for key-value store "{name or alias}"') from exc

            client = cls(
                metadata=metadata,
                path_to_kvs=path_to_kvs,
                lock=asyncio.Lock(),
            )
            await client._update_metadata(update_accessed_at=True)
            return client

        # Otherwise, create a new key-value store client.
        now = datetime.now(timezone.utc)
        metadata = KeyValueStoreMetadata(
            id=crypto_random_object_id(),
            name=name,
            created_at=now,
            accessed_at=now,
            modified_at=now,
        )
        client = cls(
            metadata=metadata,
            path_to_kvs=path_to_kvs,
            lock=asyncio.Lock(),
        )
        await client._update_metadata()
        return client

    @override
    async def drop(self) -> None:
        # Check for the directory while holding the lock, so two concurrent drops on this
        # client cannot both pass the check and then race on `rmtree`.
        async with self._lock:
            if self.path_to_kvs.exists():
                await asyncio.to_thread(shutil.rmtree, self.path_to_kvs)

    @override
    async def purge(self) -> None:
        async with self._lock:
            # Materialize the listing first so entries are not deleted while the directory
            # is still being scanned.
            for file_path in list(self.path_to_kvs.glob('*')):
                if file_path.name == METADATA_FILENAME:
                    continue
                await asyncio.to_thread(file_path.unlink, missing_ok=True)

            await self._update_metadata(
                update_accessed_at=True,
                update_modified_at=True,
            )

    @override
    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
        # Update the metadata to record access
        async with self._lock:
            await self._update_metadata(update_accessed_at=True)

        record_path = self.path_to_kvs / self._encode_key(key)

        if not record_path.exists():
            return None

        # Found a file for this key, now look for its metadata
        record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')
        if not record_metadata_filepath.exists():
            logger.warning(f'Found value file for key "{key}" but no metadata file.')
            return None

        # Read the record metadata and the value under one lock acquisition.
        async with self._lock:
            try:
                file = await asyncio.to_thread(
                    functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'),
                )
            except FileNotFoundError:
                logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                return None

            try:
                metadata_content = json.load(file)
            except json.JSONDecodeError:
                logger.warning(f'Invalid metadata file for key "{key}"')
                return None
            finally:
                await asyncio.to_thread(file.close)

            try:
                metadata = KeyValueStoreRecordMetadata(**metadata_content)
            except ValidationError:
                logger.warning(f'Invalid metadata schema for key "{key}"')
                return None

            # Read the actual value
            try:
                value_bytes = await asyncio.to_thread(record_path.read_bytes)
            except FileNotFoundError:
                logger.warning(f'Value file disappeared for key "{key}"')
                return None

        value: Any
        # Handle None values
        if metadata.content_type == 'application/x-none':
            value = None
        # Handle JSON values
        elif 'application/json' in metadata.content_type:
            try:
                value = json.loads(value_bytes.decode('utf-8'))
            except (json.JSONDecodeError, UnicodeDecodeError):
                logger.warning(f'Failed to decode JSON value for key "{key}"')
                return None
        # Handle text values
        elif metadata.content_type.startswith('text/'):
            try:
                value = value_bytes.decode('utf-8')
            except UnicodeDecodeError:
                logger.warning(f'Failed to decode text value for key "{key}"')
                return None
        # Handle binary values
        else:
            value = value_bytes

        # Calculate the size of the value in bytes
        size = len(value_bytes)

        return KeyValueStoreRecord(
            key=metadata.key,
            value=value,
            content_type=metadata.content_type,
            size=size,
        )

    @override
    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
        # Special handling for None values
        if value is None:
            content_type = 'application/x-none'  # Special content type to identify None values
            value_bytes = b''
        else:
            content_type = content_type or infer_mime_type(value)

            # Serialize the value to bytes.
            if 'application/json' in content_type:
                value_bytes = (await json_dumps(value)).encode('utf-8')
            elif isinstance(value, str):
                value_bytes = value.encode('utf-8')
            elif isinstance(value, (bytes, bytearray)):
                value_bytes = value
            else:
                # Fallback: attempt to convert to string and encode.
                value_bytes = str(value).encode('utf-8')

        record_path = self.path_to_kvs / self._encode_key(key)

        # Prepare the metadata
        size = len(value_bytes)
        record_metadata = KeyValueStoreRecordMetadata(key=key, content_type=content_type, size=size)
        record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')
        record_metadata_content = await json_dumps(record_metadata.model_dump())

        async with self._lock:
            # Ensure the key-value store directory exists.
            await asyncio.to_thread(self.path_to_kvs.mkdir, parents=True, exist_ok=True)

            # Write the value to the file.
            await atomic_write(record_path, value_bytes)

            # Write the record metadata to the file.
            await atomic_write(record_metadata_filepath, record_metadata_content)

            # Update the KVS metadata to record the access and modification.
            await self._update_metadata(update_accessed_at=True, update_modified_at=True)

    @override
    async def delete_value(self, *, key: str) -> None:
        record_path = self.path_to_kvs / self._encode_key(key)
        metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')

        async with self._lock:
            # Nothing to delete, so the store metadata is left untouched.
            if not record_path.exists():
                return

            await asyncio.to_thread(record_path.unlink, missing_ok=True)

            # Delete the metadata file if it exists
            if metadata_path.exists():
                await asyncio.to_thread(metadata_path.unlink, missing_ok=True)
            else:
                logger.warning(f'Found value file for key "{key}" but no metadata file when trying to delete it.')

            # A record was removed, so record the access and modification.
            await self._update_metadata(update_accessed_at=True, update_modified_at=True)

    @override
    async def iterate_keys(
        self,
        *,
        exclusive_start_key: str | None = None,
        limit: int | None = None,
    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
        # Check if the KVS directory exists
        if not self.path_to_kvs.exists():
            return

        # List and sort all files *inside* a brief lock, then release it immediately:
        async with self._lock:
            files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))

        metadata_suffix = f'.{METADATA_FILENAME}'
        count = 0

        for file_path in files:
            # Skip the main metadata file
            if file_path.name == METADATA_FILENAME:
                continue

            # Only process metadata files for records
            if not file_path.name.endswith(metadata_suffix):
                continue

            # Extract the base key name from the metadata filename
            key_name = self._decode_key(file_path.name[: -len(metadata_suffix)])

            # Apply exclusive_start_key filter if provided
            if exclusive_start_key is not None and key_name <= exclusive_start_key:
                continue

            # Try to read and parse the metadata file
            try:
                metadata_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')
            except FileNotFoundError:
                logger.warning(f'Metadata file disappeared for key "{key_name}", skipping it.')
                continue

            try:
                metadata_dict = json.loads(metadata_content)
            except json.JSONDecodeError:
                logger.warning(f'Failed to decode metadata file for key "{key_name}", skipping it.')
                continue

            try:
                record_metadata = KeyValueStoreRecordMetadata(**metadata_dict)
            except ValidationError:
                logger.warning(f'Invalid metadata schema for key "{key_name}", skipping it.')
                # Without this `continue` the loop would fall through and yield an unbound
                # name, or repeat the previous record.
                continue

            yield record_metadata

            count += 1
            if limit and count >= limit:
                break

        # Update accessed_at timestamp
        async with self._lock:
            await self._update_metadata(update_accessed_at=True)

    @override
    async def get_public_url(self, *, key: str) -> str:
        """Return a file:// URL for the given key.

        Args:
            key: The key to get the public URL for.

        Returns:
            A file:// URL pointing to the file on the local filesystem.
        """
        record_path = self.path_to_kvs / self._encode_key(key)
        absolute_path = record_path.absolute()
        return absolute_path.as_uri()

    @override
    async def record_exists(self, *, key: str) -> bool:
        """Check if a record with the given key exists in the key-value store.

        Args:
            key: The key to check for existence.

        Returns:
            True if a record with the given key exists, False otherwise.
        """
        # Update the metadata to record access
        async with self._lock:
            await self._update_metadata(update_accessed_at=True)

        record_path = self.path_to_kvs / self._encode_key(key)
        record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')

        # Both the value file and metadata file must exist for a record to be considered existing
        return record_path.exists() and record_metadata_filepath.exists()

    async def _update_metadata(
        self,
        *,
        update_accessed_at: bool = False,
        update_modified_at: bool = False,
    ) -> None:
        """Update the KVS metadata file with current information.

        Args:
            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.
            update_modified_at: If True, update the `modified_at` timestamp to the current time.
        """
        now = datetime.now(timezone.utc)

        if update_accessed_at:
            self._metadata.accessed_at = now
        if update_modified_at:
            self._metadata.modified_at = now

        # Ensure the parent directory for the metadata file exists.
        await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)

        # Dump the serialized metadata to the file.
        data = await json_dumps(self._metadata.model_dump())
        await atomic_write(self.path_to_metadata, data)

    def _encode_key(self, key: str) -> str:
        """Encode a key to make it safe for use in a file path."""
        return urllib.parse.quote(key, safe='')

    def _decode_key(self, encoded_key: str) -> str:
        """Decode a key that was encoded to make it safe for use in a file path."""
        return urllib.parse.unquote(encoded_key)
RequestQueueState(BaseModel): """State model for the `FileSystemRequestQueueClient`.""" sequence_counter: int = 0 """Counter for regular request ordering.""" forefront_sequence_counter: int = 0 """Counter for forefront request ordering.""" forefront_requests: dict[str, int] = {} """Mapping of forefront request unique keys to their sequence numbers.""" regular_requests: dict[str, int] = {} """Mapping of regular request unique keys to their sequence numbers.""" in_progress_requests: set[str] = set() """Set of request unique keys currently being processed.""" handled_requests: set[str] = set() """Set of request unique keys that have been handled.""" class FileSystemRequestQueueClient(RequestQueueClient): """A file system implementation of the request queue client. This client persists requests to the file system as individual JSON files, making it suitable for scenarios where data needs to survive process restarts. Each request is stored as a separate file in a directory structure following the pattern: ``` {STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json ``` The implementation uses `RecoverableState` to maintain ordering information, in-progress status, and request handling status. This allows for proper state recovery across process restarts without embedding metadata in individual request files. File system storage provides durability at the cost of slower I/O operations compared to memory only-based storage. This implementation is ideal for long-running crawlers where persistence is important and for situations where you need to resume crawling after process termination. 
""" _STORAGE_SUBDIR = 'request_queues' """The name of the subdirectory where request queues are stored.""" _STORAGE_SUBSUBDIR_DEFAULT = 'default' """The name of the subdirectory for the default request queue.""" _MAX_REQUESTS_IN_CACHE = 100_000 """Maximum number of requests to keep in cache for faster access.""" def __init__( self, *, metadata: RequestQueueMetadata, path_to_rq: Path, lock: asyncio.Lock, recoverable_state: RecoverableState[RequestQueueState], ) -> None: """Initialize a new instance. Preferably use the `FileSystemRequestQueueClient.open` class method to create a new instance. """ self._metadata = metadata self._path_to_rq = path_to_rq """The full path to the request queue directory.""" self._lock = lock """A lock to ensure that only one operation is performed at a time.""" self._request_cache = deque[Request]() """Cache for requests: forefront requests at the beginning, regular requests at the end.""" self._request_cache_needs_refresh = True """Flag indicating whether the cache needs to be refreshed from filesystem.""" self._is_empty_cache: bool | None = None """Cache for is_empty result: None means unknown, True/False is cached state.""" self._state = recoverable_state """Recoverable state to maintain request ordering, in-progress status, and handled status.""" @override async def get_metadata(self) -> RequestQueueMetadata: return self._metadata @property def path_to_rq(self) -> Path: """The full path to the request queue directory.""" return self._path_to_rq @property def path_to_metadata(self) -> Path: """The full path to the request queue metadata file.""" return self.path_to_rq / METADATA_FILENAME @classmethod async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState: async def kvs_factory() -> KeyValueStore: from crawlee.storage_clients import FileSystemStorageClient # noqa: PLC0415 avoid circular import from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import return await 
KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration) return RecoverableState[RequestQueueState]( default_state=RequestQueueState(), persist_state_key=f'__RQ_STATE_{id}', persist_state_kvs_factory=kvs_factory, persistence_enabled=True, logger=logger, ) @classmethod async def open( cls, *, id: str | None, name: str | None, alias: str | None, configuration: Configuration, ) -> Self: """Open or create a file system request queue client. This method attempts to open an existing request queue from the file system. If a queue with the specified ID or name exists, it loads the metadata and state from the stored files. If no existing queue is found, a new one is created. Args: id: The ID of the request queue to open. If provided, searches for existing queue by ID. name: The name of the request queue for named (global scope) storages. alias: The alias of the request queue for unnamed (run scope) storages. configuration: The configuration object containing storage directory settings. Returns: An instance for the opened or created storage client. Raises: ValueError: If a queue with the specified ID is not found, if metadata is invalid, or if both name and alias are provided. """ # Validate input parameters. raise_if_too_many_kwargs(id=id, name=name, alias=alias) rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR if not rq_base_path.exists(): await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True) # Open an existing RQ by its ID, raise an error if not found. 
if id: found = False for rq_dir in rq_base_path.iterdir(): if not rq_dir.is_dir(): continue path_to_metadata = rq_dir / METADATA_FILENAME if not path_to_metadata.exists(): continue try: file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8') try: file_content = json.load(file) metadata = RequestQueueMetadata(**file_content) if metadata.id == id: client = cls( metadata=metadata, path_to_rq=rq_base_path / rq_dir, lock=asyncio.Lock(), recoverable_state=await cls._create_recoverable_state( id=id, configuration=configuration ), ) await client._state.initialize() await client._discover_existing_requests() await client._update_metadata(update_accessed_at=True) found = True break finally: await asyncio.to_thread(file.close) except (json.JSONDecodeError, ValidationError): continue if not found: raise ValueError(f'Request queue with ID "{id}" not found') # Open an existing RQ by its name or alias, or create a new one if not found. else: rq_dir = Path(name) if name else Path(alias) if alias else Path('default') path_to_rq = rq_base_path / rq_dir path_to_metadata = path_to_rq / METADATA_FILENAME # If the RQ directory exists, reconstruct the client from the metadata file. if path_to_rq.exists() and path_to_metadata.exists(): file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8') try: file_content = json.load(file) finally: await asyncio.to_thread(file.close) try: metadata = RequestQueueMetadata(**file_content) except ValidationError as exc: raise ValueError(f'Invalid metadata file for request queue "{name or alias}"') from exc client = cls( metadata=metadata, path_to_rq=path_to_rq, lock=asyncio.Lock(), recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration), ) await client._state.initialize() await client._discover_existing_requests() await client._update_metadata(update_accessed_at=True) # Otherwise, create a new dataset client. 
else: now = datetime.now(timezone.utc) metadata = RequestQueueMetadata( id=crypto_random_object_id(), name=name, created_at=now, accessed_at=now, modified_at=now, had_multiple_clients=False, handled_request_count=0, pending_request_count=0, total_request_count=0, ) client = cls( metadata=metadata, path_to_rq=path_to_rq, lock=asyncio.Lock(), recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration), ) await client._state.initialize() await client._update_metadata() return client @override async def drop(self) -> None: async with self._lock: # Remove the RQ dir recursively if it exists. if self.path_to_rq.exists(): await asyncio.to_thread(shutil.rmtree, self.path_to_rq) # Clear recoverable state await self._state.reset() await self._state.teardown() self._request_cache.clear() self._request_cache_needs_refresh = True # Invalidate is_empty cache. self._is_empty_cache = None @override async def purge(self) -> None: async with self._lock: request_files = await self._get_request_files(self.path_to_rq) for file_path in request_files: await asyncio.to_thread(file_path.unlink, missing_ok=True) # Clear recoverable state await self._state.reset() self._request_cache.clear() self._request_cache_needs_refresh = True await self._update_metadata( update_modified_at=True, update_accessed_at=True, new_pending_request_count=0, new_handled_request_count=0, new_total_request_count=0, ) # Invalidate is_empty cache. 
@override
async def add_batch_of_requests(
    self,
    requests: Sequence[Request],
    *,
    forefront: bool = False,
) -> AddRequestsResponse:
    """Add a batch of requests to the queue, writing every new request to its own JSON file.

    The batch is handled in two passes while holding the client lock. The first pass reports requests that
    are already handled, in progress, or already queued (when `forefront` is not set) as already present.
    The second pass stores new requests and, with `forefront` set, moves already queued ones to the forefront.

    Args:
        requests: The requests to add.
        forefront: If True, new requests get a forefront sequence number and already queued requests are
            re-registered among the forefront requests.

    Returns:
        An `AddRequestsResponse` with the processed and unprocessed requests collected during the call.
    """
    async with self._lock:
        self._is_empty_cache = None
        # Counts are accumulated locally and written to the metadata once, after the whole batch.
        new_total_request_count = self._metadata.total_request_count
        new_pending_request_count = self._metadata.pending_request_count
        processed_requests = list[ProcessedRequest]()
        unprocessed_requests = list[UnprocessedRequest]()
        state = self._state.current_value

        # Built once, before the batch modifies the state, so it reflects the queue as it was on entry.
        all_requests = state.forefront_requests | state.regular_requests

        # Keyed by unique key, so duplicates inside one batch collapse to the last occurrence.
        requests_to_enqueue = {}

        # Determine which requests can be added or are modified.
        for request in requests:
            # Check if the request has already been handled.
            if request.unique_key in state.handled_requests:
                processed_requests.append(
                    ProcessedRequest(
                        unique_key=request.unique_key,
                        was_already_present=True,
                        was_already_handled=True,
                    )
                )
            # Check if the request is already in progress.
            # Or if the request is already in the queue and the `forefront` flag is not used, we do not change the
            # position of the request.
            elif (request.unique_key in state.in_progress_requests) or (
                request.unique_key in all_requests and not forefront
            ):
                processed_requests.append(
                    ProcessedRequest(
                        unique_key=request.unique_key,
                        was_already_present=True,
                        was_already_handled=False,
                    )
                )
            # These requests must either be added or update their position.
            else:
                requests_to_enqueue[request.unique_key] = request

        # Process each request in the batch.
        for request in requests_to_enqueue.values():
            # If the request is not already in the RQ, this is a new request.
            if request.unique_key not in all_requests:
                request_path = self._get_request_path(request.unique_key)
                # Add sequence number to ensure FIFO ordering using state.
                if forefront:
                    sequence_number = state.forefront_sequence_counter
                    state.forefront_sequence_counter += 1
                    state.forefront_requests[request.unique_key] = sequence_number
                else:
                    sequence_number = state.sequence_counter
                    state.sequence_counter += 1
                    state.regular_requests[request.unique_key] = sequence_number

                # Save the clean request without extra fields
                request_data = await json_dumps(request.model_dump())
                await atomic_write(request_path, request_data)

                # Update the metadata counts.
                new_total_request_count += 1
                new_pending_request_count += 1

                processed_requests.append(
                    ProcessedRequest(
                        unique_key=request.unique_key,
                        was_already_present=False,
                        was_already_handled=False,
                    )
                )

            # If the request already exists in the RQ and use the forefront flag to update its position
            elif forefront:
                # If the request is among `regular`, remove it from its current position.
                if request.unique_key in state.regular_requests:
                    state.regular_requests.pop(request.unique_key)

                # If the request is already in `forefront`, we just need to update its position.
                state.forefront_requests[request.unique_key] = state.forefront_sequence_counter
                state.forefront_sequence_counter += 1

                processed_requests.append(
                    ProcessedRequest(
                        unique_key=request.unique_key,
                        was_already_present=True,
                        was_already_handled=False,
                    )
                )

            # Defensive branch: given the first pass, everything in `requests_to_enqueue` is either new or
            # added with `forefront=True`, so this is not expected to be reached.
            else:
                logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
                unprocessed_requests.append(
                    UnprocessedRequest(
                        unique_key=request.unique_key,
                        url=request.url,
                        method=request.method,
                    )
                )

        await self._update_metadata(
            update_modified_at=True,
            update_accessed_at=True,
            new_total_request_count=new_total_request_count,
            new_pending_request_count=new_pending_request_count,
        )

        # Invalidate the cache if we added forefront requests.
        if forefront:
            self._request_cache_needs_refresh = True

        # Invalidate is_empty cache.
        self._is_empty_cache = None

        return AddRequestsResponse(
            processed_requests=processed_requests,
            unprocessed_requests=unprocessed_requests,
        )
@override
async def reclaim_request(
    self,
    request: Request,
    *,
    forefront: bool = False,
) -> ProcessedRequest | None:
    """Put an in-progress request back into the queue so that it can be fetched again.

    Args:
        request: The request to return to the queue. It must currently be in progress.
        forefront: If True, the request is re-registered among the forefront requests and placed at the left
            end of the request cache; otherwise among the regular requests and at the right end.

    Returns:
        A `ProcessedRequest` describing the reclaimed request, or `None` when the request is not in progress
        or its file does not exist.
    """
    async with self._lock:
        self._is_empty_cache = None
        queue_state = self._state.current_value
        key = request.unique_key

        # Only a request that is currently being processed can be reclaimed.
        if key not in queue_state.in_progress_requests:
            logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')
            return None

        file_path = self._get_request_path(key)
        file_is_present = await asyncio.to_thread(file_path.exists)
        if not file_is_present:
            logger.warning(f'Request file for {request.unique_key} does not exist, cannot reclaim.')
            return None

        # Give the request a fresh sequence number in the target group and make sure it is not
        # registered in the other group anymore.
        if forefront:
            queue_state.regular_requests.pop(key, None)
            queue_state.forefront_requests[key] = queue_state.forefront_sequence_counter
            queue_state.forefront_sequence_counter += 1
        else:
            queue_state.forefront_requests.pop(key, None)
            queue_state.regular_requests[key] = queue_state.sequence_counter
            queue_state.sequence_counter += 1

        # Persist the request as it is now.
        serialized_request = await json_dumps(request.model_dump())
        await atomic_write(file_path, serialized_request)

        # The request is no longer being processed.
        queue_state.in_progress_requests.discard(key)

        await self._update_metadata(update_modified_at=True, update_accessed_at=True)

        # Make the request available from the cache again, on the side matching its priority.
        put_back = self._request_cache.appendleft if forefront else self._request_cache.append
        put_back(request)

        return ProcessedRequest(
            unique_key=key,
            was_already_present=True,
            was_already_handled=False,
        )
async def _update_metadata(
    self,
    *,
    new_handled_request_count: int | None = None,
    new_pending_request_count: int | None = None,
    new_total_request_count: int | None = None,
    update_had_multiple_clients: bool = False,
    update_accessed_at: bool = False,
    update_modified_at: bool = False,
) -> None:
    """Apply the requested changes to the request queue metadata and write it to the metadata file.

    The metadata file is rewritten on every call, even when no argument requests a change.

    Args:
        new_handled_request_count: If provided, update the handled_request_count to this value.
        new_pending_request_count: If provided, update the pending_request_count to this value.
        new_total_request_count: If provided, update the total_request_count to this value.
        update_had_multiple_clients: If True, set had_multiple_clients to True.
        update_accessed_at: If True, update the `accessed_at` timestamp to the current time.
        update_modified_at: If True, update the `modified_at` timestamp to the current time.
    """
    # One timestamp is shared by both fields so they are identical when updated together.
    timestamp = datetime.now(timezone.utc)

    if update_accessed_at:
        self._metadata.accessed_at = timestamp
    if update_modified_at:
        self._metadata.modified_at = timestamp

    # Only counts that were explicitly passed in are overwritten.
    requested_counts = {
        'handled_request_count': new_handled_request_count,
        'pending_request_count': new_pending_request_count,
        'total_request_count': new_total_request_count,
    }
    for field_name, new_value in requested_counts.items():
        if new_value is not None:
            setattr(self._metadata, field_name, new_value)

    if update_had_multiple_clients:
        self._metadata.had_multiple_clients = True

    # Ensure the parent directory for the metadata file exists.
    await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)

    # Serialize the metadata and write it to the file.
    serialized_metadata = await json_dumps(self._metadata.model_dump())
    await atomic_write(self.path_to_metadata, serialized_metadata)
""" self._request_cache.clear() state = self._state.current_value forefront_requests = list[tuple[Request, int]]() # (request, sequence) regular_requests = list[tuple[Request, int]]() # (request, sequence) request_files = await self._get_request_files(self.path_to_rq) for request_file in request_files: request = await self._parse_request_file(request_file) if request is None: continue # Skip handled requests if request.unique_key in state.handled_requests: continue # Skip in-progress requests if request.unique_key in state.in_progress_requests: continue # Determine if request is forefront or regular based on state if request.unique_key in state.forefront_requests: sequence = state.forefront_requests[request.unique_key] forefront_requests.append((request, sequence)) elif request.unique_key in state.regular_requests: sequence = state.regular_requests[request.unique_key] regular_requests.append((request, sequence)) else: # Request not in state, skip it (might be orphaned) logger.warning(f'Request {request.unique_key} not found in state, skipping.') continue # Sort forefront requests by sequence (newest first for LIFO behavior). forefront_requests.sort(key=lambda item: item[1], reverse=True) # Sort regular requests by sequence (oldest first for FIFO behavior). regular_requests.sort(key=lambda item: item[1], reverse=False) # Add forefront requests to the beginning of the cache (left side). Since forefront_requests are sorted # by sequence (newest first), we need to add them in reverse order to maintain correct priority. for request, _ in reversed(forefront_requests): if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE: break self._request_cache.appendleft(request) # Add regular requests to the end of the cache (right side). 
@classmethod
async def _get_request_files(cls, path_to_rq: Path) -> list[Path]:
    """Get all request files from the RQ.

    The directory is created first when it is missing. Directory creation, the listing and the per-entry
    `is_file` checks all run together in one worker thread, so none of that filesystem work happens on the
    event loop.

    Args:
        path_to_rq: The path to the request queue directory.

    Returns:
        A list of paths to all request files, i.e. the `*.json` entries that are regular files and are not
        the metadata file.
    """

    def _list_request_files() -> list[Path]:
        # Create the requests directory if it doesn't exist.
        path_to_rq.mkdir(parents=True, exist_ok=True)

        # `Path.glob` returns a lazy iterator: it has to be consumed here, inside the worker thread,
        # otherwise the actual directory scan would be performed by whoever iterates it later.
        return [
            request_file
            for request_file in path_to_rq.glob('*.json')
            # Filter out metadata file and non-file entries.
            if request_file.is_file() and request_file.name != METADATA_FILENAME
        ]

    return await asyncio.to_thread(_list_request_files)
try: return Request.model_validate(file_content) except ValidationError as exc: logger.warning(f'Failed to validate request file {file_path}: {exc!s}') return None async def _discover_existing_requests(self) -> None: """Discover and load existing requests into the state when opening an existing request queue.""" request_files = await self._get_request_files(self.path_to_rq) state = self._state.current_value for request_file in request_files: request = await self._parse_request_file(request_file) if request is None: continue # Add request to state as regular request (assign sequence numbers) if request.unique_key not in state.regular_requests and request.unique_key not in state.forefront_requests: # Assign as regular request with current sequence counter state.regular_requests[request.unique_key] = state.sequence_counter state.sequence_counter += 1 # Check if request was already handled if request.handled_at is not None: state.handled_requests.add(request.unique_key) @staticmethod def _get_file_base_name_from_unique_key(unique_key: str) -> str: """Generate a deterministic file name for a unique_key. Args: unique_key: Unique key to be used to generate filename. Returns: A file name based on the unique_key. 
""" # hexdigest produces filenames compliant strings hashed_key = sha256(unique_key.encode('utf-8')).hexdigest() name_length = 15 # Truncate the key to the desired length return hashed_key[:name_length] ================================================ FILE: src/crawlee/storage_clients/_file_system/_storage_client.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING from typing_extensions import override from crawlee._utils.docs import docs_group from crawlee.configuration import Configuration from crawlee.storage_clients._base import StorageClient from ._dataset_client import FileSystemDatasetClient from ._key_value_store_client import FileSystemKeyValueStoreClient from ._request_queue_client import FileSystemRequestQueueClient if TYPE_CHECKING: from collections.abc import Hashable @docs_group('Storage clients') class FileSystemStorageClient(StorageClient): """File system implementation of the storage client. This storage client provides access to datasets, key-value stores, and request queues that persist data to the local file system. Each storage type is implemented with its own specific file system client that stores data in a structured directory hierarchy. Data is stored in JSON format in predictable file paths, making it easy to inspect and manipulate the stored data outside of the Crawlee application if needed. All data persists between program runs but is limited to access from the local machine where the files are stored. Warning: This storage client is not safe for concurrent access from multiple crawler processes. Use it only when running a single crawler process at a time. """ @override def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable: # Even different client instances should return same storage if the storage_dir is the same. 
@override
async def create_rq_client(
    self,
    *,
    id: str | None = None,
    name: str | None = None,
    alias: str | None = None,
    configuration: Configuration | None = None,
) -> FileSystemRequestQueueClient:
    """Open a file system request queue client and hand it to `_purge_if_needed` before returning it.

    Args:
        id: Forwarded to `FileSystemRequestQueueClient.open`.
        name: Forwarded to `FileSystemRequestQueueClient.open`.
        alias: Forwarded to `FileSystemRequestQueueClient.open`.
        configuration: Configuration to use; the global configuration is used when none is given.

    Returns:
        The opened request queue client.
    """
    effective_configuration = configuration or Configuration.get_global_configuration()
    rq_client = await FileSystemRequestQueueClient.open(
        id=id,
        name=name,
        alias=alias,
        configuration=effective_configuration,
    )
    await self._purge_if_needed(rq_client, effective_configuration)
    return rq_client
MemoryKeyValueStoreClient from ._request_queue_client import MemoryRequestQueueClient from ._storage_client import MemoryStorageClient __all__ = [ 'MemoryDatasetClient', 'MemoryKeyValueStoreClient', 'MemoryRequestQueueClient', 'MemoryStorageClient', ] ================================================ FILE: src/crawlee/storage_clients/_memory/_dataset_client.py ================================================ from __future__ import annotations from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING, Any from typing_extensions import Self, override from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata if TYPE_CHECKING: from collections.abc import AsyncIterator logger = getLogger(__name__) class MemoryDatasetClient(DatasetClient): """Memory implementation of the dataset client. This client stores dataset items in memory using Python lists and dictionaries. No data is persisted between process runs, meaning all stored data is lost when the program terminates. This implementation is primarily useful for testing, development, and short-lived crawler operations where persistent storage is not required. The memory implementation provides fast access to data but is limited by available memory and does not support data sharing across different processes. It supports all dataset operations including sorting, filtering, and pagination, but performs them entirely in memory. """ def __init__( self, *, metadata: DatasetMetadata, ) -> None: """Initialize a new instance. Preferably use the `MemoryDatasetClient.open` class method to create a new instance. """ self._metadata = metadata self._records = list[dict[str, Any]]() """List to hold dataset items. 
Each item is a dictionary representing a record.""" @override async def get_metadata(self) -> DatasetMetadata: return self._metadata @classmethod async def open( cls, *, id: str | None, name: str | None, alias: str | None, ) -> Self: """Open or create a new memory dataset client. This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory datasets don't check for existing datasets with the same name or ID since all data exists only in memory and is lost when the process terminates. Alias does not have any effect on the memory storage client implementation, because unnamed storages are supported by default, since data are not persisted. Args: id: The ID of the dataset. If not provided, a random ID will be generated. name: The name of the dataset for named (global scope) storages. alias: The alias of the dataset for unnamed (run scope) storages. Returns: An instance for the opened or created storage client. Raises: ValueError: If both name and alias are provided, or if neither id, name, nor alias is provided. """ # Validate input parameters. 
@override
async def get_data(
    self,
    *,
    offset: int = 0,
    limit: int | None = 999_999_999_999,
    clean: bool = False,
    desc: bool = False,
    fields: list[str] | None = None,
    omit: list[str] | None = None,
    unwind: list[str] | None = None,
    skip_empty: bool = False,
    skip_hidden: bool = False,
    flatten: list[str] | None = None,
    view: str | None = None,
) -> DatasetItemsListPage:
    """Return one page of the stored records.

    Args:
        offset: Index of the first item of the page.
        limit: Maximum number of items in the page; `None` means no limit.
        clean: Not supported by this client; a warning is logged when set.
        desc: If True, items are returned in reverse order.
        fields: Not supported by this client; a warning is logged when set.
        omit: Not supported by this client; a warning is logged when set.
        unwind: Not supported by this client; a warning is logged when set.
        skip_empty: If True, falsy (empty) items are removed before ordering and pagination are applied.
        skip_hidden: Not supported by this client; a warning is logged when set.
        flatten: Not supported by this client; a warning is logged when set.
        view: Not supported by this client; a warning is logged when set.

    Returns:
        The page of items. `total` is the number of all stored records, counted before `skip_empty`
        filtering. `limit` echoes the requested limit, or the number of records from `offset` to the end
        (never negative) when no limit was requested.
    """
    # Check for unsupported arguments and log a warning if found
    unsupported_args: dict[str, Any] = {
        'clean': clean,
        'fields': fields,
        'omit': omit,
        'unwind': unwind,
        'skip_hidden': skip_hidden,
        'flatten': flatten,
        'view': view,
    }
    unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}

    if unsupported:
        logger.warning(
            f'The arguments {list(unsupported.keys())} of get_data are not supported '
            f'by the {self.__class__.__name__} client.'
        )

    total = len(self._records)
    items = self._records.copy()

    # Apply skip_empty filter if requested
    if skip_empty:
        items = [item for item in items if item]

    # Apply sorting
    if desc:
        items = list(reversed(items))

    # Apply pagination
    sliced_items = items[offset : (offset + limit) if limit is not None else total]

    await self._update_metadata(update_accessed_at=True)

    # Only a missing limit falls back to the computed value. A plain `limit or ...` would also replace an
    # explicit `limit=0` (which yields an empty page) and could report a negative limit when the offset
    # lies past the end of the records.
    reported_limit = limit if limit is not None else max(total - offset, 0)

    return DatasetItemsListPage(
        count=len(sliced_items),
        offset=offset,
        limit=reported_limit,
        total=total,
        desc=desc,
        items=sliced_items,
    )
update_modified_at: If True, update the `modified_at` timestamp to the current time. """ now = datetime.now(timezone.utc) if update_accessed_at: self._metadata.accessed_at = now if update_modified_at: self._metadata.modified_at = now if new_item_count is not None: self._metadata.item_count = new_item_count async def _push_item(self, item: dict[str, Any]) -> None: """Push a single item to the dataset. Args: item: The data item to add to the dataset. """ self._records.append(item) ================================================ FILE: src/crawlee/storage_clients/_memory/_key_value_store_client.py ================================================ from __future__ import annotations import sys from datetime import datetime, timezone from typing import TYPE_CHECKING, Any from typing_extensions import Self, override from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.file import infer_mime_type from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata if TYPE_CHECKING: from collections.abc import AsyncIterator class MemoryKeyValueStoreClient(KeyValueStoreClient): """Memory implementation of the key-value store client. This client stores data in memory as Python dictionaries. No data is persisted between process runs, meaning all stored data is lost when the program terminates. This implementation is primarily useful for testing, development, and short-lived crawler operations where persistence is not required. The memory implementation provides fast access to data but is limited by available memory and does not support data sharing across different processes. """ def __init__( self, *, metadata: KeyValueStoreMetadata, ) -> None: """Initialize a new instance. Preferably use the `MemoryKeyValueStoreClient.open` class method to create a new instance. 
@classmethod
async def open(
    cls,
    *,
    id: str | None,
    name: str | None,
    alias: str | None,
) -> Self:
    """Create a fresh in-memory key-value store client.

    A new client with new metadata is built on every call; nothing is looked up by `id` or `name`. The
    `alias` is only passed to the argument validation and is not stored in the metadata.

    Args:
        id: The ID of the key-value store. If not provided, a random ID will be generated.
        name: The name of the key-value store for named (global scope) storages.
        alias: The alias of the key-value store for unnamed (run scope) storages.

    Returns:
        An instance for the created storage client.

    Raises:
        Whatever `raise_if_too_many_kwargs` raises for a rejected combination of `id`, `name` and `alias`.
    """
    # Validate input parameters.
    raise_if_too_many_kwargs(id=id, name=name, alias=alias)

    resolved_id = id or crypto_random_object_id()
    # All three timestamps of a new store start out identical.
    created = datetime.now(timezone.utc)

    return cls(
        metadata=KeyValueStoreMetadata(
            id=resolved_id,
            name=name,
            created_at=created,
            accessed_at=created,
            modified_at=created,
        )
    )
content_type=record.content_type, size=record.size, ) @override async def get_public_url(self, *, key: str) -> str: raise NotImplementedError('Public URLs are not supported for memory key-value stores.') @override async def record_exists(self, *, key: str) -> bool: await self._update_metadata(update_accessed_at=True) return key in self._records async def _update_metadata( self, *, update_accessed_at: bool = False, update_modified_at: bool = False, ) -> None: """Update the key-value store metadata with current information. Args: update_accessed_at: If True, update the `accessed_at` timestamp to the current time. update_modified_at: If True, update the `modified_at` timestamp to the current time. """ now = datetime.now(timezone.utc) if update_accessed_at: self._metadata.accessed_at = now if update_modified_at: self._metadata.modified_at = now ================================================ FILE: src/crawlee/storage_clients/_memory/_request_queue_client.py ================================================ from __future__ import annotations from collections import deque from contextlib import suppress from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING from typing_extensions import Self, override from crawlee import Request from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata if TYPE_CHECKING: from collections.abc import Sequence logger = getLogger(__name__) class MemoryRequestQueueClient(RequestQueueClient): """Memory implementation of the request queue client. No data is persisted between process runs, which means all requests are lost when the program terminates. 
def __init__(self, *, metadata: RequestQueueMetadata) -> None:
    """Set up the client with the given metadata and empty request collections.

    Prefer the `MemoryRequestQueueClient.open` class method over calling this constructor directly.
    """
    self._requests_by_unique_key = dict[str, Request]()
    """Maps a unique key to its request, for fast lookup by unique key."""

    self._in_progress_requests = dict[str, Request]()
    """Requests that have been fetched but neither marked as handled nor reclaimed yet."""

    self._handled_requests = dict[str, Request]()
    """Requests that have been processed and marked as handled."""

    self._pending_requests = deque[Request]()
    """Requests that have been added to the queue but not yet fetched for processing."""

    self._metadata = metadata
Raises: ValueError: If both name and alias are provided. """ # Validate input parameters. raise_if_too_many_kwargs(id=id, name=name, alias=alias) # Create a new queue queue_id = id or crypto_random_object_id() now = datetime.now(timezone.utc) metadata = RequestQueueMetadata( id=queue_id, name=name, created_at=now, accessed_at=now, modified_at=now, had_multiple_clients=False, handled_request_count=0, pending_request_count=0, total_request_count=0, ) return cls(metadata=metadata) @override async def drop(self) -> None: self._pending_requests.clear() self._handled_requests.clear() self._requests_by_unique_key.clear() self._in_progress_requests.clear() await self._update_metadata( update_modified_at=True, update_accessed_at=True, new_handled_request_count=0, new_pending_request_count=0, new_total_request_count=0, ) @override async def purge(self) -> None: self._pending_requests.clear() self._handled_requests.clear() self._requests_by_unique_key.clear() self._in_progress_requests.clear() await self._update_metadata( update_modified_at=True, update_accessed_at=True, new_pending_request_count=0, new_handled_request_count=0, new_total_request_count=0, ) @override async def add_batch_of_requests( self, requests: Sequence[Request], *, forefront: bool = False, ) -> AddRequestsResponse: processed_requests = [] for request in requests: # Check if the request is already in the queue by unique_key. existing_request = self._requests_by_unique_key.get(request.unique_key) was_already_present = existing_request is not None was_already_handled = was_already_present and existing_request and existing_request.handled_at is not None is_in_progress = request.unique_key in self._in_progress_requests # If the request is already in the queue and handled, don't add it again. 
if was_already_handled: processed_requests.append( ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=True, ) ) continue # If the request is already in progress, don't add it again. if is_in_progress: processed_requests.append( ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=False, ) ) continue # If the request is already in the queue but not handled, update it. if was_already_present and existing_request: # Update indexes. self._requests_by_unique_key[request.unique_key] = request # We only update `forefront` by updating its position by shifting it to the left. if forefront: # Update the existing request with any new data and # remove old request from pending queue if it's there. with suppress(ValueError): self._pending_requests.remove(existing_request) # Add updated request back to queue. self._pending_requests.appendleft(request) processed_requests.append( ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=False, ) ) # Add the new request to the queue. else: if forefront: self._pending_requests.appendleft(request) else: self._pending_requests.append(request) # Update indexes. self._requests_by_unique_key[request.unique_key] = request await self._update_metadata( new_total_request_count=self._metadata.total_request_count + 1, new_pending_request_count=self._metadata.pending_request_count + 1, ) processed_requests.append( ProcessedRequest( unique_key=request.unique_key, was_already_present=was_already_present, was_already_handled=False, ) ) await self._update_metadata(update_accessed_at=True, update_modified_at=True) return AddRequestsResponse( processed_requests=processed_requests, unprocessed_requests=[], ) @override async def fetch_next_request(self) -> Request | None: while self._pending_requests: request = self._pending_requests.popleft() # Skip if already handled (shouldn't happen, but safety check). 
if request.was_already_handled: continue # Skip if already in progress (shouldn't happen, but safety check). if request.unique_key in self._in_progress_requests: continue # Mark as in progress. self._in_progress_requests[request.unique_key] = request return request return None @override async def get_request(self, unique_key: str) -> Request | None: await self._update_metadata(update_accessed_at=True) return self._requests_by_unique_key.get(unique_key) @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: # Check if the request is in progress. if request.unique_key not in self._in_progress_requests: return None # Set handled_at timestamp if not already set. if not request.was_already_handled: request.handled_at = datetime.now(timezone.utc) # Move request to handled storage. self._handled_requests[request.unique_key] = request # Update index (keep the request in indexes for get_request to work). self._requests_by_unique_key[request.unique_key] = request # Remove from in-progress. del self._in_progress_requests[request.unique_key] # Update metadata. await self._update_metadata( new_handled_request_count=self._metadata.handled_request_count + 1, new_pending_request_count=self._metadata.pending_request_count - 1, update_modified_at=True, ) return ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=True, ) @override async def reclaim_request( self, request: Request, *, forefront: bool = False, ) -> ProcessedRequest | None: # Check if the request is in progress. if request.unique_key not in self._in_progress_requests: return None # Remove from in-progress. del self._in_progress_requests[request.unique_key] # Add request back to pending queue. if forefront: self._pending_requests.appendleft(request) else: self._pending_requests.append(request) # Update metadata timestamps. 
await self._update_metadata(update_modified_at=True) return ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=False, ) @override async def is_empty(self) -> bool: """Check if the queue is empty. Returns: True if the queue is empty, False otherwise. """ await self._update_metadata(update_accessed_at=True) # Queue is empty if there are no pending requests and no requests in progress. return len(self._pending_requests) == 0 and len(self._in_progress_requests) == 0 async def _update_metadata( self, *, update_accessed_at: bool = False, update_modified_at: bool = False, new_handled_request_count: int | None = None, new_pending_request_count: int | None = None, new_total_request_count: int | None = None, ) -> None: """Update the request queue metadata with current information. Args: update_accessed_at: If True, update the `accessed_at` timestamp to the current time. update_modified_at: If True, update the `modified_at` timestamp to the current time. new_handled_request_count: If provided, set the handled request count to this value. new_pending_request_count: If provided, set the pending request count to this value. new_total_request_count: If provided, set the total request count to this value. 
@docs_group('Storage clients')
class MemoryStorageClient(StorageClient):
    """Storage client that keeps everything in memory.

    Datasets, key-value stores, and request queues created by this client hold their data in plain
    Python data structures (lists and dictionaries). Nothing is written to disk, so all stored data
    is lost when the program terminates and nothing is carried over between process runs.

    Access is fast, but capacity is bounded by available memory and the data cannot be shared with
    other processes. This makes the client a good fit for tests, development environments, and
    short-lived crawler runs where persistence is not required.
    """

    @override
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> MemoryDatasetClient:
        """Open an in-memory dataset client and hand it to `_purge_if_needed`.

        The global configuration is used when no `configuration` is supplied.
        """
        effective_configuration = configuration or Configuration.get_global_configuration()
        dataset_client = await MemoryDatasetClient.open(id=id, name=name, alias=alias)
        await self._purge_if_needed(dataset_client, effective_configuration)
        return dataset_client

    @override
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> MemoryKeyValueStoreClient:
        """Open an in-memory key-value store client and hand it to `_purge_if_needed`.

        The global configuration is used when no `configuration` is supplied.
        """
        effective_configuration = configuration or Configuration.get_global_configuration()
        kvs_client = await MemoryKeyValueStoreClient.open(id=id, name=name, alias=alias)
        await self._purge_if_needed(kvs_client, effective_configuration)
        return kvs_client

    @override
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> MemoryRequestQueueClient:
        """Open an in-memory request queue client and hand it to `_purge_if_needed`.

        The global configuration is used when no `configuration` is supplied.
        """
        effective_configuration = configuration or Configuration.get_global_configuration()
        rq_client = await MemoryRequestQueueClient.open(id=id, name=name, alias=alias)
        await self._purge_if_needed(rq_client, effective_configuration)
        return rq_client
================================================ from __future__ import annotations import asyncio from contextlib import asynccontextmanager from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, overload from crawlee._utils.crypto import crypto_random_object_id from ._utils import await_redis_response, read_lua_script if TYPE_CHECKING: from collections.abc import AsyncIterator from redis.asyncio import Redis from redis.asyncio.client import Pipeline from redis.commands.core import AsyncScript from typing_extensions import NotRequired, Self from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata logger = getLogger(__name__) class MetadataUpdateParams(TypedDict, total=False): """Parameters for updating metadata.""" update_accessed_at: NotRequired[bool] update_modified_at: NotRequired[bool] class RedisClientMixin: """Mixin class for Redis clients. This mixin provides common Redis operations and basic methods for Redis storage clients. """ _DEFAULT_NAME = 'default' """Default storage name in key prefix when none provided.""" _MAIN_KEY: ClassVar[str] """Main Redis key prefix for this storage type.""" _CLIENT_TYPE: ClassVar[str] """Human-readable client type for error messages.""" def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: self._storage_name = storage_name self._storage_id = storage_id self._redis = redis self._scripts_loaded = False @property def redis(self) -> Redis: """Return the Redis client instance.""" return self._redis @property def metadata_key(self) -> str: """Return the Redis key for the metadata of this storage.""" return f'{self._MAIN_KEY}:{self._storage_name}:metadata' @classmethod async def _get_metadata_by_name(cls, name: str, redis: Redis, *, with_wait: bool = False) -> dict | None: """Retrieve metadata by storage name. Args: name: The name of the storage. redis: The Redis client instance. 
@classmethod
async def _open(
    cls,
    *,
    id: str | None,
    name: str | None,
    alias: str | None,
    metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
    redis: Redis,
    extra_metadata_fields: dict[str, Any],
    instance_kwargs: dict[str, Any],
) -> Self:
    """Open or create a new Redis storage client.

    Args:
        id: The ID of the storage. If not provided, a random ID will be generated.
        name: The name of the storage for named (global scope) storages.
        alias: The alias of the storage for unnamed (run scope) storages.
        redis: Redis client instance.
        metadata_model: Pydantic model for metadata validation.
        extra_metadata_fields: Storage-specific metadata fields.
        instance_kwargs: Additional arguments for the client constructor.

    Returns:
        An instance for the opened or created storage client.

    Raises:
        ValueError: If `id` is provided but no storage with that ID exists.
        RuntimeError: If another client is creating the same storage concurrently and its metadata
            does not become available while waiting for it.
    """
    internal_name = name or alias or cls._DEFAULT_NAME
    storage_id: str | None = None

    # Determine if storage exists by ID or name
    if id:
        storage_name = await cls._get_metadata_name_by_id(id=id, redis=redis)
        storage_id = id
        if storage_name is None:
            raise ValueError(f'{cls._CLIENT_TYPE} with ID "{id}" does not exist.')
    else:
        metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis)
        storage_name = internal_name if metadata_data is not None else None
        storage_id = metadata_data['id'] if metadata_data is not None else None

    # If both storage_name and storage_id are found, open existing storage
    if storage_name and storage_id:
        client = cls(storage_name=storage_name, storage_id=storage_id, redis=redis, **instance_kwargs)
        async with client._get_pipeline() as pipe:
            await client._update_metadata(pipe, update_accessed_at=True)
    # Otherwise, create a new storage
    else:
        now = datetime.now(timezone.utc)
        metadata = metadata_model(
            id=crypto_random_object_id(),
            name=name,
            created_at=now,
            accessed_at=now,
            modified_at=now,
            **extra_metadata_fields,
        )
        client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis, **instance_kwargs)
        created = await client._create_metadata_and_storage(internal_name, metadata.model_dump())

        # The client was probably not created due to a race condition. Let's try to open it using the name.
        if not created:
            metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis, with_wait=True)
            if metadata_data is None:
                raise RuntimeError(
                    f'{cls._CLIENT_TYPE} with name "{internal_name}" is being created by another client, '
                    'but its metadata did not become available.'
                )
            # Use the ID of the storage that actually exists. The locally generated `metadata.id` lost
            # the race and was never written to Redis.
            client = cls(
                storage_name=internal_name,
                storage_id=metadata_data['id'],
                redis=redis,
                **instance_kwargs,
            )

    # Ensure Lua scripts are loaded
    await client._ensure_scripts_loaded()
    return client
async with self._get_pipeline() as pipe: await await_redis_response(pipe.hsetnx(index_id_to_name, metadata['id'], storage_name)) await await_redis_response(pipe.json().set(self.metadata_key, '$', metadata)) await await_redis_response(pipe.lpush(f'{self._MAIN_KEY}:{storage_name}:created_signal', 1)) await self._create_storage(pipe) return True async def _drop(self, extra_keys: list[str]) -> None: async with self._get_pipeline() as pipe: await pipe.delete(self.metadata_key) await pipe.delete(f'{self._MAIN_KEY}:id_to_name', self._storage_id) await pipe.delete(f'{self._MAIN_KEY}:name_to_id', self._storage_name) await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:created_signal') for key in extra_keys: await pipe.delete(key) async def _purge(self, extra_keys: list[str], metadata_kwargs: MetadataUpdateParams) -> None: async with self._get_pipeline() as pipe: for key in extra_keys: await pipe.delete(key) await self._update_metadata(pipe, **metadata_kwargs) await self._create_storage(pipe) @overload async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ... @overload async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ... @overload async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ... 
async def _get_metadata( self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata] ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata: """Retrieve client metadata.""" metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis) if metadata_dict is None: raise ValueError(f'{self._CLIENT_TYPE} with name "{self._storage_name}" does not exist.') async with self._get_pipeline() as pipe: await self._update_metadata(pipe, update_accessed_at=True) return metadata_model.model_validate(metadata_dict) async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> None: """Pipeline operations storage-specific metadata updates. Must be implemented by concrete classes. Args: pipeline: The Redis pipeline to use for the update. **kwargs: Storage-specific update parameters. """ async def _update_metadata( self, pipeline: Pipeline, *, update_accessed_at: bool = False, update_modified_at: bool = False, **kwargs: Any, ) -> None: """Update storage metadata combining common and specific fields. Args: pipeline: The Redis pipeline to use for the update. update_accessed_at: Whether to update accessed_at timestamp. update_modified_at: Whether to update modified_at timestamp. **kwargs: Additional arguments for _specific_update_metadata. 
class _DatasetMetadataUpdateParams(MetadataUpdateParams):
    """Parameters for updating dataset metadata.

    Adds the dataset item counter fields on top of the keys inherited from `MetadataUpdateParams`.
    """

    # Absolute value to store as the dataset item count.
    new_item_count: NotRequired[int]
    # Amount by which the stored item count is incremented.
    delta_item_count: NotRequired[int]
All operations provide atomic consistency through Redis transactions and pipeline operations. """ _DEFAULT_NAME = 'default' """Default Dataset name key prefix when none provided.""" _MAIN_KEY = 'datasets' """Main Redis key prefix for Dataset.""" _CLIENT_TYPE = 'Dataset' """Human-readable client type for error messages.""" def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: """Initialize a new instance. Preferably use the `RedisDatasetClient.open` class method to create a new instance. Args: storage_name: Internal storage name used for Redis keys. storage_id: Unique identifier for the dataset. redis: Redis client instance. """ super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) @property def _items_key(self) -> str: """Return the Redis key for the items of this dataset.""" return f'{self._MAIN_KEY}:{self._storage_name}:items' @classmethod async def open( cls, *, id: str | None, name: str | None, alias: str | None, redis: Redis, ) -> RedisDatasetClient: """Open or create a new Redis dataset client. This method attempts to open an existing dataset from the Redis database. If a dataset with the specified ID or name exists, it loads the metadata from the database. If no existing store is found, a new one is created. Args: id: The ID of the dataset. If not provided, a random ID will be generated. name: The name of the dataset for named (global scope) storages. alias: The alias of the dataset for unnamed (run scope) storages. redis: Redis client instance. Returns: An instance for the opened or created storage client. 
@override
async def get_data(
    self,
    *,
    offset: int = 0,
    limit: int | None = 999_999_999_999,
    clean: bool = False,
    desc: bool = False,
    fields: list[str] | None = None,
    omit: list[str] | None = None,
    unwind: list[str] | None = None,
    skip_empty: bool = False,
    skip_hidden: bool = False,
    flatten: list[str] | None = None,
    view: str | None = None,
) -> DatasetItemsListPage:
    """Return one page of dataset items.

    Pagination is done by Redis through a JSONPath slice over the items array. Only `offset`,
    `limit`, `desc` and `skip_empty` are honoured; the other filtering arguments are not
    supported by this client and only produce a warning.

    Args:
        offset: Number of items to skip from the start (or from the end when `desc` is set).
        limit: Maximum number of items to return, or `None` for no limit.
        clean: Not supported by this client.
        desc: Whether to return the items in reverse (newest first) order.
        fields: Not supported by this client.
        omit: Not supported by this client.
        unwind: Not supported by this client.
        skip_empty: Whether to drop empty items from the returned page.
        skip_hidden: Not supported by this client.
        flatten: Not supported by this client.
        view: Not supported by this client.

    Returns:
        The requested page of items together with pagination information.
    """
    # Check for unsupported arguments and log a warning if found
    unsupported_args: dict[str, Any] = {
        'clean': clean,
        'fields': fields,
        'omit': omit,
        'unwind': unwind,
        'skip_hidden': skip_hidden,
        'flatten': flatten,
        'view': view,
    }
    unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}

    if unsupported:
        logger.warning(
            f'The arguments {list(unsupported.keys())} of get_data are not supported '
            f'by the {self.__class__.__name__} client.'
        )

    metadata = await self.get_metadata()
    total = metadata.item_count

    if limit == 0:
        # Nothing was requested. A descending slice built from a zero limit ('$[-0:]') would
        # select the whole array instead of nothing, so the query is skipped entirely.
        data: Any = []
    else:
        if desc:
            # Descending pages are cut from the tail of the array and reversed afterwards.
            start = '' if limit is None else f'-{offset + limit}'
            # A zero offset must leave the end bound open: '-0' is index 0 and selects nothing.
            end = f'-{offset}' if offset else ''
        else:
            start = str(offset)
            end = '' if limit is None else str(offset + limit)

        json_path = f'$[{start}:{end}]' if (start or end) else '$[*]'

        data = await await_redis_response(self._redis.json().get(self._items_key, json_path))

        if data is None:
            data = []

    data = [item for item in data if isinstance(item, dict)]

    if skip_empty:
        data = [item for item in data if item]

    if desc:
        data = list(reversed(data))

    async with self._get_pipeline() as pipe:
        await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))

    return DatasetItemsListPage(
        count=len(data),
        offset=offset,
        limit=limit or (total - offset),
        total=total,
        desc=desc,
        items=data,
    )
""" # Log warnings for unsupported arguments unsupported_args: dict[str, Any] = { 'clean': clean, 'fields': fields, 'omit': omit, 'unwind': unwind, 'skip_hidden': skip_hidden, } unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)} if unsupported: logger.warning( f'The arguments {list(unsupported.keys())} of iterate_items are not supported ' f'by the {self.__class__.__name__} client.' ) metadata = await self.get_metadata() total_items = metadata.item_count # Calculate actual range based on parameters start_idx = offset end_idx = min(total_items, offset + limit) if limit is not None else total_items # Update accessed_at timestamp async with self._get_pipeline() as pipe: await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True)) # Process items in batches for better network efficiency batch_size = 100 for batch_start in range(start_idx, end_idx, batch_size): batch_end = min(batch_start + batch_size, end_idx) # Build JsonPath for batch slice if desc: # For descending order, we need to reverse the slice calculation desc_batch_start = total_items - batch_end desc_batch_end = total_items - batch_start json_path = f'$[{desc_batch_start}:{desc_batch_end}]' else: json_path = f'$[{batch_start}:{batch_end}]' # Get batch of items batch_items = await await_redis_response(self._redis.json().get(self._items_key, json_path)) # Handle case where batch_items might be None or not a list if batch_items is None: continue # Reverse batch if desc order (since we got items in normal order but need desc) items_iter = reversed(batch_items) if desc else iter(batch_items) # Yield items from batch for item in items_iter: # Apply skip_empty filter if skip_empty and not item: continue yield cast('dict[str, Any]', item) async with self._get_pipeline() as pipe: await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True)) @override async def _create_storage(self, pipeline: Pipeline) -> None: """Create the 
@override
async def _specific_update_metadata(
    self,
    pipeline: Pipeline,
    *,
    new_item_count: int | None = None,
    delta_item_count: int | None = None,
    **_kwargs: Any,
) -> None:
    """Queue the dataset-specific metadata changes on the given pipeline.

    When both counters are supplied, the absolute `new_item_count` wins and the delta is ignored.

    Args:
        pipeline: The Redis pipeline to use for the update.
        new_item_count: If provided, update the item count to this value.
        delta_item_count: If provided, increment the item count by this value.
    """
    item_count_path = '$.item_count'

    if new_item_count is not None:
        # Overwrite the counter; `xx=True` only touches an already existing path.
        set_command = pipeline.json().set(self.metadata_key, item_count_path, new_item_count, nx=False, xx=True)
        await await_redis_response(set_command)
        return

    if delta_item_count is not None:
        increment_command = pipeline.json().numincrby(self.metadata_key, item_count_path, delta_item_count)
        await await_redis_response(increment_command)
The key-value store data is stored in Redis using the following key pattern: - `key_value_stores:{name}:items` - Redis hash containing key-value pairs (values stored as binary data). - `key_value_stores:{name}:metadata_items` - Redis hash containing metadata for each key. - `key_value_stores:{name}:metadata` - Redis JSON object containing store metadata. Values are serialized based on their type: JSON objects are stored as UTF-8 encoded JSON strings, text values as UTF-8 encoded strings, and binary data as-is. The implementation automatically handles content type detection and maintains metadata about each record including size and MIME type information. All operations are atomic through Redis hash operations and pipeline transactions. The client supports concurrent access through Redis's built-in atomic operations for hash fields. """ _DEFAULT_NAME = 'default' """Default Key-Value Store name key prefix when none provided.""" _MAIN_KEY = 'key_value_stores' """Main Redis key prefix for Key-Value Store.""" _CLIENT_TYPE = 'Key-value store' """Human-readable client type for error messages.""" def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: """Initialize a new instance. Preferably use the `RedisKeyValueStoreClient.open` class method to create a new instance. """ super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) @property def _items_key(self) -> str: """Return the Redis key for the items of KVS.""" return f'{self._MAIN_KEY}:{self._storage_name}:items' @property def _metadata_items_key(self) -> str: """Return the Redis key for the items metadata of KVS.""" return f'{self._MAIN_KEY}:{self._storage_name}:metadata_items' @classmethod async def open( cls, *, id: str | None, name: str | None, alias: str | None, redis: Redis, ) -> RedisKeyValueStoreClient: """Open or create a new Redis key-value store client. This method attempts to open an existing key-value store from the Redis database. 
If a store with the specified ID or name exists, it loads the metadata from the database. If no existing store is found, a new one is created. Args: id: The ID of the key-value store. If not provided, a random ID will be generated. name: The name of the key-value store for named (global scope) storages. alias: The alias of the key-value store for unnamed (run scope) storages. redis: Redis client instance. Returns: An instance for the opened or created storage client. """ return await cls._open( id=id, name=name, alias=alias, redis=redis, metadata_model=KeyValueStoreMetadata, extra_metadata_fields={}, instance_kwargs={}, ) @override async def get_metadata(self) -> KeyValueStoreMetadata: return await self._get_metadata(KeyValueStoreMetadata) @override async def drop(self) -> None: await self._drop(extra_keys=[self._items_key, self._metadata_items_key]) @override async def purge(self) -> None: await self._purge( extra_keys=[self._items_key, self._metadata_items_key], metadata_kwargs=MetadataUpdateParams(update_accessed_at=True, update_modified_at=True), ) @override async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None: # Special handling for None values if value is None: content_type = 'application/x-none' # Special content type to identify None values value_bytes = b'' else: content_type = content_type or infer_mime_type(value) # Serialize the value to bytes. if 'application/json' in content_type: value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8') elif isinstance(value, str): value_bytes = value.encode('utf-8') elif isinstance(value, (bytes, bytearray)): value_bytes = value else: # Fallback: attempt to convert to string and encode. 
value_bytes = str(value).encode('utf-8') size = len(value_bytes) item_metadata = KeyValueStoreRecordMetadata( key=key, content_type=content_type, size=size, ) async with self._get_pipeline() as pipe: # redis-py typing issue await await_redis_response(pipe.hset(self._items_key, key, value_bytes)) # ty: ignore[invalid-argument-type] await await_redis_response( pipe.hset( self._metadata_items_key, key, item_metadata.model_dump_json(), ) ) await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True)) @override async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: serialized_metadata_item = await await_redis_response(self._redis.hget(self._metadata_items_key, key)) async with self._get_pipeline() as pipe: await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True)) if not isinstance(serialized_metadata_item, (str, bytes, bytearray)): logger.warning(f'Metadata for key "{key}" is missing or invalid.') return None metadata_item = KeyValueStoreRecordMetadata.model_validate_json(serialized_metadata_item) # Handle None values if metadata_item.content_type == 'application/x-none': return KeyValueStoreRecord(value=None, **metadata_item.model_dump()) # Query the record by key # redis-py typing issue value_bytes: bytes | None = await await_redis_response(self._redis.hget(self._items_key, key)) # ty: ignore[invalid-assignment] if value_bytes is None: logger.warning(f'Value for key "{key}" is missing.') return None # Handle JSON values if 'application/json' in metadata_item.content_type: try: value = json.loads(value_bytes.decode('utf-8')) except (json.JSONDecodeError, UnicodeDecodeError): logger.warning(f'Failed to decode JSON value for key "{key}"') return None # Handle text values elif metadata_item.content_type.startswith('text/'): try: value = value_bytes.decode('utf-8') except UnicodeDecodeError: logger.warning(f'Failed to decode text value for key "{key}"') return None # Handle binary values 
else: value = value_bytes return KeyValueStoreRecord(value=value, **metadata_item.model_dump()) @override async def delete_value(self, *, key: str) -> None: async with self._get_pipeline() as pipe: await await_redis_response(pipe.hdel(self._items_key, key)) await await_redis_response(pipe.hdel(self._metadata_items_key, key)) await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True)) @override async def iterate_keys( self, *, exclusive_start_key: str | None = None, limit: int | None = None, ) -> AsyncIterator[KeyValueStoreRecordMetadata]: items_data = await await_redis_response(self._redis.hgetall(self._metadata_items_key)) if not items_data: return # No items to iterate over if not isinstance(items_data, dict): raise TypeError('The items data was received in an incorrect format.') # Get all keys, sorted alphabetically keys = sorted(items_data.keys()) # Apply exclusive_start_key filter if provided if exclusive_start_key is not None: bytes_exclusive_start_key = exclusive_start_key.encode() keys = [k for k in keys if k > bytes_exclusive_start_key] # Apply limit if provided if limit is not None: keys = keys[:limit] # Yield metadata for each key for key in keys: record = items_data[key] if not isinstance(record, (str, bytes)): raise TypeError(f'Expected str or bytes, got {type(record)}') yield KeyValueStoreRecordMetadata.model_validate_json(record) async with self._get_pipeline() as pipe: await self._update_metadata( pipe, **MetadataUpdateParams(update_accessed_at=True), ) @override async def get_public_url(self, *, key: str) -> str: raise NotImplementedError('Public URLs are not supported for memory key-value stores.') @override async def record_exists(self, *, key: str) -> bool: async with self._get_pipeline(with_execute=False) as pipe: await await_redis_response(pipe.hexists(self._items_key, key)) await self._update_metadata( pipe, **MetadataUpdateParams(update_accessed_at=True), ) results = await pipe.execute() return 
bool(results[0]) ================================================ FILE: src/crawlee/storage_clients/_redis/_request_queue_client.py ================================================ from __future__ import annotations import json from collections import deque from datetime import datetime, timedelta, timezone from logging import getLogger from typing import TYPE_CHECKING, Any, Literal from typing_extensions import NotRequired, override from crawlee import Request from crawlee._utils.crypto import crypto_random_object_id from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata from ._client_mixin import MetadataUpdateParams, RedisClientMixin from ._utils import await_redis_response if TYPE_CHECKING: from collections.abc import Sequence from redis.asyncio import Redis from redis.asyncio.client import Pipeline from redis.commands.core import AsyncScript logger = getLogger(__name__) class _QueueMetadataUpdateParams(MetadataUpdateParams): """Parameters for updating queue metadata.""" new_handled_request_count: NotRequired[int] new_pending_request_count: NotRequired[int] new_total_request_count: NotRequired[int] delta_handled_request_count: NotRequired[int] delta_pending_request_count: NotRequired[int] delta_total_request_count: NotRequired[int] recalculate: NotRequired[bool] update_had_multiple_clients: NotRequired[bool] class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin): """Redis implementation of the request queue client. This client persists requests to Redis using multiple data structures for efficient queue operations, deduplication, and concurrent access safety. Requests are stored with FIFO ordering and support both regular and forefront (high-priority) insertion modes. The implementation uses Bloom filters for efficient request deduplication and Redis lists for queue operations. 
Request blocking and client coordination is handled through Redis hashes with timestamp-based expiration for stale request recovery. The request queue data is stored in Redis using the following key patterns: - `request_queues:{name}:queue` - Redis list for FIFO request ordering - `request_queues:{name}:data` - Redis hash storing serialized Request objects by unique_key - `request_queues:{name}:in_progress` - Redis hash tracking requests currently being processed - `request_queues:{name}:added_bloom_filter` - Bloom filter for added request deduplication (`bloom` dedup_strategy) - `request_queues:{name}:handled_bloom_filter` - Bloom filter for completed request tracking (`bloom` dedup_strategy) - `request_queues:{name}:pending_set` - Redis set for added request deduplication (`default` dedup_strategy) - `request_queues:{name}:handled_set` - Redis set for completed request tracking (`default` dedup_strategy) - `request_queues:{name}:metadata` - Redis JSON object containing queue metadata Requests are serialized to JSON for storage and maintain proper FIFO ordering through Redis list operations. The implementation provides concurrent access safety through atomic Lua scripts, Bloom filter operations, and Redis's built-in atomicity guarantees for individual operations. 
""" _DEFAULT_NAME = 'default' """Default Request Queue name key prefix when none provided.""" _MAIN_KEY = 'request_queues' """Main Redis key prefix for Request Queue.""" _CLIENT_TYPE = 'Request queue' """Human-readable client type for error messages.""" _MAX_BATCH_FETCH_SIZE = 10 """Maximum number of requests to fetch in a single batch operation.""" _BLOCK_REQUEST_TIME = 300_000 # milliseconds """Time in milliseconds to block a fetched request for other clients before it can be autoreclaimed.""" _RECLAIM_INTERVAL = timedelta(seconds=30) """Interval to check for stale requests to reclaim.""" def __init__( self, storage_name: str, storage_id: str, redis: Redis, dedup_strategy: Literal['default', 'bloom'] = 'default', bloom_error_rate: float = 1e-7, ) -> None: """Initialize a new instance. Preferably use the `RedisRequestQueueClient.open` class method to create a new instance. """ super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) self._dedup_strategy = dedup_strategy """Deduplication strategy for the queue.""" self._bloom_error_rate = bloom_error_rate """Desired false positive rate for Bloom filters.""" self._pending_fetch_cache: deque[Request] = deque() """Cache for requests: ordered by sequence number.""" self.client_key = crypto_random_object_id(length=32)[:32] """Unique identifier for this client instance.""" # Lua scripts for atomic operations self._fetch_script: AsyncScript | None = None self._reclaim_stale_script: AsyncScript | None = None self._add_requests_script: AsyncScript | None = None self._next_reclaim_stale: None | datetime = None @property def _added_filter_key(self) -> str: """Return the Redis key for the added requests Bloom filter.""" if self._dedup_strategy != 'bloom': raise RuntimeError('The added requests filter is only available with the bloom deduplication strategy.') return f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter' @property def _handled_filter_key(self) -> str: """Return the Redis key for the 
handled requests Bloom filter.""" if self._dedup_strategy != 'bloom': raise RuntimeError('The handled requests filter is only available with the bloom deduplication strategy.') return f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter' @property def _pending_set_key(self) -> str: """Return the Redis key for the pending requests set.""" if self._dedup_strategy != 'default': raise RuntimeError('The pending requests set is only available with the default deduplication strategy.') return f'{self._MAIN_KEY}:{self._storage_name}:pending_set' @property def _handled_set_key(self) -> str: """Return the Redis key for the handled requests set.""" if self._dedup_strategy != 'default': raise RuntimeError('The handled requests set is only available with the default deduplication strategy.') return f'{self._MAIN_KEY}:{self._storage_name}:handled_set' @property def _queue_key(self) -> str: """Return the Redis key for the request queue.""" return f'{self._MAIN_KEY}:{self._storage_name}:queue' @property def _data_key(self) -> str: """Return the Redis key for the request data hash.""" return f'{self._MAIN_KEY}:{self._storage_name}:data' @property def _in_progress_key(self) -> str: """Return the Redis key for the in-progress requests hash.""" return f'{self._MAIN_KEY}:{self._storage_name}:in_progress' @classmethod async def open( cls, *, id: str | None, name: str | None, alias: str | None, redis: Redis, dedup_strategy: Literal['default', 'bloom'] = 'default', bloom_error_rate: float = 1e-7, ) -> RedisRequestQueueClient: """Open or create a new Redis request queue client. This method attempts to open an existing request queue from the Redis database. If a queue with the specified ID or name exists, it loads the metadata from the database. If no existing queue is found, a new one is created. Args: id: The ID of the request queue. If not provided, a random ID will be generated. name: The name of the dataset for named (global scope) storages. 
alias: The alias of the dataset for unnamed (run scope) storages. redis: Redis client instance. dedup_strategy: Strategy for request queue deduplication. Options are: - 'default': Uses Redis sets for exact deduplication. - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using this approach, there is a possibility 1e-7 that requests will be skipped in the queue. bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if `dedup_strategy` is set to 'bloom'. Returns: An instance for the opened or created storage client. """ return await cls._open( id=id, name=name, alias=alias, redis=redis, metadata_model=RequestQueueMetadata, extra_metadata_fields={ 'had_multiple_clients': False, 'handled_request_count': 0, 'pending_request_count': 0, 'total_request_count': 0, }, instance_kwargs={'dedup_strategy': dedup_strategy, 'bloom_error_rate': bloom_error_rate}, ) @override async def get_metadata(self) -> RequestQueueMetadata: return await self._get_metadata(RequestQueueMetadata) @override async def drop(self) -> None: if self._dedup_strategy == 'bloom': extra_keys = [self._added_filter_key, self._handled_filter_key] elif self._dedup_strategy == 'default': extra_keys = [self._pending_set_key, self._handled_set_key] else: raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}') extra_keys.extend([self._queue_key, self._data_key, self._in_progress_key]) await self._drop(extra_keys=extra_keys) @override async def purge(self) -> None: if self._dedup_strategy == 'bloom': extra_keys = [self._added_filter_key, self._handled_filter_key] elif self._dedup_strategy == 'default': extra_keys = [self._pending_set_key, self._handled_set_key] else: raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}') extra_keys.extend([self._queue_key, self._data_key, self._in_progress_key]) await self._purge( extra_keys=extra_keys, metadata_kwargs=_QueueMetadataUpdateParams( 
update_accessed_at=True, update_modified_at=True, new_pending_request_count=0, new_handled_request_count=0, new_total_request_count=0, ), ) @override async def add_batch_of_requests( self, requests: Sequence[Request], *, forefront: bool = False, ) -> AddRequestsResponse: if self._add_requests_script is None: raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.') processed_requests = [] delta_pending = 0 delta_total = 0 requests_by_unique_key = {req.unique_key: req for req in requests} unique_keys = list(requests_by_unique_key.keys()) # Check which requests are already added or handled async with self._get_pipeline(with_execute=False) as pipe: if self._dedup_strategy == 'default': await await_redis_response(pipe.smismember(self._pending_set_key, unique_keys)) await await_redis_response(pipe.smismember(self._handled_set_key, unique_keys)) elif self._dedup_strategy == 'bloom': await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys)) await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys)) pipe_results = await pipe.execute() added_pending_flags = pipe_results[0] handled_flags = pipe_results[1] new_unique_keys = [] new_request_data = {} delta_pending = 0 delta_total = 0 for i, unique_key in enumerate(unique_keys): # Already handled - skip if handled_flags[i]: processed_requests.append( ProcessedRequest( unique_key=unique_key, was_already_present=True, was_already_handled=True, ) ) continue # Already in queue - skip if added_pending_flags[i]: processed_requests.append( ProcessedRequest( unique_key=unique_key, was_already_present=True, was_already_handled=False, ) ) continue # New request - will add to queue request = requests_by_unique_key[unique_key] new_unique_keys.append(unique_key) new_request_data[unique_key] = request.model_dump_json() if new_unique_keys: # Add new requests to the queue atomically, get back which were actually added script_results = await 
self._add_requests_script( keys=[ self._added_filter_key if self._dedup_strategy == 'bloom' else self._pending_set_key, self._queue_key, self._data_key, ], args=[int(forefront), json.dumps(new_unique_keys), json.dumps(new_request_data)], ) actually_added = set(json.loads(script_results)) delta_pending = len(actually_added) delta_total = len(actually_added) processed_requests.extend( [ ProcessedRequest( unique_key=unique_key, was_already_present=unique_key not in actually_added, was_already_handled=False, ) for unique_key in new_unique_keys ] ) async with self._get_pipeline() as pipe: await self._update_metadata( pipe, **_QueueMetadataUpdateParams( update_accessed_at=True, update_modified_at=True, delta_pending_request_count=delta_pending, delta_total_request_count=delta_total, ), ) return AddRequestsResponse( processed_requests=processed_requests, unprocessed_requests=[], ) @override async def fetch_next_request(self) -> Request | None: if self._pending_fetch_cache: return self._pending_fetch_cache.popleft() if self._fetch_script is None: raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.') blocked_until_timestamp = int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME # The script retrieves requests from the queue and places them in the in_progress hash. 
requests_json = await self._fetch_script( keys=[self._queue_key, self._in_progress_key, self._data_key], args=[self.client_key, blocked_until_timestamp, self._MAX_BATCH_FETCH_SIZE], ) async with self._get_pipeline() as pipe: await self._update_metadata(pipe, **_QueueMetadataUpdateParams(update_accessed_at=True)) if not requests_json: return None requests = [Request.model_validate_json(req_json) for req_json in requests_json] self._pending_fetch_cache.extend(requests[1:]) return requests[0] @override async def get_request(self, unique_key: str) -> Request | None: request_data = await await_redis_response(self._redis.hget(self._data_key, unique_key)) if isinstance(request_data, (str, bytes, bytearray)): return Request.model_validate_json(request_data) return None @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: # Check if the request is in progress. check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key)) if not check_in_progress: logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.') return None # Update the request's handled_at timestamp. 
if request.handled_at is None: request.handled_at = datetime.now(timezone.utc) async with self._get_pipeline() as pipe: if self._dedup_strategy == 'default': await await_redis_response(pipe.sadd(self._handled_set_key, request.unique_key)) await await_redis_response(pipe.srem(self._pending_set_key, request.unique_key)) elif self._dedup_strategy == 'bloom': await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key)) await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key)) await await_redis_response(pipe.hset(self._data_key, request.unique_key, request.model_dump_json())) await self._update_metadata( pipe, **_QueueMetadataUpdateParams( update_accessed_at=True, update_modified_at=True, delta_handled_request_count=1, delta_pending_request_count=-1, ), ) return ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=True, ) @override async def reclaim_request( self, request: Request, *, forefront: bool = False, ) -> ProcessedRequest | None: check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key)) if not check_in_progress: logger.info(f'Reclaiming request {request.unique_key} that is not in progress.') return None async with self._get_pipeline() as pipe: if forefront: blocked_until_timestamp = ( int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME ) await await_redis_response( pipe.hset( self._in_progress_key, request.unique_key, f'{{"client_id":"{self.client_key}","blocked_until_timestamp":{blocked_until_timestamp}}}', ) ) await await_redis_response(pipe.hset(self._data_key, request.unique_key, request.model_dump_json())) self._pending_fetch_cache.appendleft(request) else: await await_redis_response(pipe.rpush(self._queue_key, request.unique_key)) await await_redis_response(pipe.hset(self._data_key, request.unique_key, request.model_dump_json())) await 
await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key)) await self._update_metadata( pipe, **_QueueMetadataUpdateParams( update_modified_at=True, update_accessed_at=True, ), ) return ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=False, ) @override async def is_empty(self) -> bool: """Check if the queue is empty. Returns: True if the queue is empty, False otherwise. """ if self._pending_fetch_cache: return False # Reclaim stale requests if needed if self._next_reclaim_stale is None or datetime.now(tz=timezone.utc) >= self._next_reclaim_stale: await self._reclaim_stale_requests() self._next_reclaim_stale = datetime.now(tz=timezone.utc) + self._RECLAIM_INTERVAL metadata = await self.get_metadata() return metadata.pending_request_count == 0 async def _load_scripts(self) -> None: """Ensure Lua scripts are loaded in Redis.""" self._fetch_script = await self._create_script('atomic_fetch_request.lua') self._reclaim_stale_script = await self._create_script('reclaim_stale_requests.lua') if self._dedup_strategy == 'bloom': self._add_requests_script = await self._create_script('atomic_bloom_add_requests.lua') elif self._dedup_strategy == 'default': self._add_requests_script = await self._create_script('atomic_set_add_requests.lua') @override async def _create_storage(self, pipeline: Pipeline) -> None: # Create Bloom filters for added and handled requests if self._dedup_strategy == 'bloom': await await_redis_response( pipeline.bf().create( self._added_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10 ) ) await await_redis_response( pipeline.bf().create( self._handled_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10 ) ) async def _reclaim_stale_requests(self) -> None: """Reclaim requests that have been in progress for too long.""" if self._reclaim_stale_script is None: raise RuntimeError('Scripts not loaded. 
Call _ensure_scripts_loaded() before using the client.') current_time = int(datetime.now(tz=timezone.utc).timestamp() * 1000) await self._reclaim_stale_script( keys=[self._in_progress_key, self._queue_key, self._data_key], args=[current_time] ) @override async def _specific_update_metadata( self, pipeline: Pipeline, *, delta_handled_request_count: int | None = None, new_handled_request_count: int | None = None, delta_pending_request_count: int | None = None, new_pending_request_count: int | None = None, delta_total_request_count: int | None = None, new_total_request_count: int | None = None, update_had_multiple_clients: bool = False, **_kwargs: Any, ) -> None: """Update the dataset metadata with current information. Args: pipeline: The Redis pipeline to use for the update. new_handled_request_count: If provided, update the handled_request_count to this value. new_pending_request_count: If provided, update the pending_request_count to this value. new_total_request_count: If provided, update the total_request_count to this value. delta_handled_request_count: If provided, add this value to the handled_request_count. delta_pending_request_count: If provided, add this value to the pending_request_count. delta_total_request_count: If provided, add this value to the total_request_count. update_had_multiple_clients: If True, set had_multiple_clients to True. 
""" if new_pending_request_count is not None: await await_redis_response( pipeline.json().set( self.metadata_key, '$.pending_request_count', new_pending_request_count, nx=False, xx=True ) ) elif delta_pending_request_count is not None: await await_redis_response( pipeline.json().numincrby(self.metadata_key, '$.pending_request_count', delta_pending_request_count) ) if new_handled_request_count is not None: await await_redis_response( pipeline.json().set( self.metadata_key, '$.handled_request_count', new_handled_request_count, nx=False, xx=True ) ) elif delta_handled_request_count is not None: await await_redis_response( pipeline.json().numincrby(self.metadata_key, '$.handled_request_count', delta_handled_request_count) ) if new_total_request_count is not None: await await_redis_response( pipeline.json().set( self.metadata_key, '$.total_request_count', new_total_request_count, nx=False, xx=True ) ) elif delta_total_request_count is not None: await await_redis_response( pipeline.json().numincrby(self.metadata_key, '$.total_request_count', delta_total_request_count) ) if update_had_multiple_clients: await await_redis_response( pipeline.json().set( self.metadata_key, '$.had_multiple_clients', update_had_multiple_clients, nx=False, xx=True ) ) ================================================ FILE: src/crawlee/storage_clients/_redis/_storage_client.py ================================================ from __future__ import annotations import warnings from typing import Literal from redis.asyncio import Redis from typing_extensions import override from crawlee._utils.docs import docs_group from crawlee.configuration import Configuration from crawlee.storage_clients._base import StorageClient from ._dataset_client import RedisDatasetClient from ._key_value_store_client import RedisKeyValueStoreClient from ._request_queue_client import RedisRequestQueueClient @docs_group('Storage clients') class RedisStorageClient(StorageClient): """Redis implementation of the storage client. 
This storage client provides access to datasets, key-value stores, and request queues that persist data to a Redis database v8.0+. Each storage type uses Redis-specific data structures and key patterns for efficient storage and retrieval. The client accepts either a Redis connection string or a pre-configured Redis client instance. Exactly one of these parameters must be provided during initialization. Storage types use the following Redis data structures: - **Datasets**: Redis JSON arrays for item storage with metadata in JSON objects - **Key-value stores**: Redis hashes for key-value pairs with separate metadata storage - **Request queues**: Redis lists for FIFO queuing, hashes for request data and in-progress tracking, and Bloom filters for request deduplication Warning: This is an experimental feature. The behavior and interface may change in future versions. """ def __init__( self, *, connection_string: str | None = None, redis: Redis | None = None, queue_dedup_strategy: Literal['default', 'bloom'] = 'default', queue_bloom_error_rate: float = 1e-7, ) -> None: """Initialize the Redis storage client. Args: connection_string: Redis connection string (e.g., "redis://localhost:6379"). Supports standard Redis URL format with optional database selection. redis: Pre-configured Redis client instance. queue_dedup_strategy: Strategy for request queue deduplication. Options are: - 'default': Uses Redis sets for exact deduplication. - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using this approach, approximately 1 in 1e-7 requests will be falsely considered duplicate. queue_bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if `queue_dedup_strategy` is set to 'bloom'. 
""" if redis is None and connection_string is None: raise ValueError('Either redis or connection_string must be provided.') if redis is not None and connection_string is not None: raise ValueError('Either redis or connection_string must be provided, not both.') if isinstance(redis, Redis) and connection_string is None: self._redis = redis if isinstance(connection_string, str) and redis is None: self._redis = Redis.from_url(connection_string) self._redis: Redis # to help type checker self._queue_dedup_strategy = queue_dedup_strategy self._queue_bloom_error_rate = queue_bloom_error_rate # Call the notification only once warnings.warn( ( 'RedisStorageClient is experimental and its API, behavior, and key structure may change in future ' 'releases.' ), category=UserWarning, stacklevel=2, ) @override async def create_dataset_client( self, *, id: str | None = None, name: str | None = None, alias: str | None = None, configuration: Configuration | None = None, ) -> RedisDatasetClient: configuration = configuration or Configuration.get_global_configuration() client = await RedisDatasetClient.open( id=id, name=name, alias=alias, redis=self._redis, ) await self._purge_if_needed(client, configuration) return client @override async def create_kvs_client( self, *, id: str | None = None, name: str | None = None, alias: str | None = None, configuration: Configuration | None = None, ) -> RedisKeyValueStoreClient: configuration = configuration or Configuration.get_global_configuration() client = await RedisKeyValueStoreClient.open( id=id, name=name, alias=alias, redis=self._redis, ) await self._purge_if_needed(client, configuration) return client @override async def create_rq_client( self, *, id: str | None = None, name: str | None = None, alias: str | None = None, configuration: Configuration | None = None, ) -> RedisRequestQueueClient: configuration = configuration or Configuration.get_global_configuration() client = await RedisRequestQueueClient.open( id=id, name=name, 
alias=alias, redis=self._redis, dedup_strategy=self._queue_dedup_strategy, bloom_error_rate=self._queue_bloom_error_rate, ) await self._purge_if_needed(client, configuration) return client ================================================ FILE: src/crawlee/storage_clients/_redis/_utils.py ================================================ from collections.abc import Awaitable from pathlib import Path from typing import TypeVar, cast, overload T = TypeVar('T') @overload async def await_redis_response(response: Awaitable[T]) -> T: ... @overload async def await_redis_response(response: T) -> T: ... async def await_redis_response(response: Awaitable[T] | T) -> T: """Solve the problem of ambiguous typing for redis.""" if isinstance(response, Awaitable): return cast('T', await response) return response def read_lua_script(script_name: str) -> str: """Read a Lua script from a file.""" file_path = Path(__file__).parent / 'lua_scripts' / script_name with file_path.open(mode='r', encoding='utf-8') as file: return file.read() ================================================ FILE: src/crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua ================================================ local added_filter_key = KEYS[1] local queue_key = KEYS[2] local data_key = KEYS[3] local forefront = ARGV[1] == '1' local unique_keys = cjson.decode(ARGV[2]) local requests_data = cjson.decode(ARGV[3]) -- Add and check which unique keys are actually new using Bloom filter local bf_results = redis.call('bf.madd', added_filter_key, unpack(unique_keys)) local actually_added = {} local hset_args = {} -- Process the results for i, unique_key in ipairs(unique_keys) do if bf_results[i] == 1 then -- This key was added by us (did not exist before) table.insert(hset_args, unique_key) table.insert(hset_args, requests_data[unique_key]) table.insert(actually_added, unique_key) end end -- Add only those that are actually new if #actually_added > 0 then redis.call('hset', data_key, 
unpack(hset_args)) if forefront then redis.call('lpush', queue_key, unpack(actually_added)) else redis.call('rpush', queue_key, unpack(actually_added)) end end return cjson.encode(actually_added)
================================================
FILE: src/crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua
================================================
-- Pop a batch of unique keys from the queue and mark them as in progress for one client.
--
-- KEYS[1]: list holding the unique keys of queued requests
-- KEYS[2]: hash mapping unique key -> JSON record of the client holding the request
-- KEYS[3]: hash mapping unique key -> stored request data
-- ARGV[1]: id of the client fetching the requests
-- ARGV[2]: timestamp stored as `blocked_until_timestamp` for every fetched request
-- ARGV[3]: maximum number of unique keys to pop
--
-- Returns nil when nothing could be popped, otherwise an array with the stored data of every
-- popped request that has data. Popped keys without stored data are dropped: they are neither
-- returned nor written to the in-progress hash.
local queue_key = KEYS[1]
local in_progress_key = KEYS[2]
local data_key = KEYS[3]
local client_id = ARGV[1]
local blocked_until_timestamp = ARGV[2]
local batch_size = tonumber(ARGV[3])

-- Pop batch unique_key from queue
local batch_result = redis.call('LMPOP', 1, queue_key, 'LEFT', 'COUNT', batch_size)
if not batch_result then
    return nil
end
-- LMPOP replies with {key_name, popped_elements}; only the elements are needed
local unique_keys = batch_result[2]

-- Get requests data
local requests_data = redis.call('HMGET', data_key, unpack(unique_keys))
if not requests_data then
    -- Data missing, skip this request
    return nil
end

-- Prepare results and update in_progress
local final_result = {}
local in_progress_args = {}
-- The same record is stored for every request fetched in this call
local in_progress_data = cjson.encode({
    client_id = client_id,
    blocked_until_timestamp = tonumber(blocked_until_timestamp)
})

for i = 1, #unique_keys do
    local unique_key = unique_keys[i]
    local request_data = requests_data[i]

    if request_data then
        -- Add to in_progress hash
        table.insert(in_progress_args, unique_key)
        table.insert(in_progress_args, in_progress_data)
        table.insert(final_result, request_data)
    end
end

-- Update in_progress hash (HSET accepts multiple field/value pairs; HMSET is deprecated)
if #in_progress_args > 0 then
    redis.call('HSET', in_progress_key, unpack(in_progress_args))
end

-- Return result with requests data
return final_result
================================================
FILE: src/crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua
================================================
local added_filter_key = KEYS[1] local queue_key = KEYS[2] local data_key = KEYS[3] local forefront = ARGV[1] == '1' local unique_keys = cjson.decode(ARGV[2]) local
requests_data = cjson.decode(ARGV[3]) -- Add and check which unique keys are actually new using Redis set local actually_added = {} local hset_args = {} -- Process each unique key for _, unique_key in ipairs(unique_keys) do -- Try to add the key to the set, returns 1 if added, 0 if already existed local set_result = redis.call('sadd', added_filter_key, unique_key) if set_result == 1 then -- This key was added by us (did not exist before) table.insert(hset_args, unique_key) table.insert(hset_args, requests_data[unique_key]) table.insert(actually_added, unique_key) end end -- Add only those that are actually new if #actually_added > 0 then redis.call('hset', data_key, unpack(hset_args)) if forefront then redis.call('lpush', queue_key, unpack(actually_added)) else redis.call('rpush', queue_key, unpack(actually_added)) end end return cjson.encode(actually_added) ================================================ FILE: src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua ================================================ local in_progress_key = KEYS[1] local queue_key = KEYS[2] local data_key = KEYS[3] local current_time = tonumber(ARGV[1]) local max_reclaim = 1000 local cursor = "0" local count = 0 repeat local result = redis.call('hscan', in_progress_key, cursor, 'COUNT', 100) cursor = result[1] local entries = result[2] for i = 1, #entries, 2 do if count >= max_reclaim then break end local unique_key = entries[i] local data = cjson.decode(entries[i + 1]) -- Check if timed out if current_time > data.blocked_until_timestamp then -- Atomically remove from in_progress and add back to queue redis.call('hdel', in_progress_key, unique_key) redis.call('rpush', queue_key, unique_key) count = count + 1 end end until cursor == "0" or count >= max_reclaim return count ================================================ FILE: src/crawlee/storage_clients/_redis/py.typed ================================================ ================================================ 
FILE: src/crawlee/storage_clients/_sql/__init__.py ================================================ from ._dataset_client import SqlDatasetClient from ._key_value_store_client import SqlKeyValueStoreClient from ._request_queue_client import SqlRequestQueueClient from ._storage_client import SqlStorageClient __all__ = ['SqlDatasetClient', 'SqlKeyValueStoreClient', 'SqlRequestQueueClient', 'SqlStorageClient'] ================================================ FILE: src/crawlee/storage_clients/_sql/_client_mixin.py ================================================ from __future__ import annotations from abc import ABC, abstractmethod from contextlib import asynccontextmanager from datetime import datetime, timedelta, timezone from logging import getLogger from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, cast, overload from sqlalchemy import CursorResult, delete, select, text, update from sqlalchemy import func as sql_func from sqlalchemy.dialects.mysql import insert as mysql_insert from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.dialects.sqlite import insert as lite_insert from sqlalchemy.exc import OperationalError, SQLAlchemyError from crawlee._utils.crypto import crypto_random_object_id if TYPE_CHECKING: from collections.abc import AsyncIterator from sqlalchemy import Insert from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import DeclarativeBase from typing_extensions import NotRequired, Self from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata from ._db_models import ( DatasetItemDb, DatasetMetadataBufferDb, DatasetMetadataDb, KeyValueStoreMetadataBufferDb, KeyValueStoreMetadataDb, KeyValueStoreRecordDb, RequestDb, RequestQueueMetadataBufferDb, RequestQueueMetadataDb, ) from ._storage_client import SqlStorageClient logger = getLogger(__name__) class MetadataUpdateParams(TypedDict, total=False): """Parameters for updating metadata.""" accessed_at: 
NotRequired[datetime] modified_at: NotRequired[datetime] class SqlClientMixin(ABC): """Mixin class for SQL clients. This mixin provides common SQL operations and basic methods for SQL storage clients. """ _DEFAULT_NAME: ClassVar[str] """Default name when none provided.""" _METADATA_TABLE: ClassVar[type[DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb]] """SQLAlchemy model for metadata.""" _BUFFER_TABLE: ClassVar[ type[KeyValueStoreMetadataBufferDb | DatasetMetadataBufferDb | RequestQueueMetadataBufferDb] ] """SQLAlchemy model for metadata buffer.""" _ITEM_TABLE: ClassVar[type[DatasetItemDb | KeyValueStoreRecordDb | RequestDb]] """SQLAlchemy model for items.""" _CLIENT_TYPE: ClassVar[str] """Human-readable client type for error messages.""" _BLOCK_BUFFER_TIME = timedelta(seconds=1) """Time interval that blocks buffer reading to update metadata.""" def __init__(self, *, id: str, storage_client: SqlStorageClient) -> None: self._id = id self._storage_client = storage_client @classmethod async def _open( cls, *, id: str | None, name: str | None, internal_name: str, storage_client: SqlStorageClient, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata], session: AsyncSession, extra_metadata_fields: dict[str, Any], ) -> Self: """Open existing storage or create new one. Internal method used by _safely_open. Args: id: Storage ID to open (takes precedence over name). name: The name of the storage. internal_name: The database name for the storage based on name or alias. storage_client: SQL storage client instance. metadata_model: Pydantic model for metadata validation. session: Active database session. extra_metadata_fields: Storage-specific metadata fields. 
""" orm_metadata: DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None = None if id: orm_metadata = await session.get(cls._METADATA_TABLE, id) if not orm_metadata: raise ValueError(f'{cls._CLIENT_TYPE} with ID "{id}" not found.') else: stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name) result = await session.execute(stmt) orm_metadata = result.scalar_one_or_none() if orm_metadata: client = cls(id=orm_metadata.id, storage_client=storage_client) await client._add_buffer_record(session) # Ensure any pending buffer updates are processed await client._process_buffers() else: now = datetime.now(timezone.utc) metadata = metadata_model( id=crypto_random_object_id(), name=name, created_at=now, accessed_at=now, modified_at=now, **extra_metadata_fields, ) client = cls(id=metadata.id, storage_client=storage_client) session.add(cls._METADATA_TABLE(**metadata.model_dump(), internal_name=internal_name)) return client

    @classmethod
    async def _safely_open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None = None,
        storage_client: SqlStorageClient,
        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
        extra_metadata_fields: dict[str, Any],
    ) -> Self:
        """Safely open storage with transaction handling.

        Runs `_open` in a new session and commits it. If a `SQLAlchemyError` is raised, the session is rolled
        back and the storage is looked up by its internal name instead.

        Args:
            id: Storage ID to open (takes precedence over name).
            name: The name of the storage for named (global scope) storages.
            alias: The alias of the storage for unnamed (run scope) storages.
            storage_client: SQL storage client instance.
            metadata_model: Pydantic model for metadata validation.
            extra_metadata_fields: Storage-specific metadata fields.

        Returns:
            A client instance bound to the opened or newly created storage.

        Raises:
            ValueError: If more than one of `id`, `name` and `alias` is given, if `_open` cannot find the
                requested ID, or if the fallback lookup by internal name finds no storage.
        """
        # Validate input parameters.
        specified_params = sum(1 for param in [id, name, alias] if param is not None)
        if specified_params > 1:
            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

        # The internal name is the lookup key: the name, else the alias, else the class default.
        internal_name = name or alias or cls._DEFAULT_NAME

        async with storage_client.create_session() as session:
            try:
                client = await cls._open(
                    id=id,
                    name=name,
                    internal_name=internal_name,
                    storage_client=storage_client,
                    metadata_model=metadata_model,
                    session=session,
                    extra_metadata_fields=extra_metadata_fields,
                )
                await session.commit()
            except SQLAlchemyError:
                # NOTE(review): presumably this covers another opener having created the same storage
                # concurrently (unique `internal_name`) - confirm. Any SQLAlchemyError takes this path.
                await session.rollback()

                stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
                result = await session.execute(stmt)
                orm_metadata: DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None
                orm_metadata = cast(
                    'DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None',
                    result.scalar_one_or_none(),
                )
                if not orm_metadata:
                    # `from None` hides the original database error from the traceback.
                    raise ValueError(f'{cls._CLIENT_TYPE} with Name "{internal_name}" not found.') from None

                client = cls(id=orm_metadata.id, storage_client=storage_client)

        return client

    @asynccontextmanager
    async def get_session(self, *, with_simple_commit: bool = False) -> AsyncIterator[AsyncSession]:
        """Create a new SQLAlchemy session for this storage.

        Args:
            with_simple_commit: If True, the session is committed once the caller's block finishes. A
                `SQLAlchemyError` raised by the block or by the commit is logged as a warning and rolled back;
                it is not re-raised. If False, the session is yielded with no commit or rollback handling.

        Yields:
            The session created by the storage client.
        """
        async with self._storage_client.create_session() as session:
            # For operations where a final commit is mandatory and does not require specific processing conditions
            if with_simple_commit:
                try:
                    yield session
                    await session.commit()
                except SQLAlchemyError as e:
                    logger.warning(f'Error occurred during session transaction: {e}')
                    await session.rollback()
            else:
                yield session

    def _build_insert_stmt_with_ignore( self, table_model: type[DeclarativeBase], insert_values: dict[str, Any] | list[dict[str, Any]] ) -> Insert: """Build an insert statement with ignore for the SQL dialect. Args: table_model: SQLAlchemy table model. insert_values: Single dict or list of dicts to insert.
""" if isinstance(insert_values, dict): insert_values = [insert_values] dialect = self._storage_client.get_dialect_name() if dialect == 'postgresql': return pg_insert(table_model).values(insert_values).on_conflict_do_nothing() if dialect == 'sqlite': return lite_insert(table_model).values(insert_values).on_conflict_do_nothing() if dialect in {'mysql', 'mariadb'}: return mysql_insert(table_model).values(insert_values).prefix_with('IGNORE') raise NotImplementedError(f'Insert with ignore not supported for dialect: {dialect}') def _build_upsert_stmt( self, table_model: type[DeclarativeBase], insert_values: dict[str, Any] | list[dict[str, Any]], update_columns: list[str], conflict_cols: list[str] | None = None, ) -> Insert: """Build an upsert statement for the SQL dialect. Args: table_model: SQLAlchemy table model. insert_values: Single dict or list of dicts to upsert. update_columns: Column names to update on conflict. conflict_cols: Column names that define uniqueness (for PostgreSQL/SQLite). 
""" if isinstance(insert_values, dict): insert_values = [insert_values] dialect = self._storage_client.get_dialect_name() if dialect == 'postgresql': pg_stmt = pg_insert(table_model).values(insert_values) set_ = {col: getattr(pg_stmt.excluded, col) for col in update_columns} return pg_stmt.on_conflict_do_update(index_elements=conflict_cols, set_=set_) if dialect == 'sqlite': lite_stmt = lite_insert(table_model).values(insert_values) set_ = {col: getattr(lite_stmt.excluded, col) for col in update_columns} return lite_stmt.on_conflict_do_update(index_elements=conflict_cols, set_=set_) if dialect in {'mysql', 'mariadb'}: mysql_stmt = mysql_insert(table_model).values(insert_values) set_ = {col: getattr(mysql_stmt.inserted, col) for col in update_columns} return mysql_stmt.on_duplicate_key_update(**set_) raise NotImplementedError(f'Upsert not supported for dialect: {dialect}') async def _purge(self, metadata_kwargs: MetadataUpdateParams) -> None: """Drop all items in storage and update metadata. Args: metadata_kwargs: Arguments to pass to _update_metadata. """ # Process buffers to ensure metadata is up to date before purging await self._process_buffers() stmt_records = delete(self._ITEM_TABLE).where(self._ITEM_TABLE.storage_id == self._id) async with self.get_session(with_simple_commit=True) as session: await session.execute(stmt_records) await self._update_metadata(session, **metadata_kwargs) async def _drop(self) -> None: """Delete this storage and all its data. This operation is irreversible. Uses CASCADE deletion to remove all related items. """ stmt = delete(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id) # Delete the buffer records with a separate query, since tables don't link via foreign key. 
buffer_stmt = delete(self._BUFFER_TABLE).where(self._BUFFER_TABLE.storage_id == self._id) async with self.get_session(with_simple_commit=True) as session: if self._storage_client.get_dialect_name() == 'sqlite': # foreign_keys=ON is set at the connection level. Required for cascade deletion. await session.execute(text('PRAGMA foreign_keys=ON')) await session.execute(stmt) await session.execute(buffer_stmt) @overload async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ... @overload async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ... @overload async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ... async def _get_metadata( self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata] ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata: """Retrieve client metadata.""" # Process any pending buffer updates first await self._process_buffers() async with self.get_session() as session: orm_metadata = await session.get(self._METADATA_TABLE, self._id) if not orm_metadata: raise ValueError(f'{self._CLIENT_TYPE} with ID "{self._id}" not found.') return metadata_model.model_validate(orm_metadata) @abstractmethod def _specific_update_metadata(self, **kwargs: Any) -> dict[str, Any]: """Prepare storage-specific metadata updates. Must be implemented by concrete classes. Args: **kwargs: Storage-specific update parameters. """ @abstractmethod def _prepare_buffer_data(self, **kwargs: Any) -> dict[str, Any]: """Prepare storage-specific buffer data. Must be implemented by concrete classes.""" @abstractmethod async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None: """Apply aggregated buffer updates to metadata. Must be implemented by concrete classes. Args: session: Active database session. max_buffer_id: Maximum buffer record ID to process. 
"""

    async def _update_metadata(
        self,
        session: AsyncSession,
        *,
        accessed_at: datetime | None = None,
        modified_at: datetime | None = None,
        **kwargs: Any,
    ) -> None:
        """Directly update storage metadata combining common and specific fields.

        Executes a single UPDATE in the given session. Nothing is executed when there is nothing to set.
        The statement is not committed here.

        Args:
            session: Active database session.
            accessed_at: Datetime to set as accessed_at timestamp.
            modified_at: Datetime to set as modified_at timestamp.
            **kwargs: Additional arguments for _specific_update_metadata.
        """
        values_to_set: dict[str, Any] = {}

        if accessed_at is not None:
            values_to_set['accessed_at'] = accessed_at

        if modified_at is not None:
            values_to_set['modified_at'] = modified_at

        # Storage-specific values are merged last, so they override the common timestamps on key clashes.
        values_to_set.update(self._specific_update_metadata(**kwargs))

        if values_to_set:
            # A concrete client may return its own base statement under 'custom_stmt'. It is popped so that
            # it is not passed to `.values()` as a column; otherwise a plain UPDATE for this storage is built.
            if (stmt := values_to_set.pop('custom_stmt', None)) is None:
                stmt = update(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id)

            stmt = stmt.values(**values_to_set)
            await session.execute(stmt)

    async def _add_buffer_record(
        self,
        session: AsyncSession,
        *,
        update_modified_at: bool = False,
        **kwargs: Any,
    ) -> None:
        """Add a record to the buffer table via the given session.

        The record is only added to the session; this method does not flush, commit or touch the metadata
        table itself.

        Args:
            session: Active database session.
            update_modified_at: Whether the record carries a modified_at timestamp (otherwise it is None).
            **kwargs: Additional arguments for _prepare_buffer_data.
        """
        now = datetime.now(timezone.utc)
        values_to_set = {
            'storage_id': self._id,
            'accessed_at': now,  # All entries in the buffer require updating `accessed_at`
            'modified_at': now if update_modified_at else None,
        }
        # Storage-specific buffer columns are merged on top of the common ones.
        values_to_set.update(self._prepare_buffer_data(**kwargs))

        session.add(self._BUFFER_TABLE(**values_to_set))

    async def _try_acquire_buffer_lock(self, session: AsyncSession) -> bool: """Try to acquire buffer processing lock for a short period. Args: session: Active database session. Returns: True if lock was acquired, False if already locked by another process.
""" capture_error_code = 1020 # MariaDB error code for "Record has changed since last read" now = datetime.now(timezone.utc) lock_until = now + self._BLOCK_BUFFER_TIME dialect = self._storage_client.get_dialect_name() if dialect in {'postgresql', 'mysql', 'mariadb'}: select_stmt = ( select(self._METADATA_TABLE) .where( self._METADATA_TABLE.id == self._id, (self._METADATA_TABLE.buffer_locked_until.is_(None)) | (self._METADATA_TABLE.buffer_locked_until < now), select(self._BUFFER_TABLE.id).where(self._BUFFER_TABLE.storage_id == self._id).exists(), ) .with_for_update(skip_locked=True) ) try: result = await session.execute(select_stmt) except OperationalError as e: # MariaDB raises error 1020 ("Record has changed since last read") instead of # silently skipping locked rows like MySQL/PostgreSQL. Treat it as lock not acquired. error_code = getattr(e.orig, 'args', [None])[0] if error_code == capture_error_code: return False raise metadata_row = result.scalar_one_or_none() if metadata_row is None: # Either conditions not met OR row is locked by another process return False # Acquire lock only if not currently locked or lock has expired update_stmt = ( update(self._METADATA_TABLE) .where( self._METADATA_TABLE.id == self._id, (self._METADATA_TABLE.buffer_locked_until.is_(None)) | (self._METADATA_TABLE.buffer_locked_until < now), select(self._BUFFER_TABLE.id).where(self._BUFFER_TABLE.storage_id == self._id).exists(), ) .values(buffer_locked_until=lock_until) ) result = await session.execute(update_stmt) result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result if result.rowcount > 0: await session.flush() return True return False async def _release_buffer_lock(self, session: AsyncSession) -> None: """Release buffer processing lock by setting buffer_locked_until to NULL. Args: session: Active database session. 
""" stmt = update(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id).values(buffer_locked_until=None) await session.execute(stmt)

    async def _has_pending_buffer_updates(self, session: AsyncSession) -> bool:
        """Check if there are pending buffer updates not yet applied to metadata.

        Only the `buffer_locked_until` column of this storage's metadata row is read; the buffer table itself
        is not queried. Returns False only when buffer_locked_until is NULL (metadata is consistent).

        Args:
            session: Active database session.

        Returns:
            True if metadata might be inconsistent due to pending buffer updates.
        """
        result = await session.execute(
            select(self._METADATA_TABLE.buffer_locked_until).where(self._METADATA_TABLE.id == self._id)
        )

        # NOTE(review): `scalar()` presumably yields None when no metadata row matches, which would also
        # read as "no pending updates" - confirm.
        locked_until = result.scalar()

        # Any non-NULL value means there are pending updates
        return locked_until is not None

    async def _process_buffers(self) -> None: """Process pending buffer updates and apply them to metadata.""" async with self.get_session(with_simple_commit=True) as session: # Try to acquire buffer processing lock if not await self._try_acquire_buffer_lock(session): # Another process is currently processing buffers or lock acquisition failed return # Get the maximum buffer ID at this moment # This creates a consistent snapshot - records added during processing won't be included max_buffer_id_stmt = select(sql_func.max(self._BUFFER_TABLE.id)).where( self._BUFFER_TABLE.storage_id == self._id ) result = await session.execute(max_buffer_id_stmt) max_buffer_id = result.scalar() if max_buffer_id is None: # No buffer records to process. Release the lock and exit.
await self._release_buffer_lock(session) return # Apply aggregated buffer updates to metadata using only records <= max_buffer_id # This method is implemented by concrete storage classes await self._apply_buffer_updates(session, max_buffer_id=max_buffer_id) # Clean up only the processed buffer records (those <= max_buffer_id) delete_stmt = delete(self._BUFFER_TABLE).where( self._BUFFER_TABLE.storage_id == self._id, self._BUFFER_TABLE.id <= max_buffer_id ) await session.execute(delete_stmt) # Release the lock after successful processing await self._release_buffer_lock(session) ================================================ FILE: src/crawlee/storage_clients/_sql/_dataset_client.py ================================================ from __future__ import annotations from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING, Any from sqlalchemy import Select, insert, select from sqlalchemy import func as sql_func from typing_extensions import Self, override from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata from ._client_mixin import MetadataUpdateParams, SqlClientMixin from ._db_models import DatasetItemDb, DatasetMetadataBufferDb, DatasetMetadataDb if TYPE_CHECKING: from collections.abc import AsyncIterator from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession from typing_extensions import NotRequired from ._storage_client import SqlStorageClient logger = getLogger(__name__) class _DatasetMetadataUpdateParams(MetadataUpdateParams): """Parameters for updating dataset metadata.""" new_item_count: NotRequired[int] delta_item_count: NotRequired[int] class SqlDatasetClient(DatasetClient, SqlClientMixin): """SQL implementation of the dataset client. This client persists dataset items to a SQL database using two tables for storage and retrieval. Items are stored as JSON with automatic ordering preservation. 
The dataset data is stored in SQL database tables following the pattern: - `datasets` table: Contains dataset metadata (id, name, timestamps, item_count) - `dataset_records` table: Contains individual items with JSON data and auto-increment ordering - `dataset_metadata_buffer` table: Buffers metadata updates for performance optimization Items are stored as a JSON object in SQLite and as JSONB in PostgreSQL. These objects must be JSON-serializable. The `item_id` auto-increment primary key ensures insertion order is preserved. All operations are wrapped in database transactions with CASCADE deletion support. """ _DEFAULT_NAME = 'default' """Default dataset name used when no name is provided.""" _METADATA_TABLE = DatasetMetadataDb """SQLAlchemy model for dataset metadata.""" _ITEM_TABLE = DatasetItemDb """SQLAlchemy model for dataset items.""" _CLIENT_TYPE = 'Dataset' """Human-readable client type for error messages.""" _BUFFER_TABLE = DatasetMetadataBufferDb """SQLAlchemy model for metadata buffer.""" def __init__( self, *, id: str, storage_client: SqlStorageClient, ) -> None: """Initialize a new instance. Preferably use the `SqlDatasetClient.open` class method to create a new instance. """ super().__init__(id=id, storage_client=storage_client) @classmethod async def open( cls, *, id: str | None, name: str | None, alias: str | None, storage_client: SqlStorageClient, ) -> Self: """Open an existing dataset or create a new one. Args: id: The ID of the dataset to open. If provided, searches for existing dataset by ID. name: The name of the dataset for named (global scope) storages. alias: The alias of the dataset for unnamed (run scope) storages. storage_client: The SQL storage client instance. Returns: An instance for the opened or created storage client. Raises: ValueError: If a dataset with the specified ID is not found. 
""" return await cls._safely_open( id=id, name=name, alias=alias, storage_client=storage_client, metadata_model=DatasetMetadata, extra_metadata_fields={'item_count': 0}, ) @override async def get_metadata(self) -> DatasetMetadata: # The database is a single place of truth return await self._get_metadata(DatasetMetadata) @override async def drop(self) -> None: """Delete this dataset and all its items from the database. This operation is irreversible. Uses CASCADE deletion to remove all related items. """ await self._drop() @override async def purge(self) -> None: """Remove all items from this dataset while keeping the dataset structure. Resets item_count to 0 and deletes all records from dataset_records table. """ now = datetime.now(timezone.utc) await self._purge( metadata_kwargs=_DatasetMetadataUpdateParams( new_item_count=0, accessed_at=now, modified_at=now, ) ) @override async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: if not isinstance(data, list): data = [data] db_items = [{'dataset_id': self._id, 'data': item} for item in data] stmt = insert(self._ITEM_TABLE).values(db_items) async with self.get_session(with_simple_commit=True) as session: await session.execute(stmt) await self._add_buffer_record(session, update_modified_at=True, delta_item_count=len(data)) @override async def get_data( self, *, offset: int = 0, limit: int | None = 999_999_999_999, clean: bool = False, desc: bool = False, fields: list[str] | None = None, omit: list[str] | None = None, unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, flatten: list[str] | None = None, view: str | None = None, ) -> DatasetItemsListPage: stmt = self._prepare_get_stmt( offset=offset, limit=limit, clean=clean, desc=desc, fields=fields, omit=omit, unwind=unwind, skip_empty=skip_empty, skip_hidden=skip_hidden, flatten=flatten, view=view, ) async with self.get_session(with_simple_commit=True) as session: result = await session.execute(stmt) db_items = 
result.scalars().all() await self._add_buffer_record(session) items = [db_item.data for db_item in db_items] metadata = await self.get_metadata() return DatasetItemsListPage( items=items, count=len(items), desc=desc, limit=limit or 0, offset=offset or 0, total=metadata.item_count, ) @override async def iterate_items( self, *, offset: int = 0, limit: int | None = None, clean: bool = False, desc: bool = False, fields: list[str] | None = None, omit: list[str] | None = None, unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, ) -> AsyncIterator[dict[str, Any]]: stmt = self._prepare_get_stmt( offset=offset, limit=limit, clean=clean, desc=desc, fields=fields, omit=omit, unwind=unwind, skip_empty=skip_empty, skip_hidden=skip_hidden, ) async with self.get_session(with_simple_commit=True) as session: db_items = await session.stream_scalars(stmt) async for db_item in db_items: yield db_item.data await self._add_buffer_record(session) def _prepare_get_stmt( self, *, offset: int = 0, limit: int | None = 999_999_999_999, clean: bool = False, desc: bool = False, fields: list[str] | None = None, omit: list[str] | None = None, unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, flatten: list[str] | None = None, view: str | None = None, ) -> Select: # Check for unsupported arguments and log a warning if found. unsupported_args: dict[str, Any] = { 'clean': clean, 'fields': fields, 'omit': omit, 'unwind': unwind, 'skip_hidden': skip_hidden, 'flatten': flatten, 'view': view, } unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)} if unsupported: logger.warning( f'The arguments {list(unsupported.keys())} of get_data are not supported by the ' f'{self.__class__.__name__} client.' 
) stmt = select(self._ITEM_TABLE).where(self._ITEM_TABLE.dataset_id == self._id) if skip_empty: # Skip items that are empty JSON objects stmt = stmt.where(self._ITEM_TABLE.data != {}) # Apply ordering by insertion order (item_id) stmt = stmt.order_by(self._ITEM_TABLE.item_id.desc()) if desc else stmt.order_by(self._ITEM_TABLE.item_id.asc()) return stmt.offset(offset).limit(limit)

    @override
    def _specific_update_metadata(
        self,
        new_item_count: int | None = None,
        delta_item_count: int | None = None,
        **_kwargs: Any,
    ) -> dict[str, Any]:
        """Build the dataset-specific column values for a metadata update.

        Nothing is written here; the returned mapping is meant to be merged into an UPDATE by the caller.

        Args:
            new_item_count: If provided, set item count to this value. Takes precedence over
                `delta_item_count`.
            delta_item_count: If provided and non-zero, add this value to the current item count.
            **_kwargs: Ignored.

        Returns:
            A mapping with an `item_count` entry, or an empty mapping when neither argument applies.
        """
        values_to_set: dict[str, Any] = {}

        if new_item_count is not None:
            values_to_set['item_count'] = new_item_count
        elif delta_item_count:
            # Use database-level for atomic updates
            # (a column expression, so the addition is evaluated by the database, not in Python)
            values_to_set['item_count'] = self._METADATA_TABLE.item_count + delta_item_count

        return values_to_set

    @override def _prepare_buffer_data(self, delta_item_count: int | None = None, **_kwargs: Any) -> dict[str, Any]: """Prepare dataset specific buffer data. Args: delta_item_count: If provided, add this value to the current item count.
""" buffer_data = {} if delta_item_count is not None: buffer_data['delta_item_count'] = delta_item_count return buffer_data @override async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None: aggregation_stmt = select( sql_func.max(self._BUFFER_TABLE.accessed_at).label('max_accessed_at'), sql_func.max(self._BUFFER_TABLE.modified_at).label('max_modified_at'), sql_func.sum(self._BUFFER_TABLE.delta_item_count).label('delta_item_count'), ).where(self._BUFFER_TABLE.storage_id == self._id, self._BUFFER_TABLE.id <= max_buffer_id) result = await session.execute(aggregation_stmt) row = result.first() if not row: return await self._update_metadata( session, **_DatasetMetadataUpdateParams( accessed_at=row.max_accessed_at, modified_at=row.max_modified_at, delta_item_count=row.delta_item_count, ), ) ================================================ FILE: src/crawlee/storage_clients/_sql/_db_models.py ================================================ from __future__ import annotations from datetime import datetime, timezone from typing import TYPE_CHECKING, Any from sqlalchemy import JSON, BigInteger, Boolean, ForeignKey, Index, Integer, LargeBinary, String, Text, text from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship, synonym from sqlalchemy.types import DateTime, TypeDecorator from typing_extensions import override if TYPE_CHECKING: from sqlalchemy.engine import Dialect from sqlalchemy.types import TypeEngine class AwareDateTime(TypeDecorator): """Custom SQLAlchemy type for timezone-aware datetime handling. Ensures all datetime values are timezone-aware by adding UTC timezone to naive datetime values from databases that don't store timezone information. 
""" impl = DateTime(timezone=True) cache_ok = True @override def process_result_value(self, value: datetime | None, dialect: Dialect) -> datetime | None: """Add UTC timezone to naive datetime values.""" if value is not None and value.tzinfo is None: return value.replace(tzinfo=timezone.utc) return value class JsonField(TypeDecorator): """Uses JSONB for PostgreSQL and JSON for other databases.""" impl = JSON cache_ok = True def load_dialect_impl(self, dialect: Dialect) -> TypeEngine[JSON | JSONB]: """Load the appropriate dialect implementation for the JSON type.""" if dialect.name == 'postgresql': return dialect.type_descriptor(JSONB()) return dialect.type_descriptor(JSON()) class Base(DeclarativeBase): """Base class for all database models for correct type annotations.""" class StorageMetadataDb: """Base database model for storage metadata.""" internal_name: Mapped[str] = mapped_column(String(255), nullable=False, index=True, unique=True) """Internal unique name for a storage instance based on a name or alias.""" name: Mapped[str | None] = mapped_column(String(255), nullable=True, unique=True) """Human-readable name. None becomes 'default' in database to enforce uniqueness.""" accessed_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False) """Last access datetime for usage tracking.""" created_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False) """Creation datetime.""" modified_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False) """Last modification datetime.""" buffer_locked_until: Mapped[datetime | None] = mapped_column(AwareDateTime, nullable=True) """Timestamp until which buffer processing is locked for this storage. 
NULL = unlocked.""" class DatasetMetadataDb(StorageMetadataDb, Base): """Metadata table for datasets.""" __tablename__ = 'datasets' dataset_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True) """Unique identifier for the dataset.""" item_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0) """Number of items in the dataset.""" # Relationship to dataset items with cascade deletion items: Mapped[list[DatasetItemDb]] = relationship( back_populates='dataset', cascade='all, delete-orphan', lazy='noload' ) id = synonym('dataset_id') """Alias for dataset_id to match Pydantic expectations.""" class RequestQueueMetadataDb(StorageMetadataDb, Base): """Metadata table for request queues.""" __tablename__ = 'request_queues' request_queue_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True) """Unique identifier for the request queue.""" had_multiple_clients: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) """Flag indicating if multiple clients have accessed this queue.""" handled_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0) """Number of requests processed.""" pending_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0) """Number of requests waiting to be processed.""" total_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0) """Total number of requests ever added to this queue.""" # Relationship to queue requests with cascade deletion requests: Mapped[list[RequestDb]] = relationship( back_populates='queue', cascade='all, delete-orphan', lazy='noload' ) # Relationship to queue state state: Mapped[RequestQueueStateDb] = relationship( back_populates='queue', cascade='all, delete-orphan', lazy='noload' ) id = synonym('request_queue_id') """Alias for request_queue_id to match Pydantic expectations.""" class KeyValueStoreMetadataDb(StorageMetadataDb, Base): """Metadata table for key-value stores.""" 
__tablename__ = 'key_value_stores' key_value_store_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True) """Unique identifier for the key-value store.""" # Relationship to store records with cascade deletion records: Mapped[list[KeyValueStoreRecordDb]] = relationship( back_populates='kvs', cascade='all, delete-orphan', lazy='noload' ) id = synonym('key_value_store_id') """Alias for key_value_store_id to match Pydantic expectations.""" class KeyValueStoreRecordDb(Base): """Records table for key-value stores.""" __tablename__ = 'key_value_store_records' key_value_store_id: Mapped[str] = mapped_column( String(20), ForeignKey('key_value_stores.key_value_store_id', ondelete='CASCADE'), primary_key=True, index=True, nullable=False, ) """Foreign key to metadata key-value store record.""" key: Mapped[str] = mapped_column(String(255), primary_key=True) """The key part of the key-value pair.""" value: Mapped[bytes] = mapped_column(LargeBinary, nullable=False) """Value stored as binary data to support any content type.""" content_type: Mapped[str] = mapped_column(String(50), nullable=False) """MIME type for proper value deserialization.""" size: Mapped[int | None] = mapped_column(Integer, nullable=False, default=0) """Size of stored value in bytes.""" # Relationship back to parent store kvs: Mapped[KeyValueStoreMetadataDb] = relationship(back_populates='records') storage_id = synonym('key_value_store_id') """Alias for key_value_store_id to match SqlClientMixin expectations.""" class DatasetItemDb(Base): """Items table for datasets.""" __tablename__ = 'dataset_records' item_id: Mapped[int] = mapped_column(Integer, primary_key=True) """Auto-increment primary key preserving insertion order.""" dataset_id: Mapped[str] = mapped_column( String(20), ForeignKey('datasets.dataset_id', ondelete='CASCADE'), index=True, ) """Foreign key to metadata dataset record.""" data: Mapped[list[dict[str, Any]] | dict[str, Any]] = mapped_column(JsonField, nullable=False) 
"""JSON serializable item data.""" # Relationship back to parent dataset dataset: Mapped[DatasetMetadataDb] = relationship(back_populates='items') storage_id = synonym('dataset_id') """Alias for dataset_id to match SqlClientMixin expectations.""" class RequestDb(Base): """Requests table for request queues.""" __tablename__ = 'request_queue_records' __table_args__ = ( Index( 'idx_fetch_available', 'request_queue_id', 'is_handled', 'sequence_number', postgresql_where=text('is_handled = false'), ), Index( 'idx_count_aggregate', 'request_queue_id', 'is_handled', ), ) request_id: Mapped[int] = mapped_column(BigInteger, primary_key=True) """Unique identifier for the request representing the unique_key.""" request_queue_id: Mapped[str] = mapped_column( String(20), ForeignKey('request_queues.request_queue_id', ondelete='CASCADE'), primary_key=True ) """Foreign key to metadata request queue record.""" data: Mapped[str] = mapped_column(Text, nullable=False) """JSON-serialized Request object.""" sequence_number: Mapped[int] = mapped_column(Integer, nullable=False) """Ordering sequence: negative for forefront, positive for regular.""" is_handled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) """Processing status flag.""" time_blocked_until: Mapped[datetime | None] = mapped_column(AwareDateTime, nullable=True) """Timestamp until which this request is considered blocked for processing by other clients.""" client_key: Mapped[str | None] = mapped_column(String(32), nullable=True) """Identifier of the client that has currently locked this request for processing.""" # Relationship back to metadata table queue: Mapped[RequestQueueMetadataDb] = relationship(back_populates='requests') storage_id = synonym('request_queue_id') """Alias for request_queue_id to match SqlClientMixin expectations.""" class RequestQueueStateDb(Base): """State table for request queues.""" __tablename__ = 'request_queue_state' request_queue_id: Mapped[str] = mapped_column( String(20), 
ForeignKey('request_queues.request_queue_id', ondelete='CASCADE'), primary_key=True
    )
    """Foreign key to metadata request queue record."""

    sequence_counter: Mapped[int] = mapped_column(Integer, nullable=False, default=1)
    """Counter for regular request ordering (positive)."""

    forefront_sequence_counter: Mapped[int] = mapped_column(Integer, nullable=False, default=-1)
    """Counter for forefront request ordering (negative)."""

    # Relationship back to metadata table
    queue: Mapped[RequestQueueMetadataDb] = relationship(back_populates='state')


class VersionDb(Base):
    """Table for storing the database schema version."""

    __tablename__ = 'version'

    # The version string itself is the primary key of the table.
    version: Mapped[str] = mapped_column(String(10), nullable=False, primary_key=True)


# Plain mixin (it does not inherit from `Base`); the concrete buffer tables below combine it with `Base`.
class MetadataBufferDb:
    """Base model for metadata update buffer tables."""

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    """Auto-increment primary key for ordering."""

    # Timestamp fields - use max value when aggregating
    # `accessed_at` is NOT NULL, so every buffer row carries one; `modified_at` may be left NULL.
    accessed_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)
    """New accessed_at timestamp, if being updated."""

    modified_at: Mapped[datetime | None] = mapped_column(AwareDateTime, nullable=True)
    """New modified_at timestamp, if being updated."""


class KeyValueStoreMetadataBufferDb(MetadataBufferDb, Base):
    """Buffer table for deferred key-value store metadata updates to reduce concurrent access issues."""

    __tablename__ = 'key_value_store_metadata_buffer'

    # Don't use foreign key constraint to avoid DB locks on high concurrency.
    key_value_store_id: Mapped[str] = mapped_column(String(20), nullable=False, index=True)
    """ID of the key-value store being updated."""

    storage_id = synonym('key_value_store_id')
    """Alias for key_value_store_id to match SqlClientMixin expectations."""


class DatasetMetadataBufferDb(MetadataBufferDb, Base):
    """Buffer table for deferred dataset metadata updates to reduce concurrent access issues."""

    __tablename__ = 'dataset_metadata_buffer'

    # Don't use foreign key constraint to avoid DB locks on high concurrency.
    dataset_id: Mapped[str] = mapped_column(String(20), nullable=False, index=True)
    """ID of the dataset being updated."""

    # Counter deltas - use SUM when aggregating.
    delta_item_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    """Delta for dataset item_count."""

    storage_id = synonym('dataset_id')
    """Alias for dataset_id to match SqlClientMixin expectations."""


class RequestQueueMetadataBufferDb(MetadataBufferDb, Base):
    """Buffer table for deferred request queue metadata updates to reduce concurrent access issues."""

    __tablename__ = 'request_queue_metadata_buffer'

    # Composite index over (request_queue_id, client_id), in addition to the single-column index declared below.
    __table_args__ = (Index('idx_rq_client', 'request_queue_id', 'client_id'),)

    # Don't use foreign key constraint to avoid DB locks on high concurrency.
    request_queue_id: Mapped[str] = mapped_column(String(20), nullable=False, index=True)
    """ID of the request queue being updated."""

    client_id: Mapped[str] = mapped_column(String(32), nullable=False)
    """Identifier of the client making this update."""

    # Counter deltas - use SUM when aggregating.
    delta_handled_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    """Delta for handled_request_count."""

    delta_pending_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    """Delta for pending_request_count."""

    delta_total_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    """Delta for total_request_count."""

    need_recalc: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    """Flag indicating that counters need recalculation from actual data."""

    storage_id = synonym('request_queue_id')
    """Alias for request_queue_id to match SqlClientMixin expectations."""


================================================
FILE: src/crawlee/storage_clients/_sql/_key_value_store_client.py
================================================
from __future__ import annotations

import json
from datetime import datetime, timezone
from logging import getLogger
from typing import TYPE_CHECKING, Any, cast

from sqlalchemy import CursorResult, delete, select
from sqlalchemy import func as sql_func
from typing_extensions import Self, override

from crawlee._utils.file import infer_mime_type
from crawlee.storage_clients._base import KeyValueStoreClient
from crawlee.storage_clients.models import (
    KeyValueStoreMetadata,
    KeyValueStoreRecord,
    KeyValueStoreRecordMetadata,
)

from ._client_mixin import MetadataUpdateParams, SqlClientMixin
from ._db_models import KeyValueStoreMetadataBufferDb, KeyValueStoreMetadataDb, KeyValueStoreRecordDb

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from sqlalchemy.ext.asyncio import AsyncSession

    from ._storage_client import SqlStorageClient


logger = getLogger(__name__)


class SqlKeyValueStoreClient(KeyValueStoreClient, SqlClientMixin):
    """SQL implementation of the key-value store client.

    This client persists key-value data to a SQL database with transaction support and
    concurrent access safety. Keys are mapped to rows in database tables with proper indexing
    for efficient retrieval.
The key-value store data is stored in SQL database tables following the pattern: - `key_value_stores` table: Contains store metadata (id, name, timestamps) - `key_value_store_records` table: Contains individual key-value pairs with binary value storage, content type, and size information - `key_value_store_metadata_buffer` table: Buffers metadata updates for performance optimization Values are serialized based on their type: JSON objects are stored as formatted JSON, text values as UTF-8 encoded strings, and binary data as-is in the `LargeBinary` column. The implementation automatically handles content type detection and maintains metadata about each record including size and MIME type information. All database operations are wrapped in transactions with proper error handling and rollback mechanisms. The client supports atomic upsert operations and handles race conditions when multiple clients access the same store using composite primary keys (key_value_store_id, key). """ _DEFAULT_NAME = 'default' """Default dataset name used when no name is provided.""" _METADATA_TABLE = KeyValueStoreMetadataDb """SQLAlchemy model for key-value store metadata.""" _ITEM_TABLE = KeyValueStoreRecordDb """SQLAlchemy model for key-value store items.""" _CLIENT_TYPE = 'Key-value store' """Human-readable client type for error messages.""" _BUFFER_TABLE = KeyValueStoreMetadataBufferDb """SQLAlchemy model for metadata buffer.""" def __init__( self, *, storage_client: SqlStorageClient, id: str, ) -> None: """Initialize a new instance. Preferably use the `SqlKeyValueStoreClient.open` class method to create a new instance. """ super().__init__(id=id, storage_client=storage_client) @classmethod async def open( cls, *, id: str | None, name: str | None, alias: str | None, storage_client: SqlStorageClient, ) -> Self: """Open or create a SQL key-value store client. This method attempts to open an existing key-value store from the SQL database. 
If a KVS with the specified ID or name exists, it loads the metadata from the database. If no existing store is found, a new one is created. Args: id: The ID of the key-value store to open. If provided, searches for existing store by ID. name: The name of the key-value store for named (global scope) storages. alias: The alias of the key-value store for unnamed (run scope) storages. storage_client: The SQL storage client used to access the database. Returns: An instance for the opened or created storage client. Raises: ValueError: If a store with the specified ID is not found, or if metadata is invalid. """ return await cls._safely_open( id=id, name=name, alias=alias, storage_client=storage_client, metadata_model=KeyValueStoreMetadata, extra_metadata_fields={}, ) @override async def get_metadata(self) -> KeyValueStoreMetadata: # The database is a single place of truth return await self._get_metadata(KeyValueStoreMetadata) @override async def drop(self) -> None: """Delete this key-value store and all its records from the database. This operation is irreversible. Uses CASCADE deletion to remove all related records. """ await self._drop() @override async def purge(self) -> None: """Remove all items from this key-value store while keeping the key-value store structure. Remove all records from key_value_store_records table. """ now = datetime.now(timezone.utc) await self._purge(metadata_kwargs=MetadataUpdateParams(accessed_at=now, modified_at=now)) @override async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None: # Special handling for None values if value is None: content_type = 'application/x-none' # Special content type to identify None values value_bytes = b'' else: content_type = content_type or infer_mime_type(value) # Serialize the value to bytes. 
if 'application/json' in content_type: value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8') elif isinstance(value, str): value_bytes = value.encode('utf-8') elif isinstance(value, (bytes, bytearray)): value_bytes = value else: # Fallback: attempt to convert to string and encode. value_bytes = str(value).encode('utf-8') size = len(value_bytes) insert_values = { 'key_value_store_id': self._id, 'key': key, 'value': value_bytes, 'content_type': content_type, 'size': size, } upsert_stmt = self._build_upsert_stmt( self._ITEM_TABLE, insert_values=insert_values, update_columns=['value', 'content_type', 'size'], conflict_cols=['key_value_store_id', 'key'], ) async with self.get_session(with_simple_commit=True) as session: await session.execute(upsert_stmt) await self._add_buffer_record(session, update_modified_at=True) @override async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: # Query the record by key stmt = select(self._ITEM_TABLE).where( self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key ) async with self.get_session(with_simple_commit=True) as session: result = await session.execute(stmt) record_db = result.scalar_one_or_none() await self._add_buffer_record(session) if not record_db: return None # Deserialize the value based on content type value_bytes = record_db.value # Handle None values if record_db.content_type == 'application/x-none': value = None # Handle JSON values elif 'application/json' in record_db.content_type: try: value = json.loads(value_bytes.decode('utf-8')) except (json.JSONDecodeError, UnicodeDecodeError): logger.warning(f'Failed to decode JSON value for key "{key}"') return None # Handle text values elif record_db.content_type.startswith('text/'): try: value = value_bytes.decode('utf-8') except UnicodeDecodeError: logger.warning(f'Failed to decode text value for key "{key}"') return None # Handle binary values else: value = value_bytes return KeyValueStoreRecord( 
key=record_db.key, value=value, content_type=record_db.content_type, size=record_db.size, ) @override async def delete_value(self, *, key: str) -> None: stmt = delete(self._ITEM_TABLE).where( self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key ) async with self.get_session(with_simple_commit=True) as session: # Delete the record if it exists result = await session.execute(stmt) result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result # Update metadata if we actually deleted something if result.rowcount > 0: await self._add_buffer_record(session, update_modified_at=True) @override async def iterate_keys( self, *, exclusive_start_key: str | None = None, limit: int | None = None, ) -> AsyncIterator[KeyValueStoreRecordMetadata]: # Build query for record metadata stmt = ( select(self._ITEM_TABLE.key, self._ITEM_TABLE.content_type, self._ITEM_TABLE.size) .where(self._ITEM_TABLE.key_value_store_id == self._id) .order_by(self._ITEM_TABLE.key) ) # Apply exclusive_start_key filter if exclusive_start_key is not None: stmt = stmt.where(self._ITEM_TABLE.key > exclusive_start_key) # Apply limit if limit is not None: stmt = stmt.limit(limit) async with self.get_session(with_simple_commit=True) as session: result = await session.stream(stmt.execution_options(stream_results=True)) async for row in result: yield KeyValueStoreRecordMetadata( key=row.key, content_type=row.content_type, size=row.size, ) await self._add_buffer_record(session) @override async def record_exists(self, *, key: str) -> bool: stmt = select(self._ITEM_TABLE.key).where( self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key ) async with self.get_session(with_simple_commit=True) as session: # Check if record exists result = await session.execute(stmt) await self._add_buffer_record(session) return result.scalar_one_or_none() is not None @override async def get_public_url(self, *, key: str) -> str: raise NotImplementedError('Public 
URLs are not supported for SQL key-value stores.') @override def _specific_update_metadata(self, **_kwargs: dict[str, Any]) -> dict[str, Any]: return {} @override def _prepare_buffer_data(self, **_kwargs: Any) -> dict[str, Any]: """Prepare key-value store specific buffer data. For KeyValueStore, we don't have specific metadata fields to track in buffer, so we just return empty dict. The base buffer will handle accessed_at/modified_at. """ return {} @override async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None: aggregation_stmt = select( sql_func.max(self._BUFFER_TABLE.accessed_at).label('max_accessed_at'), sql_func.max(self._BUFFER_TABLE.modified_at).label('max_modified_at'), ).where(self._BUFFER_TABLE.storage_id == self._id, self._BUFFER_TABLE.id <= max_buffer_id) result = await session.execute(aggregation_stmt) row = result.first() if not row: return await self._update_metadata( session, **MetadataUpdateParams( accessed_at=row.max_accessed_at, modified_at=row.max_modified_at, ), ) ================================================ FILE: src/crawlee/storage_clients/_sql/_request_queue_client.py ================================================ from __future__ import annotations from collections import deque from datetime import datetime, timedelta, timezone from functools import lru_cache from hashlib import sha256 from logging import getLogger from typing import TYPE_CHECKING, Any, cast from sqlalchemy import CursorResult, exists, func, or_, select, update from sqlalchemy import func as sql_func from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.orm import load_only from typing_extensions import NotRequired, Self, override from crawlee import Request from crawlee._utils.crypto import crypto_random_object_id from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import ( AddRequestsResponse, ProcessedRequest, RequestQueueMetadata, UnprocessedRequest, ) from ._client_mixin import 
MetadataUpdateParams, SqlClientMixin from ._db_models import RequestDb, RequestQueueMetadataBufferDb, RequestQueueMetadataDb, RequestQueueStateDb if TYPE_CHECKING: from collections.abc import Sequence from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.sql import ColumnElement from ._storage_client import SqlStorageClient logger = getLogger(__name__) class _QueueMetadataUpdateParams(MetadataUpdateParams): """Parameters for updating queue metadata.""" new_handled_request_count: NotRequired[int] new_pending_request_count: NotRequired[int] new_total_request_count: NotRequired[int] delta_handled_request_count: NotRequired[int] delta_pending_request_count: NotRequired[int] delta_total_request_count: NotRequired[int] recalculate: NotRequired[bool] update_had_multiple_clients: NotRequired[bool] class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin): """SQL implementation of the request queue client. This client persists requests to a SQL database with transaction handling and concurrent access safety. Requests are stored with sequence-based ordering and efficient querying capabilities. The implementation uses negative sequence numbers for forefront (high-priority) requests and positive sequence numbers for regular requests, allowing for efficient single-query ordering. A cache mechanism reduces database queries. The request queue data is stored in SQL database tables following the pattern: - `request_queues` table: Contains queue metadata (id, name, timestamps, request counts, multi-client flag) - `request_queue_records` table: Contains individual requests with JSON data, unique keys for deduplication, sequence numbers for ordering, and processing status flags - `request_queue_state` table: Maintains counters for sequence numbers to ensure proper ordering of requests. 
- `request_queue_metadata_buffer` table: Buffers metadata updates for performance optimization Requests are serialized to JSON for storage and maintain proper ordering through sequence numbers. The implementation provides concurrent access safety through transaction handling, locking mechanisms, and optimized database indexes for efficient querying. """ _DEFAULT_NAME = 'default' """Default dataset name used when no name is provided.""" _MAX_BATCH_FETCH_SIZE = 10 """Maximum number of requests to fetch from the database in a single batch operation. Used to limit the number of requests loaded and locked for processing at once (improves efficiency and reduces database load). """ _METADATA_TABLE = RequestQueueMetadataDb """SQLAlchemy model for request queue metadata.""" _ITEM_TABLE = RequestDb """SQLAlchemy model for request items.""" _CLIENT_TYPE = 'Request queue' """Human-readable client type for error messages.""" _BLOCK_REQUEST_TIME = 300 """Number of seconds for which a request is considered blocked in the database after being fetched for processing. """ _BUFFER_TABLE = RequestQueueMetadataBufferDb """SQLAlchemy model for metadata buffer.""" def __init__( self, *, id: str, storage_client: SqlStorageClient, ) -> None: """Initialize a new instance. Preferably use the `SqlRequestQueueClient.open` class method to create a new instance. """ super().__init__(id=id, storage_client=storage_client) self._pending_fetch_cache: deque[Request] = deque() """Cache for requests: ordered by sequence number.""" self.client_key = crypto_random_object_id(length=32)[:32] """Unique identifier for this client instance.""" self._had_multiple_clients = False """Indicates whether the queue has been accessed by multiple clients.""" @classmethod async def open( cls, *, id: str | None, name: str | None, alias: str | None, storage_client: SqlStorageClient, ) -> Self: """Open an existing request queue or create a new one. This method first tries to find an existing queue by ID or name. 
If found, it returns a client for that queue. If not found, it creates a new queue with the specified parameters. Args: id: The ID of the request queue to open. Takes precedence over name. name: The name of the request queue for named (global scope) storages. alias: The alias of the request queue for unnamed (run scope) storages. storage_client: The SQL storage client used to access the database. Returns: An instance for the opened or created request queue. Raises: ValueError: If a queue with the specified ID is not found. """ return await cls._safely_open( id=id, name=name, alias=alias, storage_client=storage_client, metadata_model=RequestQueueMetadata, extra_metadata_fields={ 'had_multiple_clients': False, 'handled_request_count': 0, 'pending_request_count': 0, 'total_request_count': 0, }, ) @override async def get_metadata(self) -> RequestQueueMetadata: # The database is a single place of truth metadata = await self._get_metadata(RequestQueueMetadata) self._had_multiple_clients = metadata.had_multiple_clients return metadata @override async def drop(self) -> None: """Delete this request queue and all its records from the database. This operation is irreversible. Uses CASCADE deletion to remove all related records. """ await self._drop() self._pending_fetch_cache.clear() @override async def purge(self) -> None: """Remove all items from this dataset while keeping the dataset structure. Resets pending_request_count and handled_request_count to 0 and deletes all records from request_queue_records table. 
""" now = datetime.now(timezone.utc) await self._purge( metadata_kwargs=_QueueMetadataUpdateParams( accessed_at=now, modified_at=now, new_pending_request_count=0, new_handled_request_count=0, new_total_request_count=0, ) ) # Clear recoverable state self._pending_fetch_cache.clear() @override async def add_batch_of_requests( self, requests: Sequence[Request], *, forefront: bool = False, ) -> AddRequestsResponse: if not requests: return AddRequestsResponse(processed_requests=[], unprocessed_requests=[]) # Clear empty cache since we're adding requests processed_requests = [] unprocessed_requests = [] transaction_processed_requests = [] transaction_processed_requests_unique_keys = set() approximate_new_request = 0 # Deduplicate requests by unique_key upfront unique_requests = {} unique_key_by_request_id = {} for req in requests: if req.unique_key not in unique_requests: request_id = self._get_int_id_from_unique_key(req.unique_key) unique_requests[request_id] = req unique_key_by_request_id[request_id] = req.unique_key # Get existing requests by unique keys stmt = ( select(self._ITEM_TABLE) .where( self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id.in_(set(unique_requests.keys())), ) .options( load_only( self._ITEM_TABLE.request_id, self._ITEM_TABLE.is_handled, self._ITEM_TABLE.time_blocked_until, ) ) ) async with self.get_session() as session: result = await session.execute(stmt) result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result existing_requests = {req.request_id: req for req in result.scalars()} state = await self._get_state(session) insert_values: list[dict] = [] for request_id, request in sorted(unique_requests.items()): existing_req_db = existing_requests.get(request_id) # New Request, add it if existing_req_db is None: value = { 'request_id': request_id, 'request_queue_id': self._id, 'data': request.model_dump_json(), 'is_handled': False, } if forefront: value['sequence_number'] = 
state.forefront_sequence_counter state.forefront_sequence_counter -= 1 else: value['sequence_number'] = state.sequence_counter state.sequence_counter += 1 insert_values.append(value) transaction_processed_requests.append( ProcessedRequest( unique_key=request.unique_key, was_already_present=False, was_already_handled=False, ) ) transaction_processed_requests_unique_keys.add(request.unique_key) # Already handled request, skip adding elif existing_req_db and existing_req_db.is_handled: processed_requests.append( ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=True, ) ) # Already in progress in one of the clients elif existing_req_db and existing_req_db.time_blocked_until: processed_requests.append( ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=False, ) ) # Request in database but not yet handled and not in progress elif existing_req_db and not existing_req_db.is_handled and not existing_req_db.time_blocked_until: # Forefront request, update its sequence number if forefront: insert_values.append( { 'request_queue_id': self._id, 'request_id': request_id, 'sequence_number': state.forefront_sequence_counter, 'data': request.model_dump_json(), 'is_handled': False, } ) state.forefront_sequence_counter -= 1 transaction_processed_requests.append( ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=False, ) ) transaction_processed_requests_unique_keys.add(request.unique_key) # Regular request, keep its position else: processed_requests.append( ProcessedRequest( unique_key=request.unique_key, was_already_present=True, was_already_handled=False, ) ) # Unexpected condition else: unprocessed_requests.append( UnprocessedRequest( unique_key=request.unique_key, url=request.url, method=request.method, ) ) try: if insert_values: if forefront: # If the request already exists in the database, we update the sequence_number # by shifting request to the 
left. upsert_stmt = self._build_upsert_stmt( self._ITEM_TABLE, insert_values, update_columns=['sequence_number'], conflict_cols=['request_id', 'request_queue_id'], ) result = await session.execute(upsert_stmt) else: # If the request already exists in the database, we ignore this request when inserting. insert_stmt_with_ignore = self._build_insert_stmt_with_ignore(self._ITEM_TABLE, insert_values) result = await session.execute(insert_stmt_with_ignore) result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result approximate_new_request += result.rowcount await self._add_buffer_record( session, update_modified_at=True, delta_pending_request_count=approximate_new_request, delta_total_request_count=approximate_new_request, ) await session.commit() processed_requests.extend(transaction_processed_requests) except SQLAlchemyError as e: await session.rollback() logger.debug(f'Failed add requests to DB with error: {e}') await self._add_buffer_record( session, update_modified_at=True, recalculate=True, ) await session.commit() transaction_processed_requests.clear() unprocessed_requests.extend( [ UnprocessedRequest( unique_key=request.unique_key, url=request.url, method=request.method, ) for request in requests if request.unique_key in transaction_processed_requests_unique_keys ] ) return AddRequestsResponse( processed_requests=processed_requests, unprocessed_requests=unprocessed_requests, ) @override async def get_request(self, unique_key: str) -> Request | None: request_id = self._get_int_id_from_unique_key(unique_key) stmt = select(self._ITEM_TABLE).where( self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id == request_id ) async with self.get_session(with_simple_commit=True) as session: result = await session.execute(stmt) request_db = result.scalar_one_or_none() if request_db is None: logger.warning(f'Request with ID "{unique_key}" not found in the queue.') return None await self._add_buffer_record(session) return 
Request.model_validate_json(request_db.data)

    @override
    async def fetch_next_request(self) -> Request | None:
        """Return the next request to process, reserving a batch of requests for this client.

        Requests are served from the in-memory `_pending_fetch_cache` first. When the cache is empty, up to
        `_MAX_BATCH_FETCH_SIZE` unhandled requests that are not blocked (or whose block has expired) are selected
        in ascending `sequence_number` order and marked as blocked by this client for `_BLOCK_REQUEST_TIME` seconds.
        The first reserved request is returned and the remaining ones are stored in the cache.

        Returns:
            The next request, or None when no request could be reserved.
        """
        if self._pending_fetch_cache:
            return self._pending_fetch_cache.popleft()

        now = datetime.now(timezone.utc)
        block_until = now + timedelta(seconds=self._BLOCK_REQUEST_TIME)
        dialect = self._storage_client.get_dialect_name()

        # Get available requests not blocked by another client
        stmt = (
            select(self._ITEM_TABLE)
            .where(
                self._ITEM_TABLE.request_queue_id == self._id,
                self._ITEM_TABLE.is_handled == False,  # noqa: E712
                or_(self._ITEM_TABLE.time_blocked_until.is_(None), self._ITEM_TABLE.time_blocked_until < now),
            )
            .order_by(self._ITEM_TABLE.sequence_number.asc())
            .limit(self._MAX_BATCH_FETCH_SIZE)
        )

        async with self.get_session(with_simple_commit=True) as session:
            # We use the `skip_locked` database mechanism to prevent the 'interception' of requests by another client
            if dialect in {'postgresql', 'mysql', 'mariadb'}:
                stmt = stmt.with_for_update(skip_locked=True)
                result = await session.execute(stmt)
                requests_db = result.scalars().all()

                if not requests_db:
                    return None

                # All requests received have already been reserved for update with the help of `skip_locked`.
                request_ids = {r.request_id for r in requests_db}

                update_stmt = (
                    update(self._ITEM_TABLE)
                    .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id.in_(request_ids))
                    .values(time_blocked_until=block_until, client_key=self.client_key)
                )
                await session.execute(update_stmt)

                blocked_ids = request_ids
            else:
                # For other databases, we first select the requests, then try to update them to be blocked.
                result = await session.execute(stmt)
                requests_db = result.scalars().all()

                if not requests_db:
                    return None

                request_ids = {r.request_id for r in requests_db}

                # The availability conditions are repeated in the UPDATE, and only the ids it returns are treated
                # as reserved by this client.
                update_stmt = (
                    update(self._ITEM_TABLE)
                    .where(
                        self._ITEM_TABLE.request_queue_id == self._id,
                        self._ITEM_TABLE.request_id.in_(request_ids),
                        self._ITEM_TABLE.is_handled == False,  # noqa: E712
                        or_(self._ITEM_TABLE.time_blocked_until.is_(None), self._ITEM_TABLE.time_blocked_until < now),
                    )
                    .values(time_blocked_until=block_until, client_key=self.client_key)
                    .returning(self._ITEM_TABLE.request_id)
                )

                update_result = await session.execute(update_stmt)
                blocked_ids = {row[0] for row in update_result.fetchall()}

                if not blocked_ids:
                    await session.rollback()
                    return None

            await self._add_buffer_record(session)

        # Keep the database ordering; only rows that were actually blocked by this client are deserialized.
        requests = [Request.model_validate_json(r.data) for r in requests_db if r.request_id in blocked_ids]

        if not requests:
            return None

        self._pending_fetch_cache.extend(requests[1:])

        return requests[0]

    @override
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Mark the request as handled in the database.

        Sets `request.handled_at` to the current UTC time when it is not set yet, stores the serialized request
        with `is_handled=True`, and clears its `time_blocked_until` and `client_key`.

        Args:
            request: The request to mark as handled. Its `handled_at` attribute may be modified in place.

        Returns:
            A `ProcessedRequest` for the request, or None when no matching row was updated.
        """
        request_id = self._get_int_id_from_unique_key(request.unique_key)

        # Update the request's handled_at timestamp.
        if request.handled_at is None:
            request.handled_at = datetime.now(timezone.utc)

        # Update request in Db
        stmt = (
            update(self._ITEM_TABLE)
            .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id == request_id)
            .values(is_handled=True, time_blocked_until=None, client_key=None, data=request.model_dump_json())
        )
        async with self.get_session(with_simple_commit=True) as session:
            result = await session.execute(stmt)
            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result

            if result.rowcount == 0:
                logger.warning(f'Request {request.unique_key} not found in database.')
                return None

            # One request moves from pending to handled.
            await self._add_buffer_record(
                session, update_modified_at=True, delta_pending_request_count=-1, delta_handled_request_count=1
            )
        return ProcessedRequest(
            unique_key=request.unique_key,
            was_already_present=True,
            was_already_handled=True,
        )

    @override
    async def reclaim_request(
        self,
        request: Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Return the request to the queue with a new sequence number and its updated serialized data.

        Args:
            request: The request to reclaim.
            forefront: If True, the request receives the current forefront sequence number (that counter is
                decremented afterwards), stays blocked by this client for `_BLOCK_REQUEST_TIME` seconds and is put
                at the beginning of the local fetch cache. If False, it receives the current regular sequence
                number (that counter is incremented afterwards) and its block is cleared.

        Returns:
            A `ProcessedRequest` for the request, or None when no matching row was updated.
        """
        request_id = self._get_int_id_from_unique_key(request.unique_key)

        stmt = update(self._ITEM_TABLE).where(
            self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id == request_id
        )

        async with self.get_session(with_simple_commit=True) as session:
            state = await self._get_state(session)

            # Update sequence number if changing priority
            if forefront:
                new_sequence = state.forefront_sequence_counter
                state.forefront_sequence_counter -= 1
                now = datetime.now(timezone.utc)
                block_until = now + timedelta(seconds=self._BLOCK_REQUEST_TIME)
                # Extend blocking for forefront request, it is considered blocked by the current client.
                stmt = stmt.values(
                    sequence_number=new_sequence,
                    time_blocked_until=block_until,
                    client_key=self.client_key,
                    data=request.model_dump_json(),
                )
            else:
                new_sequence = state.sequence_counter
                state.sequence_counter += 1
                stmt = stmt.values(
                    sequence_number=new_sequence,
                    time_blocked_until=None,
                    client_key=None,
                    data=request.model_dump_json(),
                )

            result = await session.execute(stmt)
            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result

            if result.rowcount == 0:
                logger.warning(f'Request {request.unique_key} not found in database.')
                return None
            await self._add_buffer_record(session, update_modified_at=True)

        # put the forefront request at the beginning of the cache
        if forefront:
            self._pending_fetch_cache.appendleft(request)

        return ProcessedRequest(
            unique_key=request.unique_key,
            was_already_present=True,
            was_already_handled=False,
        )

    @override
    async def is_empty(self) -> bool:
        """Check whether the queue has no pending requests.

        Returns:
            False when the local fetch cache holds requests, when the metadata reports pending requests, when the
            metadata buffer is locked, or when buffered updates could still change the pending count; True
            otherwise.
        """
        # Check in-memory cache for requests
        if self._pending_fetch_cache:
            return False

        metadata = await self.get_metadata()

        async with self.get_session(with_simple_commit=True) as session:
            # If there are no pending requests, check if there are any buffered updates
            if metadata.pending_request_count == 0:
                # Check for active buffer lock (indicates pending buffer processing)
                buffer_lock_stmt = select(self._METADATA_TABLE.buffer_locked_until).where(
                    self._METADATA_TABLE.id == self._id
                )
                buffer_lock_result = await session.execute(buffer_lock_stmt)
                buffer_locked_until = buffer_lock_result.scalar()

                # If buffer is locked, there are pending updates being processed
                if buffer_locked_until is not None:
                    await self._add_buffer_record(session)
                    return False

                # Check if there are any buffered updates that might change the pending count
                buffer_check_stmt = select(
                    exists().where(
                        (self._BUFFER_TABLE.storage_id == self._id)
                        & (
                            (self._BUFFER_TABLE.delta_pending_count != 0) | (self._BUFFER_TABLE.need_recalc == True)  # noqa: E712
                        )
                    )
                )
                buffer_result = await session.execute(buffer_check_stmt)
                has_pending_buffer_updates = buffer_result.scalar()

                await self._add_buffer_record(session)
                # If there are no pending requests and no buffered updates, the queue is empty
                return not has_pending_buffer_updates

            # There are pending requests (may be inaccurate), ensure recalculated metadata
            await self._add_buffer_record(session, update_modified_at=True, recalculate=True)

        return False

    async def _get_state(self, session: AsyncSession) -> RequestQueueStateDb:
        """Get the current state of the request queue, creating the state row when it does not exist yet.

        Args:
            session: The SQLAlchemy session used to load and, if needed, insert the state row.

        Returns:
            The ORM state object of this queue.

        Raises:
            RuntimeError: If the state row can be neither found nor created.
        """
        orm_state: RequestQueueStateDb | None = await session.get(RequestQueueStateDb, self._id)
        if not orm_state:
            insert_values = {'request_queue_id': self._id}
            # Create a new state if it doesn't exist
            # This is a safeguard against race conditions where multiple clients might try to create the state
            # simultaneously.
            insert_stmt = self._build_insert_stmt_with_ignore(RequestQueueStateDb, insert_values)
            await session.execute(insert_stmt)
            await session.flush()
            orm_state = await session.get(RequestQueueStateDb, self._id)
            if not orm_state:
                raise RuntimeError(f'Failed to create or retrieve state for queue {self._id}')
        return orm_state

    @override
    def _specific_update_metadata(
        self,
        new_handled_request_count: int | None = None,
        new_pending_request_count: int | None = None,
        new_total_request_count: int | None = None,
        delta_handled_request_count: int | None = None,
        delta_pending_request_count: int | None = None,
        delta_total_request_count: int | None = None,
        *,
        recalculate: bool = False,
        update_had_multiple_clients: bool = False,
        **_kwargs: Any,
    ) -> dict[str, Any]:
        """Build the request queue specific values for a metadata update.

        For each counter, the absolute `new_*` value takes precedence over the corresponding `delta_*` value.
        When `recalculate` is True, all `new_*` and `delta_*` arguments are ignored.

        Args:
            new_handled_request_count: If provided, update the handled_request_count to this value.
            new_pending_request_count: If provided, update the pending_request_count to this value.
            new_total_request_count: If provided, update the total_request_count to this value.
            delta_handled_request_count: If provided, add this value to the handled_request_count.
            delta_pending_request_count: If provided, add this value to the pending_request_count.
            delta_total_request_count: If provided, add this value to the total_request_count.
            recalculate: If True, recalculate the pending_request_count, total_request_count and
                handled_request_count from the request table.
            update_had_multiple_clients: If True, set had_multiple_clients to True.

        Returns:
            A mapping of metadata column names to the values (or SQL expressions) to set. When `recalculate` is
            True, the mapping contains the key 'custom_stmt' holding an UPDATE statement that recomputes the
            counters.
        """
        values_to_set: dict[str, Any] = {}

        if update_had_multiple_clients:
            values_to_set['had_multiple_clients'] = True

        if recalculate:
            # Recompute all three counters by counting rows of this queue in the item table.
            stmt = (
                update(self._METADATA_TABLE)
                .where(self._METADATA_TABLE.request_queue_id == self._id)
                .values(
                    pending_request_count=(
                        select(func.count())
                        .select_from(self._ITEM_TABLE)
                        .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.is_handled.is_(False))
                        .scalar_subquery()
                    ),
                    total_request_count=(
                        select(func.count())
                        .select_from(self._ITEM_TABLE)
                        .where(self._ITEM_TABLE.request_queue_id == self._id)
                        .scalar_subquery()
                    ),
                    handled_request_count=(
                        select(func.count())
                        .select_from(self._ITEM_TABLE)
                        .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.is_handled.is_(True))
                        .scalar_subquery()
                    ),
                )
            )

            values_to_set['custom_stmt'] = stmt

        else:
            if new_handled_request_count is not None:
                values_to_set['handled_request_count'] = new_handled_request_count
            elif delta_handled_request_count is not None:
                values_to_set['handled_request_count'] = (
                    self._METADATA_TABLE.handled_request_count + delta_handled_request_count
                )

            if new_pending_request_count is not None:
                values_to_set['pending_request_count'] = new_pending_request_count
            elif delta_pending_request_count is not None:
                values_to_set['pending_request_count'] = (
                    self._METADATA_TABLE.pending_request_count + delta_pending_request_count
                )

            if new_total_request_count is not None:
                values_to_set['total_request_count'] = new_total_request_count
            elif delta_total_request_count is not None:
                values_to_set['total_request_count'] = (
                    self._METADATA_TABLE.total_request_count + delta_total_request_count
                )

        return values_to_set

    @staticmethod
    @lru_cache(maxsize=10000)
    def _get_int_id_from_unique_key(unique_key: str) -> int:
        """Generate a deterministic integer ID for a unique_key.

        The ID is the first 15 hexadecimal digits of the SHA-256 digest of the UTF-8 encoded key, interpreted as
        an integer. Results are memoized by `lru_cache`.

        Args:
            unique_key: Unique key to be used to generate ID.

        Returns:
            An integer ID based on the unique_key.
        """
        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()
        # 15 hex digits are 60 bits, so the value is always below 2**60.
        name_length = 15
        return int(hashed_key[:name_length], 16)

    @override
    def _prepare_buffer_data(
        self,
        delta_handled_request_count: int | None = None,
        delta_pending_request_count: int | None = None,
        delta_total_request_count: int | None = None,
        *,
        recalculate: bool = False,
        **_kwargs: Any,
    ) -> dict[str, Any]:
        """Prepare request queue specific buffer data.

        Args:
            delta_handled_request_count: If provided, add this value to the handled_request_count.
            delta_pending_request_count: If provided, add this value to the pending_request_count.
            delta_total_request_count: If provided, add this value to the total_request_count.
            recalculate: If True, recalculate the pending_request_count, and total_request_count on request table.

        Returns:
            The buffer record values. It always contains 'client_id'; deltas that are None or zero are omitted,
            and 'need_recalc' is present only when `recalculate` is True.
        """
        buffer_data: dict[str, Any] = {
            'client_id': self.client_key,
        }

        if delta_handled_request_count:
            buffer_data['delta_handled_count'] = delta_handled_request_count

        if delta_pending_request_count:
            buffer_data['delta_pending_count'] = delta_pending_request_count

        if delta_total_request_count:
            buffer_data['delta_total_count'] = delta_total_request_count

        if recalculate:
            buffer_data['need_recalc'] = True

        return buffer_data

    @override
    async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None:
        """Aggregate the buffered records of this queue and apply them to its metadata.

        Args:
            session: The SQLAlchemy session used for the aggregation query and the metadata update.
            max_buffer_id: Only buffer records with an id lower than or equal to this value are aggregated.
        """
        aggregations: list[ColumnElement[Any]] = [
            sql_func.max(self._BUFFER_TABLE.accessed_at).label('max_accessed_at'),
            sql_func.max(self._BUFFER_TABLE.modified_at).label('max_modified_at'),
            sql_func.sum(self._BUFFER_TABLE.delta_handled_count).label('delta_handled_count'),
            sql_func.sum(self._BUFFER_TABLE.delta_pending_count).label('delta_pending_count'),
            sql_func.sum(self._BUFFER_TABLE.delta_total_count).label('delta_total_count'),
        ]

        # The distinct client count is only queried while the queue is not yet known to have multiple clients.
        if not self._had_multiple_clients:
            aggregations.append(
                sql_func.count(sql_func.distinct(self._BUFFER_TABLE.client_id)).label('unique_clients_count')
            )

        # PostgreSQL gets `bool_or` for the boolean column; the other dialects get `max`.
        if self._storage_client.get_dialect_name() == 'postgresql':
            aggregations.append(sql_func.bool_or(self._BUFFER_TABLE.need_recalc).label('need_recalc'))
        else:
            aggregations.append(sql_func.max(self._BUFFER_TABLE.need_recalc).label('need_recalc'))

        aggregation_stmt = select(*aggregations).where(
            self._BUFFER_TABLE.storage_id == self._id, self._BUFFER_TABLE.id <= max_buffer_id
        )

        result = await session.execute(aggregation_stmt)
        row = result.first()

        if not row:
            return

        await self._update_metadata(
            session,
            **_QueueMetadataUpdateParams(
                accessed_at=row.max_accessed_at,
                modified_at=row.max_modified_at,
                # `unique_clients_count` exists on the row only when it was requested above; the `and`
                # short-circuits otherwise.
                update_had_multiple_clients=not self._had_multiple_clients and row.unique_clients_count > 1,
                delta_handled_request_count=row.delta_handled_count,
                delta_pending_request_count=row.delta_pending_count,
                delta_total_request_count=row.delta_total_count,
                recalculate=bool(row.need_recalc),
            ),
        )
================================================ FILE: src/crawlee/storage_clients/_sql/_storage_client.py ================================================ from __future__ import annotations import warnings from logging import getLogger from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.asyncio import AsyncEngine, async_sessionmaker, create_async_engine from sqlalchemy.sql import insert, select, text from typing_extensions import override from crawlee._utils.docs import docs_group from crawlee.configuration import Configuration from crawlee.storage_clients._base import StorageClient from ._dataset_client import SqlDatasetClient from ._db_models import Base, VersionDb from ._key_value_store_client import SqlKeyValueStoreClient from ._request_queue_client import SqlRequestQueueClient if TYPE_CHECKING: from types import TracebackType from sqlalchemy.ext.asyncio import AsyncSession logger = getLogger(__name__) @docs_group('Storage clients') class SqlStorageClient(StorageClient): """SQL implementation of the storage client. This storage client provides access to datasets, key-value stores, and request queues that persist data to a SQL database using SQLAlchemy 2+. Each storage type uses two tables: one for metadata and one for records. The client accepts either a database connection string or a pre-configured AsyncEngine. If neither is provided, it creates a default SQLite database 'crawlee.db' in the storage directory. Database schema is automatically created during initialization. SQLite databases receive performance optimizations including WAL mode and increased cache size. Warning: This is an experimental feature. The behavior and interface may change in future versions. 
""" _DEFAULT_DB_NAME = 'crawlee.db' """Default database name if not specified in connection string.""" _SUPPORTED_DIALECTS: ClassVar[set[str]] = {'sqlite', 'postgresql', 'mysql', 'mariadb'} def __init__( self, *, connection_string: str | None = None, engine: AsyncEngine | None = None, ) -> None: """Initialize the SQL storage client. Args: connection_string: Database connection string (e.g., "sqlite+aiosqlite:///crawlee.db"). If not provided, defaults to SQLite database in the storage directory. engine: Pre-configured AsyncEngine instance. If provided, connection_string is ignored. """ if engine is not None and connection_string is not None: raise ValueError('Either connection_string or engine must be provided, not both.') self._connection_string = connection_string self._engine = engine self._initialized = False self.session_maker: None | async_sessionmaker[AsyncSession] = None # Flag needed to apply optimizations only for default database self._default_flag = self._engine is None and self._connection_string is None self._dialect_name: str | None = None # Call the notification only once warnings.warn( 'The SqlStorageClient is experimental and may change or be removed in future releases.', category=UserWarning, stacklevel=2, ) async def __aenter__(self) -> SqlStorageClient: """Async context manager entry.""" return self async def __aexit__( self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None, ) -> None: """Async context manager exit.""" await self.close() @property def engine(self) -> AsyncEngine: """Get the SQLAlchemy AsyncEngine instance.""" if self._engine is None: raise ValueError('Engine is not initialized. Call initialize() before accessing the engine.') return self._engine def get_dialect_name(self) -> str | None: """Get the database dialect name.""" return self._dialect_name async def initialize(self, configuration: Configuration) -> None: """Initialize the database schema. 
This method creates all necessary tables if they don't exist. Should be called before using the storage client. """ if not self._initialized: engine = self._get_or_create_engine(configuration) async with engine.begin() as conn: self._dialect_name = engine.dialect.name if self._dialect_name not in self._SUPPORTED_DIALECTS: raise ValueError( f'Unsupported database dialect: {self._dialect_name}. Supported: ' f'{", ".join(self._SUPPORTED_DIALECTS)}. Consider using a different database.', ) # Create tables if they don't exist. # Rollback the transaction when an exception occurs. # This is likely an attempt to create a database from several parallel processes. try: # Set SQLite pragmas for performance and consistency if self._default_flag: await conn.execute(text('PRAGMA journal_mode=WAL')) # Better concurrency await conn.execute(text('PRAGMA synchronous=NORMAL')) # Balanced safety/speed await conn.execute(text('PRAGMA cache_size=100000')) # 100MB cache await conn.execute(text('PRAGMA temp_store=MEMORY')) # Memory temp storage await conn.execute(text('PRAGMA mmap_size=268435456')) # 256MB memory mapping await conn.execute(text('PRAGMA foreign_keys=ON')) # Enforce constraints await conn.execute(text('PRAGMA busy_timeout=30000')) # 30s busy timeout await conn.run_sync(Base.metadata.create_all, checkfirst=True) from crawlee import __version__ # Noqa: PLC0415 db_version = (await conn.execute(select(VersionDb))).scalar_one_or_none() # Raise an error if the new version creates breaking changes in the database schema. if db_version and db_version != __version__: warnings.warn( f'Database version {db_version} does not match library version {__version__}. ' 'This may lead to unexpected behavior. 
Drop the db if you want to make sure that ' 'everything will work fine.', category=UserWarning, stacklevel=2, ) elif not db_version: await conn.execute(insert(VersionDb).values(version=__version__)) except (IntegrityError, OperationalError): await conn.rollback() self._initialized = True async def close(self) -> None: """Close the database connection pool.""" if self._engine is not None: await self._engine.dispose() self._engine = None def create_session(self) -> AsyncSession: """Create a new database session. Returns: A new AsyncSession instance. """ if self.session_maker is None: self.session_maker = async_sessionmaker(self._engine, expire_on_commit=False, autoflush=False) return self.session_maker() @override async def create_dataset_client( self, *, id: str | None = None, name: str | None = None, alias: str | None = None, configuration: Configuration | None = None, ) -> SqlDatasetClient: configuration = configuration or Configuration.get_global_configuration() await self.initialize(configuration) client = await SqlDatasetClient.open( id=id, name=name, alias=alias, storage_client=self, ) await self._purge_if_needed(client, configuration) return client @override async def create_kvs_client( self, *, id: str | None = None, name: str | None = None, alias: str | None = None, configuration: Configuration | None = None, ) -> SqlKeyValueStoreClient: configuration = configuration or Configuration.get_global_configuration() await self.initialize(configuration) client = await SqlKeyValueStoreClient.open( id=id, name=name, alias=alias, storage_client=self, ) await self._purge_if_needed(client, configuration) return client @override async def create_rq_client( self, *, id: str | None = None, name: str | None = None, alias: str | None = None, configuration: Configuration | None = None, ) -> SqlRequestQueueClient: configuration = configuration or Configuration.get_global_configuration() await self.initialize(configuration) client = await SqlRequestQueueClient.open( id=id, 
name=name, alias=alias, storage_client=self, ) await self._purge_if_needed(client, configuration) return client def _get_or_create_engine(self, configuration: Configuration) -> AsyncEngine: """Get or create the database engine based on configuration.""" if self._engine is not None: return self._engine if self._connection_string is not None: connection_string = self._connection_string else: # Create SQLite database in the storage directory storage_dir = Path(configuration.storage_dir) if not storage_dir.exists(): storage_dir.mkdir(parents=True, exist_ok=True) db_path = storage_dir / self._DEFAULT_DB_NAME # Create connection string with path to default database connection_string = f'sqlite+aiosqlite:///{db_path}' if not any(connection_string.startswith(dialect) for dialect in self._SUPPORTED_DIALECTS): raise ValueError( f'Unsupported database. Supported: {", ".join(self._SUPPORTED_DIALECTS)}. Consider using a different ' 'database.' ) kwargs: dict[str, Any] = {} if 'mysql' in connection_string or 'mariadb' in connection_string: connect_args: dict[str, Any] = {'connect_timeout': 30} # MySQL/MariaDB require READ COMMITTED isolation level for correct behavior in concurrent environments # without deadlocks. 
kwargs['isolation_level'] = 'READ COMMITTED' else: connect_args = {'timeout': 30} self._engine = create_async_engine( connection_string, future=True, pool_size=5, max_overflow=10, pool_timeout=60, pool_recycle=600, pool_pre_ping=True, echo=False, connect_args=connect_args, **kwargs, ) return self._engine ================================================ FILE: src/crawlee/storage_clients/_sql/py.typed ================================================ ================================================ FILE: src/crawlee/storage_clients/models.py ================================================ from __future__ import annotations from datetime import datetime from typing import TYPE_CHECKING, Annotated, Any, Generic from pydantic import BaseModel, BeforeValidator, ConfigDict, Field from typing_extensions import TypeVar from crawlee._types import HttpMethod from crawlee._utils.docs import docs_group from crawlee._utils.urls import validate_http_url KvsValueType = TypeVar('KvsValueType', default=Any) @docs_group('Storage data') class StorageMetadata(BaseModel): """Represents the base model for storage metadata. It contains common fields shared across all specific storage types. 
""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, extra='allow', from_attributes=True) id: Annotated[str, Field(alias='id')] """The unique identifier of the storage.""" name: Annotated[str | None, Field(alias='name', default=None)] """The name of the storage.""" accessed_at: Annotated[datetime, Field(alias='accessedAt')] """The timestamp when the storage was last accessed.""" created_at: Annotated[datetime, Field(alias='createdAt')] """The timestamp when the storage was created.""" modified_at: Annotated[datetime, Field(alias='modifiedAt')] """The timestamp when the storage was last modified.""" @docs_group('Storage data') class DatasetMetadata(StorageMetadata): """Model for a dataset metadata.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) item_count: Annotated[int, Field(alias='itemCount')] """The number of items in the dataset.""" @docs_group('Storage data') class KeyValueStoreMetadata(StorageMetadata): """Model for a key-value store metadata.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) @docs_group('Storage data') class RequestQueueMetadata(StorageMetadata): """Model for a request queue metadata.""" model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] """Indicates whether the queue has been accessed by multiple clients (consumers).""" handled_request_count: Annotated[int, Field(alias='handledRequestCount')] """The number of requests that have been handled from the queue.""" pending_request_count: Annotated[int, Field(alias='pendingRequestCount')] """The number of requests that are still pending in the queue.""" total_request_count: Annotated[int, Field(alias='totalRequestCount')] """The total number of requests that have been added to the queue.""" @docs_group('Storage data') class 
KeyValueStoreRecordMetadata(BaseModel):
    """Model for a key-value store record metadata."""

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    key: Annotated[str, Field(alias='key')]
    """The key of the record.

    A unique identifier for the record in the key-value store.
    """

    content_type: Annotated[str, Field(alias='contentType')]
    """The MIME type of the record.

    Describe the format and type of data stored in the record, following the MIME specification.
    """

    size: Annotated[int | None, Field(alias='size', default=None)] = None
    """The size of the record in bytes."""


@docs_group('Storage data')
class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
    """Model for a key-value store record."""

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    value: Annotated[KvsValueType, Field(alias='value')]
    """The value of the record."""


@docs_group('Storage data')
class DatasetItemsListPage(BaseModel):
    """Model for a single page of dataset items returned from a collection list method."""

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    count: Annotated[int, Field(default=0)]
    """The number of objects returned on this page."""

    offset: Annotated[int, Field(default=0)]
    """The starting position of the first object returned, as specified in the API call."""

    limit: Annotated[int, Field(default=0)]
    """The maximum number of objects to return, as specified in the API call."""

    total: Annotated[int, Field(default=0)]
    """The total number of objects that match the criteria of the API call."""

    desc: Annotated[bool, Field(default=False)]
    """Indicates if the returned list is in descending order."""

    # Workaround for Pydantic and type checkers when using Annotated with default_factory
    if TYPE_CHECKING:
        items: list[dict] = []
        """The list of dataset items returned on this page."""
    else:
        items: Annotated[list[dict], Field(default_factory=list)]
        """The list of dataset items returned on this page."""


@docs_group('Storage data')
class ProcessedRequest(BaseModel):
    """Represents a processed request."""

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    id: Annotated[str | None, Field(alias='requestId', default=None)] = None
    """Internal representation of the request by the storage client. Only some clients use id."""

    # Unique key of the request this result belongs to.
    unique_key: Annotated[str, Field(alias='uniqueKey')]
    # Whether the request was already stored in the queue before the operation that produced this result.
    was_already_present: Annotated[bool, Field(alias='wasAlreadyPresent')]
    # Whether the request is stored as handled.
    was_already_handled: Annotated[bool, Field(alias='wasAlreadyHandled')]


@docs_group('Storage data')
class UnprocessedRequest(BaseModel):
    """Represents an unprocessed request."""

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    # Unique key of the request that could not be processed.
    unique_key: Annotated[str, Field(alias='uniqueKey')]
    # URL of the request; `validate_http_url` runs before Pydantic's own validation of the field.
    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
    # HTTP method of the request, if known.
    method: Annotated[HttpMethod | None, Field()] = None


@docs_group('Storage data')
class AddRequestsResponse(BaseModel):
    """Model for a response to add requests to a queue.

    Contains detailed information about the processing results when adding multiple requests
    to a queue. This includes which requests were successfully processed and which ones
    encountered issues during processing.
    """

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)

    processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')]
    """Successfully processed requests, including information about whether they were
    already present in the queue and whether they had been handled previously."""

    unprocessed_requests: Annotated[list[UnprocessedRequest], Field(alias='unprocessedRequests')]
    """Requests that could not be processed, typically due to validation errors or other issues."""

================================================
FILE: src/crawlee/storage_clients/py.typed
================================================

================================================
FILE: src/crawlee/storages/__init__.py
================================================
from ._dataset import Dataset
from ._key_value_store import KeyValueStore
from ._request_queue import RequestQueue

__all__ = [
    'Dataset',
    'KeyValueStore',
    'RequestQueue',
]

================================================
FILE: src/crawlee/storages/_base.py
================================================
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
    from crawlee.configuration import Configuration
    from crawlee.storage_clients._base import StorageClient
    from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata


@docs_group('Storages')
class Storage(ABC):
    """Base class for storages.

    Defines the interface shared by the storages: identification (`id`, `name`), metadata access, opening,
    dropping and purging.
    """

    @property
    @abstractmethod
    def id(self) -> str:
        """Get the storage ID."""

    @property
    @abstractmethod
    def name(self) -> str | None:
        """Get the storage name, or None when the storage has no name."""

    @abstractmethod
    async def get_metadata(self) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:
        """Get the storage metadata."""

    @classmethod
    @abstractmethod
    async def open(
        cls,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
        storage_client: StorageClient | None = None,
    ) -> Storage:
        """Open a storage, either restore existing or create a new one.

        Args:
            id: The storage ID.
            name: The storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
                the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
                (e.g. "my-value-1").
            alias: The storage alias (run scope, creates unnamed storage).
            configuration: Configuration object used during the storage creation or restoration process.
            storage_client: Underlying storage client to use. If not provided, the default global storage client
                from the service locator will be used.

        Returns:
            The opened storage instance.
        """

    @abstractmethod
    async def drop(self) -> None:
        """Drop the storage, removing it from the underlying storage client and clearing the cache."""

    @abstractmethod
    async def purge(self) -> None:
        """Purge the storage, removing all items from the underlying storage client.

        This method does not remove the storage itself (for example, its metadata is kept); it only clears
        all items within it.
        """

================================================
FILE: src/crawlee/storages/_dataset.py
================================================
from __future__ import annotations

import logging
from io import StringIO
from typing import TYPE_CHECKING, overload

from typing_extensions import override

from crawlee import service_locator
from crawlee._utils.docs import docs_group
from crawlee._utils.file import export_csv_to_stream, export_json_to_stream

from ._base import Storage
from ._key_value_store import KeyValueStore
from ._utils import validate_storage_name

if TYPE_CHECKING:
    from collections.abc import AsyncIterator
    from typing import Any, Literal

    from typing_extensions import Unpack

    from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs
    from crawlee.configuration import Configuration
    from crawlee.storage_clients import StorageClient
    from crawlee.storage_clients._base import DatasetClient
    from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

logger = logging.getLogger(__name__)


@docs_group('Storages')
class Dataset(Storage):
    """Dataset is a storage for managing structured tabular data.

    The dataset class provides a high-level interface for storing and retrieving structured data
    with consistent schema, similar to database tables or spreadsheets. It abstracts the underlying
    storage implementation details, offering a consistent API regardless of where the data is
    physically stored.

    Dataset operates in an append-only mode, allowing new records to be added but not modified
    or deleted after creation. This makes it particularly suitable for storing crawling results
    and other data that should be immutable once collected.

    The class provides methods for adding data, retrieving data with various filtering options,
    and exporting data to different formats. You can create a dataset using the `open` class method,
    specifying either a name or ID. The underlying storage implementation is determined by
    the configured storage client.
### Usage ```python from crawlee.storages import Dataset # Open a dataset dataset = await Dataset.open(name='my-dataset') # Add data await dataset.push_data({'title': 'Example Product', 'price': 99.99}) # Retrieve filtered data results = await dataset.get_data(limit=10, desc=True) # Export data await dataset.export_to('results.json', content_type='json') ``` """ def __init__(self, client: DatasetClient, id: str, name: str | None) -> None: """Initialize a new instance. Preferably use the `Dataset.open` constructor to create a new instance. Args: client: An instance of a storage client. id: The unique identifier of the storage. name: The name of the storage, if available. """ validate_storage_name(name) self._client = client self._id = id self._name = name @property @override def id(self) -> str: return self._id @property @override def name(self) -> str | None: return self._name @override async def get_metadata(self) -> DatasetMetadata: return await self._client.get_metadata() @override @classmethod async def open( cls, *, id: str | None = None, name: str | None = None, alias: str | None = None, configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> Dataset: configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client client_opener_coro = storage_client.create_dataset_client( id=id, name=name, alias=alias, configuration=configuration ) storage_client_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration) return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, alias=alias, client_opener_coro=client_opener_coro, storage_client_cache_key=storage_client_cache_key, ) @override async def drop(self) -> None: storage_instance_manager = service_locator.storage_instance_manager storage_instance_manager.remove_from_cache(self) await 
self._client.drop() @override async def purge(self) -> None: await self._client.purge() async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: """Store an object or an array of objects to the dataset. The size of the data is limited by the receiving API and therefore `push_data()` will only allow objects whose JSON representation is smaller than 9MB. When an array is passed, none of the included objects may be larger than 9MB, but the array itself may be of any size. Args: data: A JSON serializable data structure to be stored in the dataset. The JSON representation of each item must be smaller than 9MB. """ await self._client.push_data(data=data) async def get_data( self, *, offset: int = 0, limit: int | None = 999_999_999_999, clean: bool = False, desc: bool = False, fields: list[str] | None = None, omit: list[str] | None = None, unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, flatten: list[str] | None = None, view: str | None = None, ) -> DatasetItemsListPage: """Retrieve a paginated list of items from a dataset based on various filtering parameters. This method provides the flexibility to filter, sort, and modify the appearance of dataset items when listed. Each parameter modifies the result set according to its purpose. The method also supports pagination through 'offset' and 'limit' parameters. Args: offset: Skips the specified number of items at the start. limit: The maximum number of items to retrieve. Unlimited if None. clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty. desc: Set to True to sort results in descending order. fields: Fields to include in each item. Sorts fields as specified if provided. omit: Fields to exclude from each item. unwind: Unwinds items by a specified array field, turning each element into a separate item. skip_empty: Excludes empty items from the results if True. 
async def iterate_items(
    self,
    *,
    offset: int = 0,
    limit: int | None = 999_999_999_999,
    clean: bool = False,
    desc: bool = False,
    fields: list[str] | None = None,
    omit: list[str] | None = None,
    unwind: list[str] | None = None,
    skip_empty: bool = False,
    skip_hidden: bool = False,
) -> AsyncIterator[dict[str, Any]]:
    """Asynchronously yield dataset items, one by one, with the requested filters applied.

    All arguments are forwarded unchanged to the underlying dataset client, which performs the actual filtering,
    sorting and pagination. Every item the client produces is re-yielded as is.

    Args:
        offset: Skips the specified number of items at the start.
        limit: The maximum number of items to retrieve. Unlimited if None.
        clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.
        desc: Set to True to sort results in descending order.
        fields: Fields to include in each item. Sorts fields as specified if provided.
        omit: Fields to exclude from each item.
        unwind: Unwinds items by a specified array field, turning each element into a separate item.
        skip_empty: Excludes empty items from the results if True.
        skip_hidden: Excludes fields starting with '#' if True.

    Yields:
        Dictionary objects, each representing a dataset item after applying the specified filters
        and transformations.
    """
    # Collect the query once so the delegation below stays a one-liner.
    query = {
        'offset': offset,
        'limit': limit,
        'clean': clean,
        'desc': desc,
        'fields': fields,
        'omit': omit,
        'unwind': unwind,
        'skip_empty': skip_empty,
        'skip_hidden': skip_hidden,
    }

    async for record in self._client.iterate_items(**query):
        yield record
""" return [ item async for item in self.iterate_items( offset=offset, limit=limit, clean=clean, desc=desc, fields=fields, omit=omit, unwind=unwind, skip_empty=skip_empty, skip_hidden=skip_hidden, ) ] @overload async def export_to( self, key: str, content_type: Literal['json'], to_kvs_id: str | None = None, to_kvs_name: str | None = None, to_kvs_storage_client: StorageClient | None = None, to_kvs_configuration: Configuration | None = None, **kwargs: Unpack[ExportDataJsonKwargs], ) -> None: ... @overload async def export_to( self, key: str, content_type: Literal['csv'], to_kvs_id: str | None = None, to_kvs_name: str | None = None, to_kvs_storage_client: StorageClient | None = None, to_kvs_configuration: Configuration | None = None, **kwargs: Unpack[ExportDataCsvKwargs], ) -> None: ... async def export_to( self, key: str, content_type: Literal['json', 'csv'] = 'json', to_kvs_id: str | None = None, to_kvs_name: str | None = None, to_kvs_storage_client: StorageClient | None = None, to_kvs_configuration: Configuration | None = None, **kwargs: Any, ) -> None: """Export the entire dataset into a specified file stored under a key in a key-value store. This method consolidates all entries from a specified dataset into one file, which is then saved under a given key in a key-value store. The format of the exported file is determined by the `content_type` parameter. Either the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or name should be used. Args: key: The key under which to save the data in the key-value store. content_type: The format in which to export the data. to_kvs_id: ID of the key-value store to save the exported file. Specify only one of ID or name. to_kvs_name: Name of the key-value store to save the exported file. Specify only one of ID or name. to_kvs_storage_client: Storage client to use for the key-value store. to_kvs_configuration: Configuration for the key-value store. 
kwargs: Additional parameters for the export operation, specific to the chosen content type. """ kvs = await KeyValueStore.open( id=to_kvs_id, name=to_kvs_name, configuration=to_kvs_configuration, storage_client=to_kvs_storage_client, ) dst = StringIO() if content_type == 'csv': await export_csv_to_stream(self.iterate_items(), dst, **kwargs) await kvs.set_value(key, dst.getvalue(), 'text/csv') elif content_type == 'json': await export_json_to_stream(self.iterate_items(), dst, **kwargs) await kvs.set_value(key, dst.getvalue(), 'application/json') else: raise ValueError('Unsupported content type, expecting CSV or JSON') ================================================ FILE: src/crawlee/storages/_key_value_store.py ================================================ from __future__ import annotations import asyncio from collections.abc import AsyncIterator from logging import getLogger from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, overload from pydantic import RootModel from typing_extensions import override from crawlee import service_locator from crawlee._types import JsonSerializable # noqa: TC001 from crawlee._utils.docs import docs_group from crawlee._utils.recoverable_state import RecoverableState from crawlee.storage_clients.models import KeyValueStoreMetadata from ._base import Storage from ._utils import validate_storage_name if TYPE_CHECKING: from collections.abc import AsyncIterator from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata else: from crawlee._utils.recoverable_state import RecoverableState T = TypeVar('T') logger = getLogger(__name__) class AutosavedValue(RootModel): root: dict[str, JsonSerializable] @docs_group('Storages') class KeyValueStore(Storage): """Key-value store is a storage for reading and writing data records with unique key 
identifiers. The key-value store class acts as a high-level interface for storing, retrieving, and managing data records identified by unique string keys. It abstracts away the underlying storage implementation details, allowing you to work with the same API regardless of whether data is stored in memory, on disk, or in the cloud. Each data record is associated with a specific MIME content type, allowing storage of various data formats such as JSON, text, images, HTML snapshots or any binary data. This class is commonly used to store inputs, outputs, and other artifacts of crawler operations. You can instantiate a key-value store using the `open` class method, which will create a store with the specified name or id. The underlying storage implementation is determined by the configured storage client. ### Usage ```python from crawlee.storages import KeyValueStore # Open a named key-value store kvs = await KeyValueStore.open(name='my-store') # Store and retrieve data await kvs.set_value('product-1234.json', [{'name': 'Smartphone', 'price': 799.99}]) product = await kvs.get_value('product-1234') ``` """ _autosaved_values: ClassVar[ dict[ str, dict[str, RecoverableState[AutosavedValue]], ] ] = {} """Cache for recoverable (auto-saved) values.""" def __init__(self, client: KeyValueStoreClient, id: str, name: str | None) -> None: """Initialize a new instance. Preferably use the `KeyValueStore.open` constructor to create a new instance. Args: client: An instance of a storage client. id: The unique identifier of the storage. name: The name of the storage, if available. 
""" validate_storage_name(name) self._client = client self._id = id self._name = name self._autosave_lock = asyncio.Lock() """Lock for autosaving values to prevent concurrent modifications.""" @property @override def id(self) -> str: return self._id @property @override def name(self) -> str | None: return self._name @override async def get_metadata(self) -> KeyValueStoreMetadata: return await self._client.get_metadata() @override @classmethod async def open( cls, *, id: str | None = None, name: str | None = None, alias: str | None = None, configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> KeyValueStore: configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client client_opener_coro = storage_client.create_kvs_client( id=id, name=name, alias=alias, configuration=configuration ) additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration) return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, alias=alias, client_opener_coro=client_opener_coro, storage_client_cache_key=additional_cache_key, ) @override async def drop(self) -> None: storage_instance_manager = service_locator.storage_instance_manager storage_instance_manager.remove_from_cache(self) await self._clear_cache() # Clear cache with persistent values. await self._client.drop() @override async def purge(self) -> None: await self._client.purge() @overload async def get_value(self, key: str) -> Any: ... @overload async def get_value(self, key: str, default_value: T) -> T: ... @overload async def get_value(self, key: str, default_value: T | None = None) -> T | None: ... async def get_value(self, key: str, default_value: T | None = None) -> T | None: """Get a value from the KVS. Args: key: Key of the record to retrieve. 
default_value: Default value returned in case the record does not exist. Returns: The value associated with the given key. `default_value` is used in case the record does not exist. """ record = await self._client.get_value(key=key) return record.value if record else default_value async def set_value( self, key: str, value: Any, content_type: str | None = None, ) -> None: """Set a value in the KVS. Args: key: Key of the record to set. value: Value to set. content_type: The MIME content type string. """ await self._client.set_value(key=key, value=value, content_type=content_type) async def delete_value(self, key: str) -> None: """Delete a value from the KVS. Args: key: Key of the record to delete. """ await self._client.delete_value(key=key) async def iterate_keys( self, exclusive_start_key: str | None = None, limit: int | None = None, ) -> AsyncIterator[KeyValueStoreRecordMetadata]: """Iterate over the existing keys in the KVS. Args: exclusive_start_key: Key to start the iteration from. limit: Maximum number of keys to return. None means no limit. Yields: Information about the key. """ async for item in self._client.iterate_keys( exclusive_start_key=exclusive_start_key, limit=limit, ): yield item async def list_keys( self, exclusive_start_key: str | None = None, limit: int = 1000, ) -> list[KeyValueStoreRecordMetadata]: """List all the existing keys in the KVS. It uses client's `iterate_keys` method to get the keys. Args: exclusive_start_key: Key to start the iteration from. limit: Maximum number of keys to return. Returns: A list of keys in the KVS. """ return [ key async for key in self._client.iterate_keys( exclusive_start_key=exclusive_start_key, limit=limit, ) ] async def record_exists(self, key: str) -> bool: """Check if a record with the given key exists in the key-value store. Args: key: Key of the record to check for existence. Returns: True if a record with the given key exists, False otherwise. 
""" return await self._client.record_exists(key=key) async def get_public_url(self, key: str) -> str: """Get the public URL for the given key. Args: key: Key of the record for which URL is required. Returns: The public URL for the given key. """ return await self._client.get_public_url(key=key) async def get_auto_saved_value( self, key: str, default_value: dict[str, JsonSerializable] | None = None, ) -> dict[str, JsonSerializable]: """Get a value from KVS that will be automatically saved on changes. Args: key: Key of the record, to store the value. default_value: Value to be used if the record does not exist yet. Should be a dictionary. Returns: Return the value of the key. """ default_value = {} if default_value is None else default_value async with self._autosave_lock: cache = self._autosaved_values.setdefault(self.id, {}) if key in cache: return cache[key].current_value.root async def kvs_factory() -> KeyValueStore: return self cache[key] = recoverable_state = RecoverableState( default_state=AutosavedValue(default_value), persist_state_key=key, persistence_enabled=True, persist_state_kvs_factory=kvs_factory, logger=logger, ) await recoverable_state.initialize() return recoverable_state.current_value.root async def persist_autosaved_values(self) -> None: """Force autosaved values to be saved without waiting for an event in Event Manager.""" if self.id in self._autosaved_values: cache = self._autosaved_values[self.id] for value in cache.values(): await value.persist_state() async def _clear_cache(self) -> None: """Clear cache with autosaved values.""" if self.id in self._autosaved_values: cache = self._autosaved_values[self.id] for value in cache.values(): await value.teardown() cache.clear() ================================================ FILE: src/crawlee/storages/_request_queue.py ================================================ from __future__ import annotations import asyncio from datetime import timedelta from logging import getLogger from typing import 
TYPE_CHECKING, TypeVar from typing_extensions import override from crawlee import Request, service_locator from crawlee._utils.docs import docs_group from crawlee._utils.wait import wait_for_all_tasks_for_finish from crawlee.request_loaders import RequestManager from ._base import Storage from ._utils import validate_storage_name if TYPE_CHECKING: from collections.abc import Sequence from crawlee import Request from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import ProcessedRequest, RequestQueueMetadata logger = getLogger(__name__) T = TypeVar('T') @docs_group('Storages') class RequestQueue(Storage, RequestManager): """Request queue is a storage for managing HTTP requests. The request queue class serves as a high-level interface for organizing and managing HTTP requests during web crawling. It provides methods for adding, retrieving, and manipulating requests throughout the crawling lifecycle, abstracting away the underlying storage implementation details. Request queue maintains the state of each URL to be crawled, tracking whether it has been processed, is currently being handled, or is waiting in the queue. Each URL in the queue is uniquely identified by a `unique_key` property, which prevents duplicate processing unless explicitly configured otherwise. The class supports both breadth-first and depth-first crawling strategies through its `forefront` parameter when adding requests. It also provides mechanisms for error handling and request reclamation when processing fails. You can open a request queue using the `open` class method, specifying either a name or ID to identify the queue. The underlying storage implementation is determined by the configured storage client. 
### Usage ```python from crawlee.storages import RequestQueue # Open a request queue rq = await RequestQueue.open(name='my-queue') # Add a request await rq.add_request('https://example.com') # Process requests request = await rq.fetch_next_request() if request: try: # Process the request # ... await rq.mark_request_as_handled(request) except Exception: await rq.reclaim_request(request) ``` """ def __init__(self, client: RequestQueueClient, id: str, name: str | None) -> None: """Initialize a new instance. Preferably use the `RequestQueue.open` constructor to create a new instance. Args: client: An instance of a storage client. id: The unique identifier of the storage. name: The name of the storage, if available. """ validate_storage_name(name) self._client = client self._id = id self._name = name self._add_requests_tasks = list[asyncio.Task]() """A list of tasks for adding requests to the queue.""" @property @override def id(self) -> str: return self._id @property @override def name(self) -> str | None: return self._name @override async def get_metadata(self) -> RequestQueueMetadata: return await self._client.get_metadata() @override async def get_handled_count(self) -> int: metadata = await self._client.get_metadata() return metadata.handled_request_count @override async def get_total_count(self) -> int: metadata = await self._client.get_metadata() return metadata.total_request_count @override @classmethod async def open( cls, *, id: str | None = None, name: str | None = None, alias: str | None = None, configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> RequestQueue: configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client client_opener_coro = storage_client.create_rq_client(id=id, name=name, alias=alias, configuration=configuration) additional_cache_key = 
storage_client.get_storage_client_cache_key(configuration=configuration) return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, alias=alias, client_opener_coro=client_opener_coro, storage_client_cache_key=additional_cache_key, ) @override async def drop(self) -> None: # Remove from cache before dropping storage_instance_manager = service_locator.storage_instance_manager storage_instance_manager.remove_from_cache(self) await self._client.drop() @override async def purge(self) -> None: await self._client.purge() @override async def add_request( self, request: str | Request, *, forefront: bool = False, ) -> ProcessedRequest | None: request = self._transform_request(request) response = await self._client.add_batch_of_requests([request], forefront=forefront) if response.processed_requests: return response.processed_requests[0] if response.unprocessed_requests: logger.warning( f'Request {request.url} was not processed by storage client "{self._client.__class__.__name__}".' ) else: logger.warning( f'Request {request.url} was not processed by storage client "{self._client.__class__.__name__}" ' 'received empty response.' 
@override
async def add_requests(
    self,
    requests: Sequence[str | Request],
    *,
    forefront: bool = False,
    batch_size: int = 1000,
    wait_time_between_batches: timedelta = timedelta(seconds=1),
    wait_for_all_requests_to_be_added: bool = False,
    wait_for_all_requests_to_be_added_timeout: timedelta | None = None,
) -> None:
    """Add requests to the queue in batches.

    The first batch (at most `batch_size` requests) is added and awaited directly. All remaining batches are
    added by a background task, which is registered in `self._add_requests_tasks` while it is running and
    removed from that list by a done-callback once it finishes.

    Args:
        requests: Requests to add. They are first passed through `self._transform_requests`, which is defined
            outside this block.
        forefront: Forwarded to `_process_batch` for every batch.
        batch_size: Number of requests taken from the input for each batch.
        wait_time_between_batches: Pause between two consecutive background batches. It is also passed to
            `_process_batch` as `base_retry_wait`.
        wait_for_all_requests_to_be_added: If True, wait for the background task before returning.
        wait_for_all_requests_to_be_added_timeout: Passed as `timeout` to `wait_for_all_tasks_for_finish`.
    """
    transformed_requests = self._transform_requests(requests)
    wait_time_secs = wait_time_between_batches.total_seconds()

    # Wait for the first batch to be added
    first_batch = transformed_requests[:batch_size]
    if first_batch:
        await self._process_batch(
            first_batch,
            base_retry_wait=wait_time_between_batches,
            forefront=forefront,
        )

    async def _process_remaining_batches() -> None:
        # Starts at `batch_size` because the first batch has already been handled above.
        for i in range(batch_size, len(transformed_requests), batch_size):
            batch = transformed_requests[i : i + batch_size]
            await self._process_batch(
                batch,
                base_retry_wait=wait_time_between_batches,
                forefront=forefront,
            )
            # Sleep only when another batch follows, there is no trailing pause after the last one.
            if i + batch_size < len(transformed_requests):
                await asyncio.sleep(wait_time_secs)

    # Create and start the task to process remaining batches in the background
    remaining_batches_task = asyncio.create_task(
        _process_remaining_batches(),
        name='request_queue_process_remaining_batches_task',
    )

    # Track the task so that `is_finished` can see that requests are still being added.
    self._add_requests_tasks.append(remaining_batches_task)
    remaining_batches_task.add_done_callback(lambda _: self._add_requests_tasks.remove(remaining_batches_task))

    # Wait for all tasks to finish if requested
    if wait_for_all_requests_to_be_added:
        await wait_for_all_tasks_for_finish(
            (remaining_batches_task,),
            logger=logger,
            timeout=wait_for_all_requests_to_be_added_timeout,
        )
async def get_request(self, unique_key: str) -> Request | None:
    """Retrieve a specific request from the queue by its unique key.

    The lookup is delegated to the underlying storage client.

    Args:
        unique_key: Unique key of the request to retrieve.

    Returns:
        The request with the specified unique key, or `None` if no such request exists.
    """
    return await self._client.get_request(unique_key)
""" return await self._client.reclaim_request(request, forefront=forefront) async def is_empty(self) -> bool: """Check if the request queue is empty. An empty queue means that there are no requests currently in the queue, either pending or being processed. However, this does not necessarily mean that the crawling operation is finished, as there still might be tasks that could add additional requests to the queue. Returns: True if the request queue is empty, False otherwise. """ return await self._client.is_empty() async def is_finished(self) -> bool: """Check if the request queue is finished. A finished queue means that all requests in the queue have been processed (the queue is empty) and there are no more tasks that could add additional requests to the queue. This is the definitive way to check if a crawling operation is complete. Returns: True if the request queue is finished (empty and no pending add operations), False otherwise. """ if self._add_requests_tasks: logger.debug('Background add requests tasks are still in progress.') return False if await self.is_empty(): logger.debug('The request queue is empty.') return True return False async def _process_batch( self, batch: Sequence[Request], *, base_retry_wait: timedelta, attempt: int = 1, forefront: bool = False, ) -> None: """Process a batch of requests with automatic retry mechanism.""" max_attempts = 5 response = await self._client.add_batch_of_requests(batch, forefront=forefront) if response.unprocessed_requests: logger.debug(f'Following requests were not processed: {response.unprocessed_requests}.') if attempt > max_attempts: logger.warning( f'Following requests were not processed even after {max_attempts} attempts:\n' f'{response.unprocessed_requests}' ) else: logger.debug('Retry to add requests.') unprocessed_requests_unique_keys = {request.unique_key for request in response.unprocessed_requests} retry_batch = [request for request in batch if request.unique_key in unprocessed_requests_unique_keys] 
await asyncio.sleep((base_retry_wait * attempt).total_seconds()) await self._process_batch(retry_batch, base_retry_wait=base_retry_wait, attempt=attempt + 1) request_count = len(batch) - len(response.unprocessed_requests) if request_count: logger.debug( f'Added {request_count} requests to the queue. Processed requests: {response.processed_requests}' ) ================================================ FILE: src/crawlee/storages/_storage_instance_manager.py ================================================ from __future__ import annotations from asyncio import Lock from collections import defaultdict from collections.abc import Coroutine, Hashable from dataclasses import dataclass, field from typing import TYPE_CHECKING, TypeVar from weakref import WeakValueDictionary from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient from ._utils import validate_storage_name if TYPE_CHECKING: from ._base import Storage T = TypeVar('T', bound='Storage') @dataclass class _StorageCache: """Cache for storage instances.""" by_id: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field( default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict)) ) """Cache for storage instances by ID. Example: by_id[Dataset]['some_id']['some_additional_cache_key'].""" by_name: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field( default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict)) ) """Cache for storage instances by name. Example: by_name[Dataset]['some_name']['some_additional_cache_key']""" by_alias: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field( default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict)) ) """Cache for storage instances by alias. 
def remove_from_cache(self, storage_instance: Storage) -> None:
    """Remove a storage instance from the cache.

    Only cache entries that hold exactly `storage_instance` are deleted. Entries registered under the same ID
    or name with a different additional cache key (e.g. by another storage client), as well as the entries of
    unrelated aliases, are left untouched.

    Args:
        storage_instance: The storage instance to remove.
    """
    storage_type = type(storage_instance)

    def _discard(entries: dict[Hashable, Storage] | None) -> None:
        """Delete every entry of `entries` whose value is `storage_instance`."""
        if not entries:
            return
        # Collect the keys first, a dict must not change its size while it is being iterated.
        stale_keys = [additional_key for additional_key, cached in entries.items() if cached is storage_instance]
        for additional_key in stale_keys:
            del entries[additional_key]

    # `.get()` is used on purpose: plain indexing of a `defaultdict` would insert empty entries as a side effect.
    # Remove from ID cache
    _discard(self.by_id.get(storage_type, {}).get(storage_instance.id))

    # Remove from name cache or alias cache. It can never be in both.
    if storage_instance.name is not None:
        _discard(self.by_name.get(storage_type, {}).get(storage_instance.name))
    else:
        # The alias is not stored on the instance, so every alias of this storage type has to be inspected.
        for entries in self.by_alias.get(storage_type, {}).values():
            _discard(entries)
Name can only contain letters "a" through "z", the digits "0" through "9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1"). alias: Storage alias (run scope, creates unnamed storage). client_opener_coro: Coroutine to open the storage client when storage instance not found in cache. storage_client_cache_key: Additional optional key from storage client to differentiate cache entries. Returns: The storage instance. Raises: ValueError: If multiple parameters out of `id`, `name`, and `alias` are specified. """ try: if name == self._DEFAULT_STORAGE_ALIAS: raise ValueError( f'Storage name cannot be "{self._DEFAULT_STORAGE_ALIAS}" as it is reserved for default alias.' ) # Validate input parameters. raise_if_too_many_kwargs(id=id, name=name, alias=alias) # Auto-set alias='__default__' when no parameters are specified. if not any([name, alias, id]): alias = self._DEFAULT_STORAGE_ALIAS # Check cache without lock first for performance. if cached_instance := self._get_from_cache( cls, id=id, name=name, alias=alias, storage_client_cache_key=storage_client_cache_key, ): return cached_instance # Validate storage name if name is not None: validate_storage_name(name) # Acquire lock for this opener opener_lock_key = (cls, str(id or name or alias), storage_client_cache_key) if not (lock := self._opener_locks.get(opener_lock_key)): lock = Lock() self._opener_locks[opener_lock_key] = lock async with lock: # Another task could have created the storage while we were waiting for the lock - check if that # happened if cached_instance := self._get_from_cache( cls, id=id, name=name, alias=alias, storage_client_cache_key=storage_client_cache_key, ): return cached_instance # Check for conflicts between named and alias storages self._check_name_alias_conflict( cls, name=name, alias=alias, storage_client_cache_key=storage_client_cache_key, ) # Create new instance client: KeyValueStoreClient | DatasetClient | RequestQueueClient client = await client_opener_coro 
metadata = await client.get_metadata() instance = cls(client, metadata.id, metadata.name) # type: ignore[call-arg] instance_name = getattr(instance, 'name', None) # Cache the instance. # Note: No awaits in this section. All cache entries must be written # atomically to ensure pre-checks outside the lock see consistent state. # Always cache by id. self._cache.by_id[cls][instance.id][storage_client_cache_key] = instance # Cache named storage. if instance_name is not None: self._cache.by_name[cls][instance_name][storage_client_cache_key] = instance # Cache unnamed storage. if alias is not None: self._cache.by_alias[cls][alias][storage_client_cache_key] = instance return instance finally: # Make sure the client opener is closed. # If it was awaited, then closing is no operation, if it was not awaited, this is the cleanup. client_opener_coro.close() def remove_from_cache(self, storage_instance: Storage) -> None: """Remove a storage instance from the cache. Args: storage_instance: The storage instance to remove. 
""" self._cache.remove_from_cache(storage_instance) def clear_cache(self) -> None: """Clear all cached storage instances.""" self._cache = _StorageCache() def _get_from_cache( self, cls: type[T], *, id: str | None = None, name: str | None = None, alias: str | None = None, storage_client_cache_key: Hashable = '', ) -> T | None: """Get a storage instance from the cache.""" if id is not None and (cached_instance := self._cache.by_id[cls][id].get(storage_client_cache_key)): if isinstance(cached_instance, cls): return cached_instance raise RuntimeError('Cached instance type mismatch.') if name is not None and (cached_instance := self._cache.by_name[cls][name].get(storage_client_cache_key)): if isinstance(cached_instance, cls): return cached_instance raise RuntimeError('Cached instance type mismatch.') if alias is not None and (cached_instance := self._cache.by_alias[cls][alias].get(storage_client_cache_key)): if isinstance(cached_instance, cls): return cached_instance raise RuntimeError('Cached instance type mismatch.') return None def _check_name_alias_conflict( self, cls: type[T], *, name: str | None = None, alias: str | None = None, storage_client_cache_key: Hashable = '', ) -> None: """Check for conflicts between named and alias storages.""" if alias and (self._cache.by_name[cls][alias].get(storage_client_cache_key)): raise ValueError( f'Cannot create alias storage "{alias}" because a named storage with the same name already exists. ' f'Use a different alias or drop the existing named storage first.' ) if name and (self._cache.by_alias[cls][name].get(storage_client_cache_key)): raise ValueError( f'Cannot create named storage "{name}" because an alias storage with the same name already exists. ' f'Use a different name or drop the existing alias storage first.' 
) ================================================ FILE: src/crawlee/storages/_utils.py ================================================ import re NAME_REGEX = re.compile(r'^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])$') def validate_storage_name(name: str | None) -> None: if name and not NAME_REGEX.match(name): raise ValueError( f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through' '"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")' ) ================================================ FILE: src/crawlee/storages/py.typed ================================================ ================================================ FILE: tests/__init__.py ================================================ ================================================ FILE: tests/e2e/__init__.py ================================================ ================================================ FILE: tests/e2e/conftest.py ================================================ import subprocess from pathlib import Path import pytest from _pytest.config import Config from filelock import FileLock _CRAWLEE_ROOT_PATH = Path(__file__).parent.parent.parent.resolve() def pytest_configure(config: Config) -> None: for marker in [ 'httpx', 'curl_impersonate', 'impit', 'playwright', 'playwright_camoufox', 'playwright_chrome', 'playwright_firefox', 'playwright_webkit', 'parsel', 'beautifulsoup', 'uv', 'poetry', 'pip', ]: config.addinivalue_line('markers', f'{marker}: Integration test parameter marker.') @pytest.fixture(scope='session') def crawlee_wheel_path(tmp_path_factory: pytest.TempPathFactory, testrun_uid: str) -> Path: """Build the package wheel if it hasn't been built yet, and return the path to the wheel.""" # Make sure the wheel is not being built concurrently across all the pytest-xdist runners, # through locking the building process with a temp file. 
with FileLock(tmp_path_factory.getbasetemp().parent / 'crawlee_wheel_build.lock'): # Make sure the wheel is built exactly once across all the pytest-xdist runners, # through an indicator file saying that the wheel was already built. was_wheel_built_this_test_run_file = tmp_path_factory.getbasetemp() / f'wheel_was_built_in_run_{testrun_uid}' if not was_wheel_built_this_test_run_file.exists(): subprocess.run( args='python -m build', cwd=_CRAWLEE_ROOT_PATH, shell=True, check=True, capture_output=True, ) was_wheel_built_this_test_run_file.touch() # Read the current package version, necessary for getting the right wheel filename. pyproject_toml_file = (_CRAWLEE_ROOT_PATH / 'pyproject.toml').read_text(encoding='utf-8') for line in pyproject_toml_file.splitlines(): if line.startswith('version = '): delim = '"' if '"' in line else "'" crawlee_version = line.split(delim)[1] break else: raise RuntimeError('Unable to find version string.') wheel_path = _CRAWLEE_ROOT_PATH / 'dist' / f'crawlee-{crawlee_version}-py3-none-any.whl' # Just to be sure. assert wheel_path.exists() return wheel_path ================================================ FILE: tests/e2e/project_template/test_static_crawlers_templates.py ================================================ import os import re import subprocess from pathlib import Path from typing import Literal import pytest from apify_client import ApifyClientAsync from cookiecutter.main import cookiecutter from crawlee._cli import default_start_url, template_directory from crawlee._utils.crypto import crypto_random_object_id from tests.e2e.project_template.utils import patch_crawlee_version_in_project # To run these tests locally, make sure you have apify-cli installed and available in the path. 
# https://docs.apify.com/cli/docs/installation @pytest.mark.parametrize( 'crawler_type', [ pytest.param('playwright-camoufox', marks=pytest.mark.playwright_camoufox), pytest.param('playwright-chrome', marks=pytest.mark.playwright_chrome), pytest.param('playwright-firefox', marks=pytest.mark.playwright_firefox), pytest.param('playwright-webkit', marks=pytest.mark.playwright_webkit), pytest.param('playwright', marks=pytest.mark.playwright), pytest.param('parsel', marks=pytest.mark.parsel), pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup), ], ) @pytest.mark.parametrize( 'http_client', [ pytest.param('httpx', marks=pytest.mark.httpx), pytest.param('curl-impersonate', marks=pytest.mark.curl_impersonate), pytest.param('impit', marks=pytest.mark.impit), ], ) @pytest.mark.parametrize( 'package_manager', [ pytest.param('pip', marks=pytest.mark.pip), pytest.param('uv', marks=pytest.mark.uv), pytest.param('poetry', marks=pytest.mark.poetry), ], ) async def test_static_crawler_actor_at_apify( tmp_path: Path, crawlee_wheel_path: Path, package_manager: Literal['pip', 'uv', 'poetry'], crawler_type: str, http_client: str, ) -> None: # Generate new actor name actor_name = f'crawlee-python-template-e2e-test-{crypto_random_object_id(8).lower()}' # Create project from template cookiecutter( template=str(template_directory), no_input=True, extra_context={ 'project_name': actor_name, 'package_manager': package_manager, 'crawler_type': crawler_type, 'http_client': http_client, 'enable_apify_integration': True, 'start_url': default_start_url, 'install_project': False, }, output_dir=str(tmp_path), ) patch_crawlee_version_in_project( project_path=tmp_path / actor_name, wheel_path=crawlee_wheel_path, package_manager=package_manager ) # Print apify version for debugging purposes in rare cases of CLI failures subprocess.run(['apify', '--version'], check=True) # noqa: ASYNC221, S607 # Build actor using sequence of cli commands as the user would subprocess.run( # noqa: ASYNC221, 
S603 ['apify', 'login', '-t', os.environ['APIFY_TEST_USER_API_TOKEN']], # noqa: S607 capture_output=True, check=True, cwd=tmp_path / actor_name, ) subprocess.run(['apify', 'init', '-y', actor_name], capture_output=True, check=True, cwd=tmp_path / actor_name) # noqa: ASYNC221, S603, S607 build_process = subprocess.run(['apify', 'push'], capture_output=True, check=False, cwd=tmp_path / actor_name) # noqa: ASYNC221, S607 # Get actor ID from build log actor_id_regexp = re.compile(r'https:\/\/console\.apify\.com\/actors\/(.*)#\/builds\/\d*\.\d*\.\d*') if match := re.findall(actor_id_regexp, build_process.stderr.decode()): actor_id = match[0] else: raise AssertionError(f'Failed to find actor id in build log: {build_process.stderr.decode()}') client = ApifyClientAsync(token=os.getenv('APIFY_TEST_USER_API_TOKEN')) actor = client.actor(actor_id) # Run actor try: assert build_process.returncode == 0 started_run_data = await actor.start(memory_mbytes=8192) actor_run = client.run(started_run_data['id']) finished_run_data = await actor_run.wait_for_finish() actor_run_log = await actor_run.log().get() finally: # Delete the actor once it is no longer needed. await actor.delete() # Asserts additional_run_info = f'Full actor run log: {actor_run_log}' assert actor_run_log assert finished_run_data assert finished_run_data['status'] == 'SUCCEEDED', additional_run_info assert ( 'Crawler.stop() was called with following reason: The crawler has reached its limit of 10 requests per crawl.' 
def patch_crawlee_version_in_project(
    project_path: Path, wheel_path: Path, package_manager: Literal['pip', 'uv', 'poetry']
) -> None:
    """Ensure that the test is using current version of the crawlee from the source and not from Pypi.

    Args:
        project_path: Root directory of the generated project.
        wheel_path: Path to the locally built crawlee wheel.
        package_manager: Package manager used by the generated project. `poetry` and `uv` projects are patched
            through their Dockerfile, anything else is treated as a `requirements.txt` based project.
    """
    # Copy prepared .whl file
    shutil.copy(wheel_path, project_path)

    if package_manager in {'poetry', 'uv'}:
        _patch_crawlee_version_in_pyproject_toml_based_project(project_path, wheel_path)
    else:
        _patch_crawlee_version_in_requirements_txt_based_project(project_path, wheel_path)


def _find_crawlee_extras(dependencies: str) -> str:
    """Return the extras of the first crawlee dependency (e.g. `[parsel]`), or an empty string if it has none."""
    found = re.search(r'crawlee(\[.*\])', dependencies)
    return found.group(1) if found else ''


def _patch_crawlee_version_in_requirements_txt_based_project(project_path: Path, wheel_path: Path) -> None:
    """Point `requirements.txt` and the Dockerfile of the project at the local crawlee wheel."""
    # Get any extras
    requirements_path = project_path / 'requirements.txt'
    with requirements_path.open() as f:
        crawlee_extras = _find_crawlee_extras(f.read())

    # Modify requirements.txt to use crawlee from wheel file instead of from Pypi
    with requirements_path.open() as f:
        modified_lines = []
        for line in f:
            if 'crawlee' in line:
                modified_lines.append(f'./{wheel_path.name}{crawlee_extras}\n')
            else:
                modified_lines.append(line)
    with requirements_path.open('w') as f:
        f.write(''.join(modified_lines))

    # Patch the dockerfile to have wheel file available
    dockerfile_path = project_path / 'Dockerfile'
    with dockerfile_path.open() as f:
        modified_lines = []
        for line in f:
            modified_lines.append(line)
            if line.startswith('COPY requirements.txt ./'):
                modified_lines.extend(
                    [
                        f'COPY {wheel_path.name} ./\n',
                        # If no crawlee version bump, pip might be lazy and take existing pre-installed crawlee
                        # version, make sure that one is patched as well.
                        f'RUN pip install ./{wheel_path.name}{crawlee_extras} --force-reinstall\n',
                    ]
                )
    with dockerfile_path.open('w') as f:
        f.write(''.join(modified_lines))


def _patch_crawlee_version_in_pyproject_toml_based_project(project_path: Path, wheel_path: Path) -> None:
    """Ensure that the test is using current version of the crawlee from the source and not from Pypi.

    Raises:
        RuntimeError: If the Dockerfile copies `pyproject.toml` without a `uv.lock` or `poetry.lock` file.
    """
    # Get any extras
    pyproject_path = project_path / 'pyproject.toml'
    with pyproject_path.open() as f:
        crawlee_extras = _find_crawlee_extras(f.read())

    # Inject crawlee wheel file to the docker image and update project to depend on it.
    dockerfile_path = project_path / 'Dockerfile'
    with dockerfile_path.open() as f:
        modified_lines = []
        for line in f:
            modified_lines.append(line)

            if line.startswith('COPY pyproject.toml'):
                # The lock file copied next to pyproject.toml tells which package manager the project uses.
                if 'uv.lock' in line:
                    package_manager = 'uv'
                elif 'poetry.lock' in line:
                    package_manager = 'poetry'
                else:
                    raise RuntimeError('This does not look like a uv or poetry based project.')

                # Create lock file that is expected by the docker to exist (even though it will be patched
                # in the docker).
                subprocess.run(
                    args=[package_manager, 'lock'],
                    cwd=str(project_path),
                    check=True,
                    capture_output=True,
                )

                # Add command to copy .whl to the docker image and update project with it.
                # Patching in docker file due to the poetry not properly supporting relative paths for wheel
                # packages and so the absolute path (in the container) is generated when running `add` command
                # in the container.
                modified_lines.extend(
                    [
                        f'COPY {wheel_path.name} ./\n',
                        # If no crawlee version bump, poetry might be lazy and take existing pre-installed crawlee
                        # version, make sure that one is patched as well.
                        f'RUN pip install ./{wheel_path.name}{crawlee_extras} --force-reinstall\n',
                        f'RUN {package_manager} add ./{wheel_path.name}{crawlee_extras}\n',
                        f'RUN {package_manager} lock\n',
                    ]
                )
    with dockerfile_path.open('w') as f:
        f.write(''.join(modified_lines))
T = TypeVar('T')


def future(value: T, /) -> Awaitable[T]:
    """Return a future that is already resolved with `value`."""
    resolved: asyncio.Future[T] = asyncio.Future()
    resolved.set_result(value)
    return resolved
# NOTE(review): assumes the `flaky` marker comes from pytest-rerunfailures, which reads the retry count from the
# `reruns` keyword (an unknown keyword such as `rerun` falls back to the plugin default) - confirm.
@pytest.mark.flaky(
    reruns=3,
    reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1655.',
)
async def test_autoscales(
    monkeypatch: pytest.MonkeyPatch,
    system_status: SystemStatus | Mock,
) -> None:
    """Test that the pool scales up while the reported load is low and scales back down once it is overloaded.

    The mocked system status reports a low load for the first 0.5 seconds of the test and an overloaded CPU
    afterwards. The desired concurrency is checked at several points of that timeline.
    """
    done_count = 0

    async def run() -> None:
        await asyncio.sleep(0.1)
        nonlocal done_count
        done_count += 1

    start = datetime.now(timezone.utc)

    def get_historical_system_info() -> SystemInfo:
        result = SystemInfo(
            cpu_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
            memory_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
            event_loop_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
            client_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),
        )

        # 0.5 seconds after the start of the test, pretend the CPU became overloaded
        if result.created_at - start >= timedelta(seconds=0.5):
            result.cpu_info = LoadRatioInfo(limit_ratio=0.9, actual_ratio=1.0)

        return result

    cast('Mock', system_status.get_historical_system_info).side_effect = get_historical_system_info

    # Override AP class attributes using monkeypatch.
    monkeypatch.setattr(AutoscaledPool, '_AUTOSCALE_INTERVAL', timedelta(seconds=0.1))

    pool = AutoscaledPool(
        system_status=system_status,
        run_task_function=run,
        is_task_ready_function=lambda: future(True),
        is_finished_function=lambda: future(False),
        concurrency_settings=ConcurrencySettings(
            min_concurrency=1,
            desired_concurrency=1,
            max_concurrency=4,
        ),
    )

    pool_run_task = asyncio.create_task(pool.run(), name='pool run task')

    try:
        # After 0.2s, there should be an increase in concurrency
        await asyncio.sleep(0.2)
        assert pool.desired_concurrency > 1

        # After 0.5s, the concurrency should reach max concurrency
        await asyncio.sleep(0.3)
        assert pool.desired_concurrency == 4

        # The concurrency should guarantee completion of more than 10 tasks (a single worker would complete ~5)
        assert done_count > 10

        # After 0.7s, the pretend overload should have kicked in and there should be a drop in desired concurrency
        await asyncio.sleep(0.2)
        assert pool.desired_concurrency < 4

        # After a full second, the pool should scale down all the way to 1
        await asyncio.sleep(0.3)
        assert pool.desired_concurrency == 1
    finally:
        # The pool never finishes on its own (`is_finished_function` always resolves to False), so cancel it.
        pool_run_task.cancel()
        with suppress(asyncio.CancelledError):
            await pool_run_task
ratio can limit desired concurrency. This test creates situation where only one task is ready and then no other task is ever ready. This creates situation where the system could scale up desired concurrency, but it will not do so because desired_concurrency_ratio=1 means that first the system would have to increase current concurrency to same number as desired concurrency and due to no other task ever being ready, it will never happen. Thus desired concurrency will stay 2 as was the initial setup, even though other conditions would allow the increase. (max_concurrency=4, system being idle). """ async def run() -> None: await asyncio.sleep(0.1) is_task_ready_iterator = chain([future(True)], repeat(future(False))) def is_task_ready_function() -> Awaitable[bool]: return next(is_task_ready_iterator) def get_historical_system_info() -> SystemInfo: return SystemInfo( cpu_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3), memory_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3), event_loop_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3), client_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3), ) cast('Mock', system_status.get_historical_system_info).side_effect = get_historical_system_info # Override AP class attributes using monkeypatch. 
async def test_allows_multiple_run_calls(system_status: SystemStatus | Mock) -> None:
    """Test that the same pool instance can be run again after a previous run has finished."""
    done_count = 0

    async def run() -> None:
        nonlocal done_count
        done_count += 1
        await asyncio.sleep(0.1)

    concurrency = ConcurrencySettings(
        min_concurrency=4,
        desired_concurrency=4,
        max_concurrency=4,
    )
    pool = AutoscaledPool(
        system_status=system_status,
        run_task_function=run,
        is_task_ready_function=lambda: future(done_count < 4),
        is_finished_function=lambda: future(done_count >= 4),
        concurrency_settings=concurrency,
    )

    # Both runs start from a zeroed counter and are expected to execute exactly four tasks.
    for _ in range(2):
        done_count = 0
        await pool.run()
        assert done_count == 4
@pytest.fixture
async def event_manager() -> AsyncGenerator[LocalEventManager, None]:
    """Yield an active `LocalEventManager` after its first system info event has been observed.

    A very long `system_info_interval` is used to avoid interference from periodic system info events during
    tests.
    """
    first_event_seen = asyncio.Event()
    manager = LocalEventManager(system_info_interval=timedelta(hours=9999))

    async def mark_first_event(_: EventSystemInfoData) -> None:
        first_event_seen.set()

    manager.on(event=Event.SYSTEM_INFO, listener=mark_first_event)

    async with manager:
        # Wait for the first automatic event, then detach the temporary listener before handing the manager over.
        await first_event_seen.wait()
        manager.off(event=Event.SYSTEM_INFO, listener=mark_first_event)
        yield manager
def test_snapshot_event_loop(snapshotter: Snapshotter) -> None:
    """Test that an active snapshotter exposes exactly one event loop snapshot right away."""
    # A first event loop snapshot is created when an instance is created.
    assert len(snapshotter.get_event_loop_sample()) == 1
def test_snapshot_client_overloaded() -> None:
    """Test `ClientSnapshot.is_overloaded` for several error counts with `max_error_count=2`."""
    # (error_count, new_error_count, expected is_overloaded)
    cases = [
        (1, 1, False),
        (2, 1, False),
        (4, 2, False),
        (7, 3, True),
    ]

    for error_count, new_error_count, expected in cases:
        snapshot = ClientSnapshot(
            error_count=error_count,
            new_error_count=new_error_count,
            max_error_count=2,
        )
        assert bool(snapshot.is_overloaded) is expected
match=r'Snapshotter is not active.'): snapshotter.get_event_loop_sample() with pytest.raises(RuntimeError, match=r'Snapshotter is not active.'): snapshotter.get_client_sample() with pytest.raises(RuntimeError, match=r'Snapshotter is already active.'): async with snapshotter, snapshotter: pass async with snapshotter: snapshotter.get_cpu_sample() snapshotter.get_memory_sample() snapshotter.get_event_loop_sample() snapshotter.get_client_sample() assert snapshotter.active is True async def test_snapshot_pruning_removes_outdated_records( snapshotter: Snapshotter, event_manager: LocalEventManager, default_memory_info: MemoryInfo ) -> None: # Set the snapshot history to 2 hours snapshotter._SNAPSHOT_HISTORY = timedelta(hours=2) # Create timestamps for testing now = datetime.now(timezone.utc) def randomly_delayed_insort(*args: Any, **kwargs: Any) -> None: """Sort with injected delay to provoke otherwise hard to reproduce race condition.""" time.sleep(0.05) return insort(*args, **kwargs) with mock.patch('crawlee._autoscaling.snapshotter.insort', side_effect=randomly_delayed_insort): events_data = [ EventSystemInfoData( cpu_info=CpuInfo(used_ratio=0.5, created_at=now - timedelta(hours=delta)), memory_info=default_memory_info, ) for delta in [0, 3, 2, 5] # Out of order timestamps. Snapshotter can not rely on natural ordering. 
] for event_data in events_data: event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data) await event_manager.wait_for_all_listeners_to_complete() cpu_snapshots = cast('list[CpuSnapshot]', snapshotter.get_cpu_sample()) # Check that only the last two snapshots remain assert len(cpu_snapshots) == 2 assert cpu_snapshots[0].created_at == now - timedelta(hours=2) assert cpu_snapshots[1].created_at == now async def test_memory_load_evaluation_logs_warning_on_high_usage( caplog: pytest.LogCaptureFixture, event_manager: LocalEventManager, default_cpu_info: CpuInfo, ) -> None: config = Configuration(memory_mbytes=8192) service_locator.set_event_manager(event_manager) snapshotter = Snapshotter.from_config(config) high_memory_usage = ByteSize.from_gb(8) * 0.95 # 95% of 8 GB event_data = EventSystemInfoData( cpu_info=default_cpu_info, memory_info=MemoryInfo( total_size=ByteSize.from_gb(8), current_size=high_memory_usage, system_wide_used_size=ByteSize.from_gb(7), ), ) async with snapshotter: event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data) await event_manager.wait_for_all_listeners_to_complete() # Filter log records to only include those from snapshotter log_records = [record for record in caplog.records if 'snapshotter' in record.pathname.lower()] assert len(log_records) == 1 assert log_records[0].levelname.lower() == 'warning' assert 'Memory is critically overloaded' in log_records[0].msg event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data) await event_manager.wait_for_all_listeners_to_complete() log_records = [record for record in caplog.records if 'snapshotter' in record.pathname.lower()] assert len(log_records) == 1 async def test_memory_load_evaluation_silent_on_acceptable_usage( monkeypatch: pytest.MonkeyPatch, event_manager: LocalEventManager, default_cpu_info: CpuInfo, ) -> None: mock_logger_warn = MagicMock() monkeypatch.setattr(getLogger('crawlee.autoscaling.snapshotter'), 'warning', mock_logger_warn) 
service_locator.set_event_manager(event_manager) snapshotter = Snapshotter.from_config(Configuration(memory_mbytes=8192)) low_memory_usage = ByteSize.from_gb(8) * 0.8 # 80% of 8 GB event_data = EventSystemInfoData( cpu_info=default_cpu_info, memory_info=MemoryInfo( total_size=ByteSize.from_gb(8), current_size=low_memory_usage, system_wide_used_size=ByteSize.from_gb(7), ), ) async with snapshotter: event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data) await event_manager.wait_for_all_listeners_to_complete() assert mock_logger_warn.call_count == 0 async def test_snapshots_time_ordered(snapshotter: Snapshotter, event_manager: LocalEventManager) -> None: # All internal snapshot list should be ordered by creation time in ascending order. # Scenario where older emitted event arrives after newer event. # Snapshotter should not trust the event order and check events' times. time_new = datetime.now(tz=timezone.utc) time_old = datetime.now(tz=timezone.utc) - timedelta(milliseconds=50) def create_event_data(creation_time: datetime) -> EventSystemInfoData: return EventSystemInfoData( cpu_info=CpuInfo(used_ratio=0.5, created_at=creation_time), memory_info=MemoryInfo( current_size=ByteSize(bytes=1), created_at=creation_time, total_size=ByteSize(bytes=2), system_wide_used_size=ByteSize.from_gb(5), ), ) event_manager.emit(event=Event.SYSTEM_INFO, event_data=create_event_data(time_new)) event_manager.emit(event=Event.SYSTEM_INFO, event_data=create_event_data(time_old)) await event_manager.wait_for_all_listeners_to_complete() memory_samples = snapshotter.get_memory_sample() cpu_samples = snapshotter.get_cpu_sample() assert memory_samples[0].created_at == time_old assert cpu_samples[0].created_at == time_old assert memory_samples[1].created_at == time_new assert cpu_samples[1].created_at == time_new def test_sorted_snapshot_list_add_maintains_order() -> None: """Test that SortedSnapshotList.add method maintains sorted order by created_at with multiple items.""" 
sorted_list = SortedSnapshotList[CpuSnapshot]() # Create snapshots with different timestamps (more items to test binary search better) now = datetime.now(timezone.utc) snapshots = [ CpuSnapshot(used_ratio=0.1, max_used_ratio=0.95, created_at=now - timedelta(seconds=50)), # oldest CpuSnapshot(used_ratio=0.2, max_used_ratio=0.95, created_at=now - timedelta(seconds=40)), CpuSnapshot(used_ratio=0.3, max_used_ratio=0.95, created_at=now - timedelta(seconds=30)), CpuSnapshot(used_ratio=0.4, max_used_ratio=0.95, created_at=now - timedelta(seconds=20)), CpuSnapshot(used_ratio=0.5, max_used_ratio=0.95, created_at=now - timedelta(seconds=10)), CpuSnapshot(used_ratio=0.6, max_used_ratio=0.95, created_at=now - timedelta(seconds=5)), CpuSnapshot(used_ratio=0.7, max_used_ratio=0.95, created_at=now), # newest ] # Add snapshots in random order to test binary search insertion add_order = [3, 0, 5, 1, 6, 2, 4] # indices in random order for i in add_order: sorted_list.add(snapshots[i]) # Verify the list is sorted by created_at (should be in original order) assert len(sorted_list) == 7 for i, snapshot in enumerate(sorted_list): assert snapshot == snapshots[i], f'Item at index {i} is not correctly sorted' if i > 0: prev_time = sorted_list[i - 1].created_at curr_time = snapshot.created_at assert prev_time <= curr_time, f'Items at indices {i - 1} and {i} are not in chronological order' @pytest.mark.parametrize('dynamic_memory', [True, False]) async def test_dynamic_memory( *, default_cpu_info: CpuInfo, event_manager: LocalEventManager, dynamic_memory: bool, ) -> None: """Test dynamic memory scaling scenario where the system-wide memory can change. Create two memory snapshots. They have same memory usage, but different available memory. First snapshot is created with insufficient memory, so it is overloaded. Second snapshot is created with sufficient memory. Based on the Snapshotter configuration, it will either take into account the increased available memory or not. 
""" _initial_memory_info = get_memory_info() ratio_just_below_system_wide_overload = 0.99 * SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD memory_mbytes = 0 if dynamic_memory else floor(_initial_memory_info.total_size.to_mb()) service_locator.set_event_manager(event_manager) async with Snapshotter.from_config( Configuration(memory_mbytes=memory_mbytes, available_memory_ratio=ratio_just_below_system_wide_overload) ) as snapshotter: # Default state, memory usage exactly at the overload threshold -> overloaded, but not system-wide overloaded memory_infos = [ # Overloaded sample MemoryInfo( total_size=_initial_memory_info.total_size, current_size=_initial_memory_info.total_size * ratio_just_below_system_wide_overload, system_wide_used_size=_initial_memory_info.total_size * ratio_just_below_system_wide_overload, ), # Same as first sample, with twice as memory available in the system MemoryInfo( total_size=_initial_memory_info.total_size * 2, # Simulate increased total memory current_size=_initial_memory_info.total_size * ratio_just_below_system_wide_overload, system_wide_used_size=_initial_memory_info.total_size * ratio_just_below_system_wide_overload, ), ] for memory_info in memory_infos: event_manager.emit( event=Event.SYSTEM_INFO, event_data=EventSystemInfoData( cpu_info=default_cpu_info, memory_info=memory_info, ), ) await event_manager.wait_for_all_listeners_to_complete() memory_samples = snapshotter.get_memory_sample() assert len(memory_samples) == 2 # First sample will be overloaded. 
assert memory_samples[0].is_overloaded # Second sample can reflect the increased available memory based on the configuration used to create Snapshotter assert memory_samples[1].is_overloaded == (not dynamic_memory) ================================================ FILE: tests/unit/_autoscaling/test_system_status.py ================================================ from __future__ import annotations from datetime import datetime, timedelta, timezone from typing import TYPE_CHECKING import pytest from crawlee._autoscaling import Snapshotter, SystemStatus from crawlee._autoscaling._types import ( ClientSnapshot, CpuSnapshot, EventLoopSnapshot, LoadRatioInfo, MemorySnapshot, SystemInfo, ) from crawlee._utils.byte_size import ByteSize from crawlee.configuration import Configuration if TYPE_CHECKING: from collections.abc import AsyncGenerator @pytest.fixture async def snapshotter() -> AsyncGenerator[Snapshotter, None]: config = Configuration(available_memory_ratio=0.25) async with Snapshotter.from_config(config) as snapshotter: yield snapshotter @pytest.fixture def now() -> datetime: return datetime.now(timezone.utc) async def test_start_stop_lifecycle() -> None: config = Configuration(available_memory_ratio=0.25) async with Snapshotter.from_config(config) as snapshotter: system_status = SystemStatus(snapshotter) system_status.get_current_system_info() system_status.get_historical_system_info() def test_cpu_is_overloaded(snapshotter: Snapshotter, now: datetime) -> None: system_status = SystemStatus(snapshotter, cpu_overload_threshold=0.5) system_status._snapshotter._cpu_snapshots = Snapshotter._get_sorted_list_by_created_at( [ CpuSnapshot(used_ratio=0.6, max_used_ratio=0.75, created_at=now - timedelta(minutes=3)), CpuSnapshot(used_ratio=0.7, max_used_ratio=0.75, created_at=now - timedelta(minutes=2)), CpuSnapshot(used_ratio=0.8, max_used_ratio=0.75, created_at=now - timedelta(minutes=1)), CpuSnapshot(used_ratio=0.9, max_used_ratio=0.75, created_at=now), ] ) cpu_info = 
system_status._is_cpu_overloaded() assert cpu_info == LoadRatioInfo(limit_ratio=0.5, actual_ratio=0.667) assert cpu_info.is_overloaded is True def test_cpu_is_not_overloaded(snapshotter: Snapshotter, now: datetime) -> None: system_status = SystemStatus(snapshotter, cpu_overload_threshold=0.5) system_status._snapshotter._cpu_snapshots = Snapshotter._get_sorted_list_by_created_at( [ CpuSnapshot(used_ratio=0.7, max_used_ratio=0.75, created_at=now - timedelta(minutes=3)), CpuSnapshot(used_ratio=0.8, max_used_ratio=0.75, created_at=now - timedelta(minutes=2)), CpuSnapshot(used_ratio=0.6, max_used_ratio=0.75, created_at=now - timedelta(minutes=1)), CpuSnapshot(used_ratio=0.5, max_used_ratio=0.75, created_at=now), ] ) cpu_info = system_status._is_cpu_overloaded() assert cpu_info == LoadRatioInfo(limit_ratio=0.5, actual_ratio=0.333) assert cpu_info.is_overloaded is False def test_get_system_info(snapshotter: Snapshotter, now: datetime) -> None: system_status = SystemStatus( snapshotter, max_snapshot_age=timedelta(minutes=1), cpu_overload_threshold=0.5, memory_overload_threshold=0.5, event_loop_overload_threshold=0.5, client_overload_threshold=0.5, ) # Add CPU snapshots system_status._snapshotter._cpu_snapshots = Snapshotter._get_sorted_list_by_created_at( [ CpuSnapshot(used_ratio=0.6, max_used_ratio=0.75, created_at=now - timedelta(minutes=3)), CpuSnapshot(used_ratio=0.7, max_used_ratio=0.75, created_at=now - timedelta(minutes=2)), CpuSnapshot(used_ratio=0.8, max_used_ratio=0.75, created_at=now - timedelta(minutes=1)), CpuSnapshot(used_ratio=0.9, max_used_ratio=0.75, created_at=now), ] ) # Add memory snapshots system_status._snapshotter._memory_snapshots = Snapshotter._get_sorted_list_by_created_at( [ MemorySnapshot( current_size=ByteSize.from_gb(4), max_memory_size=ByteSize.from_gb(12), max_used_memory_ratio=0.8, created_at=now - timedelta(seconds=90), system_wide_used_size=None, system_wide_memory_size=None, ), MemorySnapshot( current_size=ByteSize.from_gb(7), 
max_memory_size=ByteSize.from_gb(8), max_used_memory_ratio=0.8, created_at=now - timedelta(seconds=60), system_wide_used_size=None, system_wide_memory_size=None, ), MemorySnapshot( current_size=ByteSize.from_gb(28), max_memory_size=ByteSize.from_gb(30), max_used_memory_ratio=0.8, created_at=now - timedelta(seconds=30), system_wide_used_size=None, system_wide_memory_size=None, ), MemorySnapshot( current_size=ByteSize.from_gb(48), max_memory_size=ByteSize.from_gb(60), max_used_memory_ratio=0.8, created_at=now, system_wide_used_size=None, system_wide_memory_size=None, ), ] ) # Add event loop snapshots system_status._snapshotter._event_loop_snapshots = Snapshotter._get_sorted_list_by_created_at( [ EventLoopSnapshot( delay=timedelta(milliseconds=700), max_delay=timedelta(milliseconds=500), created_at=now - timedelta(minutes=3), ), EventLoopSnapshot( delay=timedelta(milliseconds=600), max_delay=timedelta(milliseconds=500), created_at=now - timedelta(minutes=2), ), EventLoopSnapshot( delay=timedelta(milliseconds=200), max_delay=timedelta(milliseconds=500), created_at=now - timedelta(minutes=1), ), EventLoopSnapshot( delay=timedelta(milliseconds=100), max_delay=timedelta(milliseconds=500), created_at=now, ), ] ) # Add client snapshots system_status._snapshotter._client_snapshots = Snapshotter._get_sorted_list_by_created_at( [ ClientSnapshot(error_count=1, new_error_count=1, max_error_count=2, created_at=now - timedelta(minutes=3)), ClientSnapshot(error_count=2, new_error_count=1, max_error_count=2, created_at=now - timedelta(minutes=2)), ClientSnapshot(error_count=4, new_error_count=2, max_error_count=2, created_at=now - timedelta(minutes=1)), ClientSnapshot(error_count=4, new_error_count=0, max_error_count=2, created_at=now), ] ) # Test current system info current_system_info = system_status.get_current_system_info() assert current_system_info == SystemInfo( cpu_info=LoadRatioInfo(limit_ratio=system_status._cpu_overload_threshold, actual_ratio=1.0), 
memory_info=LoadRatioInfo(limit_ratio=system_status._memory_overload_threshold, actual_ratio=0.5), event_loop_info=LoadRatioInfo(limit_ratio=system_status._event_loop_overload_threshold, actual_ratio=0), client_info=LoadRatioInfo(limit_ratio=system_status._client_overload_threshold, actual_ratio=0), created_at=current_system_info.created_at, ) assert current_system_info.is_system_idle is False # Test historical system info historical_system_info = system_status.get_historical_system_info() assert historical_system_info == SystemInfo( cpu_info=LoadRatioInfo(limit_ratio=system_status._cpu_overload_threshold, actual_ratio=0.667), memory_info=LoadRatioInfo(limit_ratio=system_status._memory_overload_threshold, actual_ratio=0.667), event_loop_info=LoadRatioInfo(limit_ratio=system_status._event_loop_overload_threshold, actual_ratio=0.333), client_info=LoadRatioInfo(limit_ratio=system_status._client_overload_threshold, actual_ratio=0), created_at=historical_system_info.created_at, ) assert historical_system_info.is_system_idle is False @pytest.mark.parametrize(('client_overload_threshold', 'is_overloaded'), [(0.66, True), (0.67, False)]) def test_client_overloaded( *, snapshotter: Snapshotter, now: datetime, client_overload_threshold: float, is_overloaded: bool ) -> None: system_status = SystemStatus( snapshotter, max_snapshot_age=timedelta(minutes=1), client_overload_threshold=client_overload_threshold, ) system_status._snapshotter._client_snapshots = Snapshotter._get_sorted_list_by_created_at( [ ClientSnapshot(error_count=1, new_error_count=1, max_error_count=0, created_at=now - timedelta(minutes=3)), ClientSnapshot(error_count=2, new_error_count=1, max_error_count=0, created_at=now - timedelta(minutes=2)), ClientSnapshot(error_count=3, new_error_count=1, max_error_count=0, created_at=now - timedelta(minutes=1)), ClientSnapshot(error_count=3, new_error_count=0, max_error_count=0, created_at=now), ] ) # Ratio of overloaded snapshots is 2/3 (2 minutes out of 3) assert 
system_status._is_client_overloaded().is_overloaded == is_overloaded def test_memory_overloaded_system_wide(snapshotter: Snapshotter, now: datetime) -> None: """Test that system-wide memory overload is detected when system-wide memory utilization exceeds threshold.""" system_status = SystemStatus( snapshotter, max_snapshot_age=timedelta(minutes=1), memory_overload_threshold=0.5, # Set high threshold so process memory won't trigger overload ) # Add memory snapshots with system-wide memory usage above threshold (97%) system_status._snapshotter._memory_snapshots = Snapshotter._get_sorted_list_by_created_at( [ MemorySnapshot( current_size=ByteSize.from_gb(1), # Process memory is low max_memory_size=ByteSize.from_gb(8), # Max memory is high max_used_memory_ratio=0.8, # Ratio is fine created_at=now - timedelta(minutes=1), system_wide_used_size=ByteSize.from_gb(31), # System-wide used is high system_wide_memory_size=ByteSize.from_gb(32), # System-wide total (31/32 = 96.875% < 97%) ), MemorySnapshot( current_size=ByteSize.from_gb(1), # Process memory is low max_memory_size=ByteSize.from_gb(8), # Max memory is high max_used_memory_ratio=0.8, # Ratio is fine created_at=now, system_wide_used_size=ByteSize.from_gb(31.5), # System-wide used is high system_wide_memory_size=ByteSize.from_gb(32), # System-wide total (31.5/32 = 98.4% > 97%) ), ] ) memory_info = system_status._is_memory_overloaded() # Should be overloaded due to system-wide memory usage exceeding 97% threshold assert memory_info.is_overloaded is True # The actual ratio should be 1.0 (the entire time period from first to second snapshot is overloaded) assert memory_info.actual_ratio == 1.0 assert memory_info.limit_ratio == 0.5 ================================================ FILE: tests/unit/_statistics/test_error_tracker.py ================================================ import traceback import pytest from crawlee.statistics._error_tracker import ErrorTracker @pytest.mark.parametrize( ('error_tracker', 
'expected_unique_errors'), [ (ErrorTracker(), 5), (ErrorTracker(show_file_and_line_number=False), 4), (ErrorTracker(show_error_name=False), 4), (ErrorTracker(show_error_message=False), 3), (ErrorTracker(show_error_name=False, show_file_and_line_number=False), 3), (ErrorTracker(show_file_and_line_number=False, show_error_message=False), 2), (ErrorTracker(show_error_name=False, show_file_and_line_number=False, show_error_message=False), 1), ], ) async def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_errors: int) -> None: """Use different settings of `error_tracker` and test unique errors count.""" for error in [ Exception('Some value error abc'), ValueError('Some value error abc'), # Different type, different error ValueError('Some value error cde'), # Same type and similar message to previous, considered the same. ValueError( 'Another value error efg' ), # Same type, but too different message to previous, considered different. ValueError(), # Same type but don't have message, considered different. ]: try: raise error # Errors raised on same line except Exception as e: # noqa:PERF203 await error_tracker.add(e) try: raise ValueError('Some value error abc') # Same as one previous error, but different line. 
except Exception as e: await error_tracker.add(e) assert error_tracker.total == 6 assert error_tracker.unique_error_count == expected_unique_errors @pytest.mark.parametrize( ('message_1', 'message_2', 'expected_generic_message'), [ ('Some error number 123', 'Some error number 456', 'Some error number ***'), ('Some error number 123 456', 'Some error number 123 456 789', 'Some error number 123 456 ***'), ('Some error number 0 0 0', 'Some error number 1 0 1', 'Some error number *** 0 ***'), ], ) async def test_error_tracker_similar_messages_full_stack( message_1: str, message_2: str, expected_generic_message: str ) -> None: """Test that similar messages collapse into same group with generic name that contains wildcard symbols.""" error_tracker = ErrorTracker() for error in [ KeyError(message_1), KeyError(message_1), KeyError(message_1), ValueError(message_1), ValueError(message_2), RuntimeError(message_2), ]: try: raise error # Errors raised on the same line except Exception as e: # noqa:PERF203 await error_tracker.add(e) line = traceback.extract_tb(e.__traceback__)[0].lineno file_name = __file__.split('/')[-1] errors = error_tracker.get_most_common_errors() assert errors[0][0] == f'{file_name}:{line}:KeyError:{message_1}' assert errors[0][1] == 3 assert errors[1][0] == f'{file_name}:{line}:ValueError:{expected_generic_message}' assert errors[1][1] == 2 assert errors[2][0] == f'{file_name}:{line}:RuntimeError:{message_2}' assert errors[2][1] == 1 @pytest.mark.parametrize( ('show_full_message', 'expected_message'), [ (True, 'Error line 1\n Error line 2'), (False, 'Error line 1'), ], ) async def test_show_full_message(*, show_full_message: bool, expected_message: str) -> None: """Test error message settings with both options of `show_full_message`.""" error_tracker = ErrorTracker( show_error_name=False, show_file_and_line_number=False, show_full_message=show_full_message ) try: raise RuntimeError('Error line 1\n Error line 2') # Errors raised on the same line except 
Exception as e: await error_tracker.add(e) assert error_tracker.get_most_common_errors()[0][0] == expected_message async def test_error_tracker_with_errors_chain() -> None: """Test error tracker with errors chain.""" error_tracker = ErrorTracker(show_error_name=False, show_file_and_line_number=False, show_full_message=True) try: raise ZeroDivisionError('Zero division error') # Errors raised on the same line except Exception as e: try: raise ValueError from e except Exception as e: await error_tracker.add(e) assert error_tracker.get_most_common_errors()[0][0] == 'Zero division error' ================================================ FILE: tests/unit/_statistics/test_periodic_logging.py ================================================ from __future__ import annotations import asyncio import logging from datetime import timedelta from typing import TYPE_CHECKING from crawlee.statistics import Statistics if TYPE_CHECKING: import pytest async def test_periodic_logging(caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.INFO) log_message = 'Periodic statistics XYZ' statistics = Statistics.with_default_state(log_interval=timedelta(milliseconds=50), log_message=log_message) async with statistics: await asyncio.sleep(0.1) matching_records = [rec for rec in caplog.records if rec.message.startswith(log_message)] assert len(matching_records) >= 1 ================================================ FILE: tests/unit/_statistics/test_persistence.py ================================================ from __future__ import annotations from crawlee.statistics import Statistics async def test_basic_persistence() -> None: key = 'statistics_foo' async with Statistics.with_default_state(persistence_enabled=True, persist_state_key=key) as statistics: statistics.state.requests_failed = 42 async with Statistics.with_default_state(persistence_enabled=True, persist_state_key=key) as statistics: pass assert statistics.state.requests_failed == 42 
================================================ FILE: tests/unit/_statistics/test_request_max_duration.py ================================================ from __future__ import annotations import asyncio from crawlee.statistics import Statistics async def test_request_max_duration_tracks_maximum() -> None: """Test that request_max_duration correctly tracks the maximum duration, not the minimum.""" # asyncio.sleep() can sleep slightly shorter than expected https://bugs.python.org/issue31539#msg302699 asyncio_sleep_time_tolerance = 0.015 sleep_time = 0.05 async with Statistics.with_default_state() as statistics: # Record a short request statistics.record_request_processing_start('request_1') statistics.record_request_processing_finish('request_1') first_duration = statistics.state.request_max_duration # Record a longer request statistics.record_request_processing_start('request_2') await asyncio.sleep(sleep_time) # 50ms delay statistics.record_request_processing_finish('request_2') second_duration = statistics.state.request_max_duration # The max duration should be updated to the longer request's duration assert second_duration is not None assert first_duration is not None assert second_duration >= first_duration assert second_duration.total_seconds() >= (sleep_time - asyncio_sleep_time_tolerance) # Record another short request - max should NOT decrease statistics.record_request_processing_start('request_3') statistics.record_request_processing_finish('request_3') third_duration = statistics.state.request_max_duration # The max duration should remain unchanged (still the longest request) assert third_duration == second_duration ================================================ FILE: tests/unit/_statistics/test_request_processing_record.py ================================================ from datetime import timedelta from crawlee.statistics._statistics import RequestProcessingRecord def test_tracking_time_resolution() -> None: """Test that `RequestProcessingRecord` 
tracks time with sufficient resolution. This is generally not an issue on Linux, but on Windows some packages in older Python versions might be using system timers with not so granular resolution - some sources estimate 15ms. This test will start failing on Windows if unsuitable source of time measurement is selected due to two successive time measurements possibly using same timing sample.""" record = RequestProcessingRecord() record.run() record.finish() assert record.duration assert record.duration > timedelta(seconds=0) ================================================ FILE: tests/unit/_utils/test_byte_size.py ================================================ from __future__ import annotations import pytest from crawlee._utils.byte_size import ByteSize def test_initializations() -> None: assert ByteSize(1024).bytes == 1024 assert ByteSize.from_kb(1).bytes == 1024 assert ByteSize.from_mb(1).bytes == 1024**2 assert ByteSize.from_gb(1).bytes == 1024**3 assert ByteSize.from_tb(1).bytes == 1024**4 with pytest.raises(ValueError, match=r'ByteSize cannot be negative'): ByteSize(-1) def test_conversions() -> None: size = ByteSize.from_mb(2) assert size.to_kb() == 2 * 1024 assert size.to_mb() == 2.0 assert size.to_gb() == 2 / 1024 assert size.to_tb() == 2 / (1024**2) def test_string_representation() -> None: assert str(ByteSize(512)) == '512 B' assert str(ByteSize(2 * 1024)) == '2.00 KB' assert str(ByteSize(3 * 1024**2)) == '3.00 MB' assert str(ByteSize(4 * 1024**3)) == '4.00 GB' assert str(ByteSize(5 * 1024**4)) == '5.00 TB' def test_comparisons() -> None: size1 = ByteSize(1024) size2 = ByteSize(512) assert size1 > size2 assert size1 >= size2 assert size2 < size1 assert size2 <= size1 assert size1 == ByteSize(1024) assert size1 != size2 def test_additions() -> None: # Addition of ByteSize instances size1 = ByteSize(1024) size2 = ByteSize(2048) assert (size1 + size2).bytes == 3072 # Addition of ByteSize instance and an int with pytest.raises(TypeError): _ = size1 + 1024 # 
Addition of ByteSize instance and an float with pytest.raises(TypeError): _ = size2 + 123.45 def test_subtractions() -> None: # Direct subtraction of ByteSize instances size1 = ByteSize(2048) size2 = ByteSize(1024) assert (size1 - size2).bytes == 1024 # Subtraction resulting in a negative value raises ValueError with pytest.raises(ValueError, match=r'Resulting ByteSize cannot be negative'): _ = size2 - size1 # Subtraction of ByteSize instance and an int with pytest.raises(TypeError): _ = size1 - 1024 # Subtraction of ByteSize instance and an float with pytest.raises(TypeError): _ = size2 - 123.45 def test_multiplication() -> None: # Multiplication of ByteSize by an int size = ByteSize(1024) result = size * 2 assert result.bytes == 2048 # Multiplication of ByteSize by a float size_float = ByteSize(1024) result_float = size_float * 1.5 assert result_float.bytes == 1536 # Test reflected multiplication size_reflected = ByteSize(1024) reflected_result = 3 * size_reflected assert reflected_result.bytes == 3072 def test_divisions() -> None: # Division of ByteSize by another ByteSize size1 = ByteSize(2048) size2 = ByteSize(1024) assert (size1 / size2) == 2 # Division by zero when the divisor is a ByteSize with zero bytes with pytest.raises(ZeroDivisionError): _ = size1 / ByteSize(0) # Division of ByteSize - multiplying by a float assert (size1 * 0.5).bytes == 1024 ================================================ FILE: tests/unit/_utils/test_console.py ================================================ from __future__ import annotations from crawlee._utils.console import make_table def test_empty_input() -> None: assert make_table([]) == '' def test_empty_row() -> None: assert make_table([()]) == '' def test_single_column() -> None: result = make_table([('test',)]) lines = result.split('\n') assert len(lines) == 3 assert lines[1] == '│ test │' def test_two_columns() -> None: data = [('Name', 'Age'), ('Alice', '30'), ('Bob', '25')] result = make_table(data) lines = 
result.split('\n') # fmt: off assert lines == ['┌───────┬─────┐', '│ Name │ Age │', '│ Alice │ 30 │', '│ Bob │ 25 │', '└───────┴─────┘'] # fmt: on def test_long_content_truncation() -> None: data = [('Short', 'VeryVeryVeryLongContent')] result = make_table(data, width=25) lines = result.split('\n') # fmt: off assert lines == ['┌───────────┬───────────┐', '│ Short │ VeryVe... │', '└───────────┴───────────┘'] # fmt: on ================================================ FILE: tests/unit/_utils/test_crypto.py ================================================ from __future__ import annotations from crawlee._utils.crypto import compute_short_hash, crypto_random_object_id def test_crypto_random_object_id_default_length() -> None: object_id = crypto_random_object_id() assert len(object_id) == 17, 'Default generated object ID should have a length of 17 characters.' def test_crypto_random_object_id_custom_length() -> None: for length in [5, 10, 20, 100]: object_id = crypto_random_object_id(length) assert len(object_id) == length, f'Generated object ID should have a length of {length} characters.' def test_crypto_random_object_id_character_set() -> None: long_random_object_id = crypto_random_object_id(1000) allowed_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' for char in long_random_object_id: assert char in allowed_chars, f"Character '{char}' is not in the expected alphanumeric range." def test_compute_short_hash_with_known_input() -> None: data = b'Hello world!' expected_hash = 'c0535e4b' assert compute_short_hash(data) == expected_hash, 'The hash does not match the expected output.' def test_compute_short_hash_with_empty_input() -> None: data = b'' expected_hash = 'e3b0c442' assert compute_short_hash(data) == expected_hash, 'The hash for an empty input should follow the expected pattern.' 
def test_compute_short_hash_output_length() -> None: data = b'some random data' assert len(compute_short_hash(data)) == 8, 'The output hash should be 8 characters long.' def test_compute_short_hash_differentiates_input() -> None: data1 = b'input 1' data2 = b'input 2' assert compute_short_hash(data1) != compute_short_hash(data2), 'Different inputs should produce different hashes.' ================================================ FILE: tests/unit/_utils/test_file.py ================================================ from __future__ import annotations from datetime import datetime, timezone from crawlee._utils.file import json_dumps async def test_json_dumps() -> None: assert await json_dumps({'key': 'value'}) == '{\n "key": "value"\n}' assert await json_dumps(['one', 2, 3.0]) == '[\n "one",\n 2,\n 3.0\n]' assert await json_dumps('string') == '"string"' assert await json_dumps(123) == '123' assert await json_dumps(datetime(2022, 1, 1, tzinfo=timezone.utc)) == '"2022-01-01 00:00:00+00:00"' ================================================ FILE: tests/unit/_utils/test_globs.py ================================================ from __future__ import annotations from crawlee._utils.globs import Glob def test_asterisk() -> None: glob = Glob('foo/*') assert glob.regexp.match('bar/') is None assert glob.regexp.match('foo/bar') is not None assert glob.regexp.match('foo/bar/baz') is None def test_double_asteritsk() -> None: glob = Glob('foo/**') assert glob.regexp.match('bar/') is None assert glob.regexp.match('foo/bar') is not None assert glob.regexp.match('foo/bar/baz') is not None ================================================ FILE: tests/unit/_utils/test_html_to_text.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING import pytest from bs4 import BeautifulSoup from parsel import Selector from crawlee.crawlers._beautifulsoup._utils import html_to_text as html_to_text_beautifulsoup from 
crawlee.crawlers._parsel._utils import html_to_text as html_to_text_parsel if TYPE_CHECKING: from collections.abc import Callable _EXPECTED_TEXT = ( "Let's start with a simple text. \n" "The ships hung in the sky, much the way that bricks don't. \n" "These aren't the Droids you're looking for\n" "I'm sorry, Dave. I'm afraid I can't do that.\n" "I'm sorry, Dave. I'm afraid I can't do that.\n" 'A1\tA2\tA3\t\n' 'B1\tB2\tB3\tB 4\t\n' 'This is some text with inline elements and HTML entities (>bla<) \n' 'Test\n' 'a\n' 'few\n' 'line\n' 'breaks\n' 'Spaces in an inline text should be completely ignored. \n' 'But,\n' ' a pre-formatted\n' ' block should be kept\n' ' pre-formatted.\n' 'The Greatest Science Fiction Quotes Of All Time \n' "Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You " 'Nexus, huh? I design your eyes.' ) _EXAMPLE_HTML = """ <html> <head> <title>Title SHOULD NOT be converted Let's start with a simple text.

The ships hung in the sky, much the way that bricks don't.

This should be ignored
A1 A2 A3
B1 B2 B3 B 4

This is some text with inline elements and HTML entities (>bla<)

Test
a
few
line
breaks
Spaces in an inline text should be completely ignored.
But,
    a pre-formatted
                block  should  be  kept
                                       pre-formatted.
These special elements SHOULD NOT BE CONVERTED. This should be skipped too. The Greatest Science Fiction Quotes Of All Time

Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You Nexus, huh? I design your eyes.

""" @pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup]) @pytest.mark.parametrize( ('source', 'expected_text'), [ pytest.param(_EXAMPLE_HTML, _EXPECTED_TEXT, id='Complex html'), (' Plain text node ', 'Plain text node'), (' \nPlain text node \n ', 'Plain text node'), ('

Header 1

Header 2

', 'Header 1\nHeader 2'), ('

Header 1

Header 2


', 'Header 1\nHeader 2'), ('

Header 1

Header 2



', 'Header 1\nHeader 2'), ('

Header 1

Header 2




', 'Header 1\nHeader 2'), ('

Header 1


Header 2




', 'Header 1\n\nHeader 2'), ('

Header 1


Header 2




', 'Header 1\n\nHeader 2'), ('

Header 1

\n
\n

Header 2




', 'Header 1\n\nHeader 2'), ('

Header 1

\n
\n

Header 2




', 'Header 1\n\n\nHeader 2'), ('

Header 1

\n
\n

Header 2




', 'Header 1\n\n\n\nHeader 2'), ('
Div

Paragraph

', 'Div\nParagraph'), ('
Div1
Div2
', 'Div1\nDiv2'), ('
Div1
', 'Div1'), ('
Div1
', 'Div1'), ('
Div1
', 'Div1'), ('Skip svg
Div1
', 'Div1'), ('Skip canvas
Div1
', 'Div1'), ('A B C D E\n\nF G', 'A B C D E F G'), ('
A  B  C  D  E\n\nF  G
', 'A B C D E\n\nF G'), ( '

Heading 1

Deep Div

Heading 2

', 'Heading 1\nDeep Div\nHeading 2', ), ('this_word_should_be_one', 'this_word_should_be_one'), ('some text', 'some text'), pytest.param( ( """
Cell A1Cell A2 Cell A3
Cell B1Cell B2
""" ), 'Cell A1\tCell A2\tCell A3 \t\nCell B1\tCell B2', id='Table', ), ('á é', 'á é'), ], ) def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[str], str]) -> None: assert html_to_text(source) == expected_text @pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup]) def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None: with pytest.raises(TypeError): # Intentional wrong type test. html_to_text(1) # ty: ignore[invalid-argument-type] def test_html_to_text_parsel() -> None: assert html_to_text_parsel(Selector(_EXAMPLE_HTML)) == _EXPECTED_TEXT def test_html_to_text_beautifulsoup() -> None: assert html_to_text_beautifulsoup(BeautifulSoup(_EXAMPLE_HTML, features='lxml')) == _EXPECTED_TEXT ================================================ FILE: tests/unit/_utils/test_measure_time.py ================================================ from __future__ import annotations import asyncio import time from crawlee._utils.time import measure_time def test_measure_time_wall_sync() -> None: with measure_time() as elapsed: time.sleep(0.1) assert elapsed.cpu is not None assert elapsed.wall is not None assert elapsed.wall >= 0.09 def test_measure_time_cpu_sync() -> None: with measure_time() as elapsed: start = time.time() acc = 0 while time.time() - start < 0.1: acc += 1 acc *= acc assert elapsed.cpu is not None assert elapsed.wall is not None # Just verify that CPU time is measured and is positive. 
assert elapsed.cpu > 0 async def test_measure_time_wall_async() -> None: with measure_time() as elapsed: await asyncio.sleep(0.1) assert elapsed.cpu is not None assert elapsed.wall is not None assert elapsed.wall >= 0.09 ================================================ FILE: tests/unit/_utils/test_raise_if_too_many_kwargs.py ================================================ from contextlib import nullcontext from typing import Any import pytest from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs @pytest.mark.parametrize( ('kwargs', 'should_raise'), [ ({'alias': 'alias', 'name': None, 'id': None}, False), ({'alias': None, 'name': 'name', 'id': None}, False), ({'alias': None, 'name': None, 'id': 'id'}, False), ({'alias': 'alias', 'name': 'name', 'id': None}, True), ({'alias': 'alias', 'name': None, 'id': 'id'}, True), ({'alias': None, 'name': 'name', 'id': 'id'}, True), ({'alias': 'alias', 'name': 'name', 'id': 'id'}, True), ({'alias': None, 'name': None, 'id': None}, False), ], ) def test_limit_kwargs_default(kwargs: dict[str, Any], *, should_raise: bool) -> None: context = pytest.raises(ValueError, match=r'^Only one of .*') if should_raise else nullcontext() with context: raise_if_too_many_kwargs(**kwargs) @pytest.mark.parametrize( ('kwargs', 'should_raise'), [ ({'alias': 'alias', 'name': 'name', 'id': 'id'}, True), ({'alias': 'alias', 'name': 'name', 'id': None}, False), ], ) def test_limit_kwargs(kwargs: dict[str, Any], *, should_raise: bool) -> None: context = pytest.raises(ValueError, match=r'^Only one of .*') if should_raise else nullcontext() with context: raise_if_too_many_kwargs(max_kwargs=2, **kwargs) ================================================ FILE: tests/unit/_utils/test_recurring_task.py ================================================ from __future__ import annotations import asyncio from datetime import timedelta from unittest.mock import AsyncMock import pytest from crawlee._utils.recurring_task import RecurringTask 
@pytest.fixture def function() -> AsyncMock: mock_function = AsyncMock() mock_function.__name__ = 'mocked_function' # To avoid issues with the function name in RecurringTask return mock_function @pytest.fixture def delay() -> timedelta: return timedelta(milliseconds=30) async def test_init(function: AsyncMock, delay: timedelta) -> None: rt = RecurringTask(function, delay) assert rt.func == function assert rt.delay == delay assert rt.task is None async def test_start_and_stop(function: AsyncMock, delay: timedelta) -> None: rt = RecurringTask(function, delay) rt.start() await asyncio.sleep(0) # Yield control to allow the task to start assert isinstance(rt.task, asyncio.Task) assert not rt.task.done() await rt.stop() assert rt.task.done() @pytest.mark.run_alone async def test_execution(function: AsyncMock, delay: timedelta) -> None: task = RecurringTask(function, delay) task.start() await asyncio.sleep(0.2) # Wait enough for the task to execute a few times await task.stop() assert isinstance(task.func, AsyncMock) # To let type checker know that the function is a mock assert task.func.call_count >= 3 await task.stop() ================================================ FILE: tests/unit/_utils/test_requests.py ================================================ from __future__ import annotations import pytest from crawlee._types import HttpHeaders from crawlee._utils.requests import compute_unique_key, normalize_url @pytest.mark.parametrize( ('url', 'expected_output', 'keep_url_fragment'), [ ('https://example.com/?utm_source=test&utm_medium=test&key=value', 'https://example.com/?key=value', False), ( 'http://example.com/?key=value&another_key=another_value', 'http://example.com/?another_key=another_value&key=value', False, ), ('HTTPS://EXAMPLE.COM/?KEY=VALUE', 'https://example.com/?key=value', False), ('', '', False), ('http://example.com/#fragment', 'http://example.com/#fragment', True), ('http://example.com/#fragment', 'http://example.com', False), (' https://example.com/ 
', 'https://example.com', False), ('http://example.com/?b=2&a=1', 'http://example.com/?a=1&b=2', False), ], ids=[ 'remove_utm_params', 'retain_sort_non_utm_params', 'convert_scheme_netloc_to_lowercase', 'handle_empty_url', 'retain_fragment', 'remove_fragment', 'trim_whitespace', 'sort_query_params', ], ) def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: bool) -> None: output = normalize_url(url, keep_url_fragment=keep_url_fragment) assert output == expected_output def test_compute_unique_key_basic() -> None: url = 'https://crawlee.dev' uk_get = compute_unique_key(url, method='GET') uk_post = compute_unique_key(url, method='POST') assert url == uk_get == uk_post def test_compute_unique_key_handles_fragments() -> None: url = 'https://crawlee.dev/#fragment' uk_with_fragment = compute_unique_key(url, keep_url_fragment=True) assert uk_with_fragment == url uk_without_fragment = compute_unique_key(url, 'GET', keep_url_fragment=False) assert uk_without_fragment == 'https://crawlee.dev' def test_compute_unique_key_handles_payload() -> None: url = 'https://crawlee.dev' payload = b'{"key": "value"}' # Payload without extended unique key uk = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=False) assert uk == url # Extended unique key and payload is None uk = compute_unique_key(url, method='POST', payload=None, use_extended_unique_key=True) assert uk == 'POST|e3b0c442|e3b0c442|https://crawlee.dev' # Extended unique key and payload is bytes uk = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=True) assert uk == 'POST|e3b0c442|9724c1e2|https://crawlee.dev' def test_compute_unique_key_handles_headers() -> None: url = 'https://crawlee.dev' headers = HttpHeaders({'Accept': '*/*', 'Content-Type': 'application/json'}) uk = compute_unique_key(url, headers=headers, use_extended_unique_key=False) assert uk == url extended_uk_expected = 'GET|4e1a2cf6|e3b0c442|https://crawlee.dev' uk = 
compute_unique_key(url, headers=headers, use_extended_unique_key=True) assert uk == extended_uk_expected # Accept-Encoding header should not be included. headers = HttpHeaders({'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Content-Type': 'application/json'}) uk = compute_unique_key(url, headers=headers, use_extended_unique_key=True) assert uk == extended_uk_expected def test_compute_unique_key_complex() -> None: url = 'https://crawlee.dev' headers = HttpHeaders({'Accept': '*/*', 'Content-Type': 'application/json'}) payload = b'{"key": "value"}' uk = compute_unique_key( url, method='POST', headers=headers, payload=payload, session_id='test_session', use_extended_unique_key=False, ) assert uk == url extended_uk = compute_unique_key( url, method='POST', headers=headers, payload=payload, session_id='test_session', use_extended_unique_key=True, ) assert extended_uk == 'POST|4e1a2cf6|9724c1e2|test_session|https://crawlee.dev' def test_compute_unique_key_post_with_none_payload() -> None: url = 'https://crawlee.dev' expected_output = 'POST|e3b0c442|e3b0c442|https://crawlee.dev' output = compute_unique_key(url, 'POST', payload=None, use_extended_unique_key=True) assert output == expected_output def test_compute_unique_key_with_whitespace_in_headers() -> None: url = 'https://crawlee.dev' headers = HttpHeaders({'Content-Type': 'application/json'}) headers_with_whitespaces = HttpHeaders({'Content-Type': ' application/json '}) expected_output = 'GET|60d83e70|e3b0c442|https://crawlee.dev' uk_1 = compute_unique_key(url, headers=headers, use_extended_unique_key=True) assert uk_1 == expected_output uk_2 = compute_unique_key(url, headers=headers_with_whitespaces, use_extended_unique_key=True) assert uk_2 == expected_output ================================================ FILE: tests/unit/_utils/test_robots.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING from crawlee._utils.robots import RobotsTxtFile if 
TYPE_CHECKING: from yarl import URL from crawlee.http_clients._base import HttpClient async def test_generation_robots_txt_url(server_url: URL, http_client: HttpClient) -> None: robots_file = await RobotsTxtFile.find(str(server_url), http_client) assert len(robots_file.get_sitemaps()) > 0 async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClient) -> None: robots = await RobotsTxtFile.find(str(server_url), http_client) assert robots.is_allowed('https://crawlee.dev') assert robots.is_allowed(str(server_url / 'something/page.html')) assert robots.is_allowed(str(server_url / 'deny_googlebot/page.html')) assert not robots.is_allowed(str(server_url / 'deny_all/page.html')) async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) -> None: robots = await RobotsTxtFile.find(str(server_url), http_client) assert len(robots.get_sitemaps()) == 2 assert set(robots.get_sitemaps()) == {'http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml'} async def test_parse_from_content() -> None: content = """User-agent: * Disallow: *deny_all/ crawl-delay: 10 User-agent: Googlebot Disallow: *deny_googlebot/""" robots = await RobotsTxtFile.from_content('http://not-exists.com/robots.txt', content) assert robots.is_allowed('http://not-exists.com/something/page.html') assert robots.is_allowed('http://not-exists.com/deny_googlebot/page.html') assert not robots.is_allowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot') assert not robots.is_allowed('http://not-exists.com/deny_all/page.html') async def test_bind_robots_txt_url() -> None: content = 'User-agent: *\nDisallow: /' robots = await RobotsTxtFile.from_content('http://check.com/robots.txt', content) assert not robots.is_allowed('http://check.com/test.html') assert robots.is_allowed('http://othercheck.com/robots.txt') ================================================ FILE: tests/unit/_utils/test_shared_timeout.py ================================================ 
import asyncio from datetime import timedelta import pytest from crawlee._utils.time import SharedTimeout, measure_time async def test_shared_timeout_tracks_elapsed_time() -> None: timeout_duration = timedelta(seconds=1) shared_timeout = SharedTimeout(timeout_duration) # First usage async with shared_timeout: await asyncio.sleep(0.2) # Second usage - should have less time remaining async with shared_timeout as remaining: assert remaining < timedelta(seconds=0.85) assert remaining > timedelta(seconds=0) async def test_shared_timeout_expires() -> None: timeout_duration = timedelta(seconds=0.1) shared_timeout = SharedTimeout(timeout_duration) with measure_time() as elapsed, pytest.raises(asyncio.TimeoutError): async with shared_timeout: await asyncio.sleep(0.5) assert elapsed.wall is not None assert elapsed.wall < 0.3 async def test_shared_timeout_cannot_be_nested() -> None: timeout_duration = timedelta(seconds=1) shared_timeout = SharedTimeout(timeout_duration) async with shared_timeout: with pytest.raises(RuntimeError, match='cannot be entered twice'): async with shared_timeout: pass async def test_shared_timeout_multiple_sequential_uses() -> None: """Test that SharedTimeout can be used multiple times sequentially.""" timeout_duration = timedelta(seconds=1) shared_timeout = SharedTimeout(timeout_duration) for _ in range(5): async with shared_timeout: await asyncio.sleep(0.05) # Should have consumed roughly 0.25 seconds async with shared_timeout as remaining: assert remaining < timedelta(seconds=0.8) assert remaining > timedelta(seconds=0) ================================================ FILE: tests/unit/_utils/test_sitemap.py ================================================ import base64 import gzip from datetime import datetime from typing import Any from unittest.mock import AsyncMock, MagicMock from yarl import URL from crawlee._utils.sitemap import Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap from crawlee.http_clients._base import HttpClient, 
HttpResponse BASIC_SITEMAP = """ http://not-exists.com/ 2005-02-03 monthly 0.8 http://not-exists.com/catalog?item=12&desc=vacation_hawaii weekly http://not-exists.com/catalog?item=73&desc=vacation_new_zealand 2004-12-23 weekly http://not-exists.com/catalog?item=74&desc=vacation_newfoundland 2004-12-23T18:00:15+00:00 0.3 http://not-exists.com/catalog?item=83&desc=vacation_usa 2004-11-23 """.strip() BASIC_RESULTS = { 'http://not-exists.com/', 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii', 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand', 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland', 'http://not-exists.com/catalog?item=83&desc=vacation_usa', } def _make_mock_client(url_map: dict[str, tuple[int, bytes]]) -> AsyncMock: async def send_request(url: str, **_kwargs: Any) -> HttpResponse: status, body = 404, b'' for pattern, (s, b) in url_map.items(): if pattern in url: status, body = s, b break response = MagicMock(spec=HttpResponse) response.status_code = status response.read = AsyncMock(return_value=body) return response client = AsyncMock(spec=HttpClient) client.send_request.side_effect = send_request return client def compress_gzip(data: str) -> bytes: """Compress a string using gzip.""" return gzip.compress(data.encode()) def encode_base64(data: bytes) -> str: """Encode bytes to a base64 string.""" return base64.b64encode(data).decode('utf-8') async def test_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a basic sitemap.""" sitemap_url = (server_url / 'sitemap.xml').with_query( base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8' ) sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS async def test_extract_metadata_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test extracting item metadata from a sitemap.""" sitemap_url = (server_url / 
'sitemap.xml').with_query( base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8' ) items = [item async for item in parse_sitemap([{'type': 'url', 'url': str(sitemap_url)}], http_client=http_client)] assert len(items) == 5 assert items[0] == SitemapUrl( loc='http://not-exists.com/', priority=0.8, changefreq='monthly', lastmod=datetime.fromisoformat('2005-02-03'), origin_sitemap_url=str(sitemap_url), ) async def test_gzipped_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a gzipped sitemap with correct type and .xml.gz url.""" gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=gzipped_data, c_type='application/gzip') sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: HttpClient) -> None: """Test loading a invalid gzipped sitemap with correct type and .xml.gz url.""" compress_data = compress_gzip(BASIC_SITEMAP) invalid_gzipped_data = encode_base64(compress_data[:30]) sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=invalid_gzipped_data, c_type='application/gzip') sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 0 assert sitemap.urls == [] async def test_gz_sitemap_with_non_gzipped(server_url: URL, http_client: HttpClient) -> None: """Test loading a sitemap with gzip type and .xml.gz url, but without gzipped data.""" sitemap_url = (server_url / 'sitemap.xml.gz').with_query( base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/gzip' ) sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS async def test_gzipped_sitemap_with_bad_type(server_url: URL, http_client: HttpClient) -> None: """Test loading a gzipped 
sitemap with bad type and .xml.gz url.""" gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) sitemap_url = (server_url / 'sitemap.xml.gz').with_query( base64=gzipped_data, c_type='application/xml; charset=utf-8' ) sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS async def test_xml_sitemap_with_gzipped_data(server_url: URL, http_client: HttpClient) -> None: """Test loading a gzipped sitemap with correct type and .xml url.""" gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) sitemap_url = (server_url / 'sitemap.xml').with_query(base64=gzipped_data, c_type='application/gzip') sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS async def test_parent_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a parent sitemap that references child sitemaps.""" parent_sitemap = """ {child_sitemap} 2004-12-23 {child_sitemap_2} 2004-12-23 """.strip() child_sitemap = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) child_sitemap_2 = (server_url / 'sitemap.xml.gz').with_query(base64=encode_base64(compress_gzip(BASIC_SITEMAP))) parent_sitemap_content = parent_sitemap.format(child_sitemap=child_sitemap, child_sitemap_2=child_sitemap_2) encoded_parent_sitemap_content = encode_base64(parent_sitemap_content.encode()) parent_sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_parent_sitemap_content) sitemap = await Sitemap.load(str(parent_sitemap_url), http_client=http_client) assert len(sitemap.urls) == 10 assert set(sitemap.urls) == BASIC_RESULTS async def test_non_sitemap_url(server_url: URL, http_client: HttpClient) -> None: """Test loading a URL that does not point to a sitemap.""" sitemap = await Sitemap.load(str(server_url), http_client=http_client) assert len(sitemap.urls) == 0 assert sitemap.urls == [] 
async def test_cdata_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a sitemap with CDATA sections.""" cdata_sitemap = """ """.strip() sitemap_url = (server_url / 'sitemap.xml').with_query( base64=encode_base64(cdata_sitemap.encode()), c_type='application/xml; charset=utf-8' ) sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 1 assert sitemap.urls == ['http://not-exists.com/catalog'] async def test_txt_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a plain text sitemap.""" urls = [ 'http://not-exists.com/catalog?item=78&desc=vacation_crete', 'http://not-exists.com/catalog?item=79&desc=vacation_somalia', ] txt_sitemap_content = '\n'.join(urls) sitemap_url = (server_url / 'sitemap.txt').with_query(base64=encode_base64(txt_sitemap_content.encode())) sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 2 assert set(sitemap.urls) == { 'http://not-exists.com/catalog?item=78&desc=vacation_crete', 'http://not-exists.com/catalog?item=79&desc=vacation_somalia', } async def test_sitemap_pretty(server_url: URL, http_client: HttpClient) -> None: """Test loading a pretty-printed sitemap.""" pretty_sitemap = """ http://not-exists.com/catalog?item=80&desc=vacation_turkey 2005-02-03 monthly 0.8 """.strip() sitemap_url = (server_url / 'sitemap.xml').with_query( base64=encode_base64(pretty_sitemap.encode()), c_type='application/xml; charset=utf-8' ) sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 1 assert sitemap.urls == ['http://not-exists.com/catalog?item=80&desc=vacation_turkey'] async def test_sitemap_from_string() -> None: """Test creating a Sitemap instance from an XML string.""" sitemap = await Sitemap.from_xml_string(BASIC_SITEMAP) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS async def test_discover_sitemap_from_robots_txt() -> None: """Sitemap URL 
found in robots.txt is yielded.""" robots_content = b'User-agent: *\nSitemap: http://example.com/custom-sitemap.xml' http_client = _make_mock_client({'robots.txt': (200, robots_content)}) urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)] assert urls == ['http://example.com/custom-sitemap.xml'] async def test_discover_sitemap_from_common_paths() -> None: """Sitemap is found at common paths when robots.txt has none.""" http_client = _make_mock_client( {'/sitemap.xml': (200, b''), '/sitemap.txt': (200, b''), '/sitemap_index.xml': (200, b'')} ) urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)] assert urls == [ 'http://example.com/sitemap.xml', 'http://example.com/sitemap.txt', 'http://example.com/sitemap_index.xml', ] async def test_discover_sitemap_from_input_url() -> None: """Input URL that is already a sitemap is yielded directly without checking common paths.""" http_client = _make_mock_client({'/sitemap.txt': (200, b'')}) urls = [url async for url in discover_valid_sitemaps(['http://example.com/sitemap.xml'], http_client=http_client)] assert urls == ['http://example.com/sitemap.xml'] async def test_discover_sitemap_deduplication() -> None: """Sitemap URL found in robots.txt is not yielded again from common paths check.""" robots_content = b'User-agent: *\nSitemap: http://example.com/sitemap.xml' http_client = _make_mock_client( { 'robots.txt': (200, robots_content), '/sitemap.xml': (200, b''), } ) urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)] assert urls == ['http://example.com/sitemap.xml'] async def test_discover_sitemaps_multiple_domains() -> None: """Sitemaps from multiple domains are all discovered.""" http_client = _make_mock_client( { 'domain-a.com/sitemap.xml': (200, b''), 'domain-b.com/sitemap.xml': (200, b''), } ) urls = [ url async for url in discover_valid_sitemaps( 
['http://domain-a.com/page', 'http://domain-b.com/page'], http_client=http_client, ) ] assert set(urls) == { 'http://domain-a.com/sitemap.xml', 'http://domain-b.com/sitemap.xml', } async def test_discover_sitemap_url_without_host_skipped() -> None: """URLs without a host are skipped.""" http_client = _make_mock_client({}) urls = [url async for url in discover_valid_sitemaps(['not-a-valid-url'], http_client=http_client)] assert urls == [] ================================================ FILE: tests/unit/_utils/test_system.py ================================================ from __future__ import annotations import sys from multiprocessing import get_context, synchronize from multiprocessing.shared_memory import SharedMemory from typing import TYPE_CHECKING import pytest from crawlee._utils.byte_size import ByteSize from crawlee._utils.system import get_cpu_info, get_memory_info if TYPE_CHECKING: from collections.abc import Callable def test_get_memory_info_returns_valid_values() -> None: memory_info = get_memory_info() assert ByteSize(0) < memory_info.total_size < ByteSize.from_tb(1) assert memory_info.current_size < memory_info.total_size def test_get_cpu_info_returns_valid_values() -> None: cpu_info = get_cpu_info() assert 0 <= cpu_info.used_ratio <= 1 @pytest.mark.skipif(sys.platform != 'linux', reason='Improved estimation available only on Linux') def test_memory_estimation_does_not_overestimate_due_to_shared_memory() -> None: """Test that memory usage estimation is not overestimating memory usage by counting shared memory multiple times. In this test, the parent process is started and its memory usage is measured in situations where it is running child processes without additional memory, with shared additional memory and with own unshared additional memory. Child process without additional memory are used to estimate baseline memory usage of any child process. 
The following estimation is asserted by the test: additional_memory_size_estimate_per_shared_memory_child * number_of_sharing_children_processes is approximately equal to additional_memory_size_estimate_per_unshared_memory_child where the additional shared memory is exactly the same as the unshared memory. """ ctx = get_context('fork') estimated_memory_expectation = ctx.Value('b', False) # noqa: FBT003 # Common usage pattern for multiprocessing.Value def parent_process() -> None: extra_memory_size = 1024 * 1024 * 100 # 100 MB children_count = 4 # Memory calculation is not exact, so allow for some tolerance. test_tolerance = 0.3 def no_extra_memory_child(ready: synchronize.Barrier, measured: synchronize.Barrier) -> None: ready.wait() measured.wait() def extra_memory_child(ready: synchronize.Barrier, measured: synchronize.Barrier) -> None: memory = SharedMemory(size=extra_memory_size, create=True) assert memory.buf is not None memory.buf[:] = bytearray([255 for _ in range(extra_memory_size)]) print(f'Using the memory... {memory.buf[-1]}') ready.wait() measured.wait() memory.close() memory.unlink() def shared_extra_memory_child( ready: synchronize.Barrier, measured: synchronize.Barrier, memory: SharedMemory ) -> None: assert memory.buf is not None print(f'Using the memory... 
{memory.buf[-1]}') ready.wait() measured.wait() def get_additional_memory_estimation_while_running_processes( *, target: Callable, count: int = 1, use_shared_memory: bool = False ) -> float: processes = [] ready = ctx.Barrier(parties=count + 1) measured = ctx.Barrier(parties=count + 1) shared_memory: None | SharedMemory = None memory_before = get_memory_info().current_size if use_shared_memory: shared_memory = SharedMemory(size=extra_memory_size, create=True) assert shared_memory.buf is not None shared_memory.buf[:] = bytearray([255 for _ in range(extra_memory_size)]) extra_args = [shared_memory] else: extra_args = [] for _ in range(count): p = ctx.Process(target=target, args=[ready, measured, *extra_args]) p.start() processes.append(p) ready.wait() memory_during = get_memory_info().current_size measured.wait() for p in processes: p.join() if shared_memory: shared_memory.close() shared_memory.unlink() return (memory_during - memory_before).to_mb() / count additional_memory_simple_child = get_additional_memory_estimation_while_running_processes( target=no_extra_memory_child, count=children_count ) additional_memory_extra_memory_child = ( get_additional_memory_estimation_while_running_processes(target=extra_memory_child, count=children_count) - additional_memory_simple_child ) additional_memory_shared_extra_memory_child = ( get_additional_memory_estimation_while_running_processes( target=shared_extra_memory_child, count=children_count, use_shared_memory=True ) - additional_memory_simple_child ) memory_estimation_difference_ratio = ( abs((additional_memory_shared_extra_memory_child * children_count) - additional_memory_extra_memory_child) / additional_memory_extra_memory_child ) estimated_memory_expectation.value = memory_estimation_difference_ratio < test_tolerance if not estimated_memory_expectation.value: print( f'{additional_memory_shared_extra_memory_child=}\n' f'{children_count=}\n' f'{additional_memory_extra_memory_child=}\n' 
f'{memory_estimation_difference_ratio=}'
            )

    process = ctx.Process(target=parent_process)
    process.start()
    process.join()

    assert estimated_memory_expectation.value, (
        'Estimated memory usage for process with shared memory does not meet the expectation.'
    )


================================================
FILE: tests/unit/_utils/test_timedelta_ms.py
================================================
from __future__ import annotations

from datetime import timedelta
from typing import Any

import pytest
from pydantic import BaseModel

from crawlee._utils.models import timedelta_ms


class _ModelWithTimedeltaMs(BaseModel):
    """Minimal pydantic model with a single optional `timedelta_ms` field, used only by the test below."""

    time_delta: timedelta_ms | None = None


@pytest.mark.parametrize(
    ('time_delta_input', 'expected_time_delta', 'expected_model_dump_value'),
    [
        (1.0, timedelta(milliseconds=1), 1),
        (1, timedelta(milliseconds=1), 1),
        ('1', timedelta(milliseconds=1), 1),
        (timedelta(milliseconds=1), timedelta(milliseconds=1), 1),
        (3.01, timedelta(microseconds=3010), 3),
        (3.5, timedelta(microseconds=3500), 4),
        (3.99, timedelta(microseconds=3990), 4),
        (None, None, None),
        (float('inf'), timedelta(days=999999999, seconds=3600 * 24 - 1, microseconds=999999), float('inf')),
    ],
)
def test_model_with_timedelta_ms_input_types(
    time_delta_input: float | timedelta | Any | None,
    expected_time_delta: timedelta | None,
    expected_model_dump_value: float | None,
) -> None:
    """Check parsing and dumping of a `timedelta_ms` field for every parametrized input.

    Args:
        time_delta_input: Raw value passed to the model constructor (number, numeric string, timedelta or None).
        expected_time_delta: Value the parsed `time_delta` attribute must equal (None for the None case).
        expected_model_dump_value: Value `model_dump()` must return for `time_delta`; the table includes
            whole numbers, `float('inf')` and None.
    """
    model = _ModelWithTimedeltaMs(time_delta=time_delta_input)  # ty: ignore[invalid-argument-type]
    assert model.time_delta == expected_time_delta
    assert model.model_dump() == {'time_delta': expected_model_dump_value}


================================================
FILE: tests/unit/_utils/test_urls.py
================================================
from __future__ import annotations

import pytest
from pydantic import ValidationError

from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute, validate_http_url


def test_is_url_absolute() -> None:
    assert is_url_absolute('http://example.com/path') is True
    assert
is_url_absolute('https://example.com/path') is True assert is_url_absolute('ftp://example.com/path') is True assert is_url_absolute('//example.com/path') is False assert is_url_absolute('/path/to/resource') is False assert is_url_absolute('relative/path/to/resource') is False assert is_url_absolute('example.com/path') is False def test_convert_to_absolute_url() -> None: base_url = 'http://example.com' relative_url = '/path/to/resource' absolute_url = convert_to_absolute_url(base_url, relative_url) assert absolute_url == 'http://example.com/path/to/resource' base_url = 'http://example.com' relative_url = '//example.com/path/to/resource' absolute_url = convert_to_absolute_url(base_url, relative_url) assert absolute_url == 'http://example.com/path/to/resource' base_url = 'http://example.com/base/' relative_url = '../path/to/resource' absolute_url = convert_to_absolute_url(base_url, relative_url) assert absolute_url == 'http://example.com/path/to/resource' def test_validate_http_url() -> None: assert validate_http_url(None) is None valid_url = 'https://example.com' assert validate_http_url(valid_url) == valid_url invalid_url = 'htp://invalid-url' with pytest.raises(ValidationError): validate_http_url(invalid_url) ================================================ FILE: tests/unit/browsers/test_browser_pool.py ================================================ from __future__ import annotations from datetime import timedelta from typing import TYPE_CHECKING from unittest.mock import AsyncMock import pytest from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin from crawlee.browsers._browser_controller import BrowserController from crawlee.browsers._types import CrawleePage from tests.unit.utils import run_alone_on_mac if TYPE_CHECKING: from collections.abc import Mapping from typing import Any from yarl import URL from crawlee.proxy_configuration import ProxyInfo async def test_default_plugin_new_page_creation(server_url: URL) -> None: async with BrowserPool() as 
browser_pool: page_1 = await browser_pool.new_page() await page_1.page.goto(str(server_url)) assert page_1.browser_type == 'chromium' assert page_1.page.url == str(server_url) assert ' None: plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium') plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox') async with BrowserPool([plugin_chromium, plugin_firefox]) as browser_pool: assert browser_pool.plugins == [plugin_chromium, plugin_firefox] page_1 = await browser_pool.new_page() await page_1.page.goto(str(server_url)) assert page_1.browser_type == 'chromium' assert page_1.page.url == str(server_url) assert ' None: plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium') plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox') async with BrowserPool([plugin_chromium, plugin_firefox]) as browser_pool: pages = await browser_pool.new_page_with_each_plugin() assert len(pages) == 2 assert pages[0].browser_type == 'chromium' assert pages[1].browser_type == 'firefox' await pages[0].page.goto(str(server_url)) assert pages[0].page.url == str(server_url) assert ' None: # Use a generous operation timeout so that Firefox has enough time to launch on slow Windows CI. 
async with BrowserPool.with_default_plugin( headless=True, browser_type='firefox', operation_timeout=timedelta(seconds=60) ) as browser_pool: assert len(browser_pool.plugins) == 1 assert isinstance(browser_pool.plugins[0], PlaywrightBrowserPlugin) page = await browser_pool.new_page() assert page.browser_type == 'firefox' await page.page.goto(str(server_url)) assert page.page.url == str(server_url) assert ' None: async with BrowserPool() as browser_pool: page_1 = await browser_pool.new_page() with pytest.raises(ValueError, match=r'Page with ID: .* already exists.'): await browser_pool.new_page(page_id=page_1.id) async def test_new_page_with_invalid_plugin() -> None: plugin_1 = PlaywrightBrowserPlugin(browser_type='chromium') plugin_2 = PlaywrightBrowserPlugin(browser_type='firefox') async with BrowserPool([plugin_1]) as browser_pool: with pytest.raises(ValueError, match=r'Provided browser_plugin is not one of the plugins used by BrowserPool.'): await browser_pool.new_page(browser_plugin=plugin_2) async def test_resource_management(server_url: URL) -> None: playwright_plugin = PlaywrightBrowserPlugin(browser_type='chromium') async with BrowserPool([playwright_plugin]) as browser_pool: page = await browser_pool.new_page() await page.page.goto(str(server_url)) assert page.page.url == str(server_url) assert ' None: plugin = PlaywrightBrowserPlugin() browser_pool = BrowserPool([plugin]) assert browser_pool.active is False with pytest.raises(RuntimeError, match=r'BrowserPool is not active.'): await browser_pool.new_page() with pytest.raises(RuntimeError, match=r'BrowserPool is not active.'): await browser_pool.new_page_with_each_plugin() with pytest.raises(RuntimeError, match=r'BrowserPool is already active.'): async with browser_pool, browser_pool: pass async with browser_pool: assert browser_pool.active is True async def test_with_plugin_contains_page_options(server_url: URL) -> None: plugin = PlaywrightBrowserPlugin(browser_new_context_options={'user_agent': 'My Best 
User-Agent'})

    async with BrowserPool(plugins=[plugin]) as browser_pool:
        test_page = await browser_pool.new_page()
        await test_page.page.goto(str(server_url / 'user-agent'))
        assert 'My Best User-Agent' in await test_page.page.content()
        await test_page.page.close()


@pytest.mark.parametrize(
    ('retire_after_page_count', 'expect_equal_browsers'),
    [
        pytest.param(2, True, id='Two pages opened in the same browser'),
        pytest.param(1, False, id='Each page opened in a new browser.'),
    ],
)
async def test_browser_pool_retire_browser_after_page_count(
    retire_after_page_count: int, *, expect_equal_browsers: bool
) -> None:
    """Open two pages one after another and compare the Playwright contexts they were created in.

    Args:
        retire_after_page_count: Value passed to `BrowserPool(retire_browser_after_page_count=...)`.
        expect_equal_browsers: Whether both pages are expected to share the same context object.
    """
    async with BrowserPool(retire_browser_after_page_count=retire_after_page_count) as browser_pool:
        test_page = await browser_pool.new_page()
        first_browser = test_page.page.context
        await test_page.page.close()

        test_page = await browser_pool.new_page()
        second_browser = test_page.page.context
        await test_page.page.close()

        if expect_equal_browsers:
            assert first_browser is second_browser
        else:
            assert first_browser is not second_browser


async def test_pre_page_create_hook_is_called() -> None:
    """The pre-page-create hook is awaited once and its change to the context options reaches the page."""
    call_mock = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.pre_page_create_hook
        async def hook(
            page_id: str,
            controller: BrowserController,
            browser_new_context_options: dict[str, Any],
            proxy_info: ProxyInfo | None,
        ) -> None:
            await call_mock(page_id, controller, browser_new_context_options, proxy_info)
            # Mutating the options in place is what the assertion on `navigator.userAgent` below verifies.
            browser_new_context_options['user_agent'] = 'Modified User-Agent'
            # The hook runs before the page exists, so the controller must not track any page yet.
            assert len(controller.pages) == 0

        test_page = await browser_pool.new_page()
        user_agent = await test_page.page.evaluate('navigator.userAgent')
        await test_page.page.close()

    assert user_agent == 'Modified User-Agent'
    call_mock.assert_awaited_once()
    page_id, controller, _, proxy_info = call_mock.call_args[0]
    assert isinstance(page_id, str)
    assert test_page.id == page_id
    assert isinstance(controller, BrowserController)
    assert proxy_info is None


async def test_post_page_create_hook_is_called() -> None:
call_mock = AsyncMock() async with BrowserPool() as browser_pool: @browser_pool.post_page_create_hook async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None: await call_mock(crawlee_page, controller) await crawlee_page.page.evaluate('window.__hook_applied = true') assert isinstance(crawlee_page, CrawleePage) assert len(controller.pages) == 1 test_page = await browser_pool.new_page() js_result = await test_page.page.evaluate('window.__hook_applied') await test_page.page.close() assert js_result is True call_mock.assert_awaited_once() crawlee_page, controller = call_mock.call_args[0] assert test_page is crawlee_page assert isinstance(controller, BrowserController) async def test_pre_page_close_hook() -> None: call_mock = AsyncMock() async with BrowserPool() as browser_pool: @browser_pool.pre_page_close_hook async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None: await call_mock(crawlee_page, controller) assert not crawlee_page.page.is_closed() assert len(controller.pages) == 1 test_page = await browser_pool.new_page() await test_page.page.close() call_mock.assert_awaited_once() assert test_page.page.is_closed() async def test_post_page_close_hook() -> None: call_mock = AsyncMock() async with BrowserPool() as browser_pool: @browser_pool.post_page_close_hook async def hook(page_id: str, controller: BrowserController) -> None: await call_mock(page_id, controller) assert len(controller.pages) == 0 test_page = await browser_pool.new_page() await test_page.page.close() page_id, controller = call_mock.call_args[0] call_mock.assert_awaited_once() assert test_page.id == page_id assert isinstance(controller, BrowserController) async def test_page_hooks_execution_order() -> None: call_order: list[str] = [] async with BrowserPool() as browser_pool: @browser_pool.pre_page_create_hook async def pre_create( _page_id: str, _controller: BrowserController, _browser_new_context_options: Mapping[str, Any], _proxy_info: ProxyInfo | 
None,
        ) -> None:
            call_order.append('pre_create')

        @browser_pool.post_page_create_hook
        async def post_create(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            call_order.append('post_create')

        @browser_pool.pre_page_close_hook
        async def pre_close(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            call_order.append('pre_close')

        @browser_pool.post_page_close_hook
        async def post_close(_page_id: str, _controller: BrowserController) -> None:
            call_order.append('post_close')

        # One full page lifecycle: every hook above should append its marker exactly once.
        page = await browser_pool.new_page()
        await page.page.close()

    assert call_order == ['pre_create', 'post_create', 'pre_close', 'post_close']


async def test_multiple_hooks_all_called() -> None:
    """Two hooks registered for the same event both run, in registration order."""
    call_order: list[str] = []

    async with BrowserPool() as browser_pool:

        @browser_pool.post_page_create_hook
        async def first(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            call_order.append('first')

        @browser_pool.post_page_create_hook
        async def second(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            call_order.append('second')

        page = await browser_pool.new_page()
        await page.page.close()

    assert call_order == ['first', 'second']


================================================
FILE: tests/unit/browsers/test_playwright_browser.py
================================================
from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING

import pytest
from playwright.async_api import async_playwright

from crawlee.browsers._playwright_browser import PlaywrightPersistentBrowser

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from playwright.async_api import Playwright


@pytest.fixture
async def playwright() -> AsyncGenerator[Playwright, None]:
    """Yield the object produced by the `async_playwright()` context manager for the duration of a test."""
    async with async_playwright() as playwright:
        yield playwright


async def test_init(playwright: Playwright) -> None:
    browser_type = playwright.chromium
    persist_browser = PlaywrightPersistentBrowser(browser_type, user_data_dir=None, browser_launch_options={})
    assert
persist_browser._browser_type == browser_type
    assert persist_browser.browser_type == browser_type
    assert persist_browser._browser_launch_options == {}
    assert persist_browser._temp_dir is None
    assert persist_browser._user_data_dir is None
    assert persist_browser._is_connected is True
    assert persist_browser.is_connected() is True


async def test_delete_temp_folder_with_close_browser(playwright: Playwright) -> None:
    """With no `user_data_dir`, a temp dir exists after `new_context()` and is gone after `close()`."""
    persist_browser = PlaywrightPersistentBrowser(
        playwright.chromium, user_data_dir=None, browser_launch_options={'headless': True}
    )
    await persist_browser.new_context()
    assert isinstance(persist_browser._temp_dir, Path)
    # Keep our own reference to the path so it can still be checked after the browser is closed.
    current_temp_dir = persist_browser._temp_dir
    assert current_temp_dir.exists()
    await persist_browser.close()
    assert not current_temp_dir.exists()


================================================
FILE: tests/unit/browsers/test_playwright_browser_controller.py
================================================
from __future__ import annotations

import asyncio
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any
from unittest.mock import AsyncMock

import pytest
from playwright.async_api import Browser, BrowserContext, Page, Playwright, async_playwright

from crawlee.browsers import PlaywrightBrowserController, PlaywrightPersistentBrowser

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from pathlib import Path

    from yarl import URL


@pytest.fixture
async def playwright() -> AsyncGenerator[Playwright, None]:
    """Yield the object produced by the `async_playwright()` context manager for the duration of a test."""
    async with async_playwright() as playwright:
        yield playwright


@pytest.fixture
async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
    """Launch a Chromium browser for the test and close it afterwards."""
    browser = await playwright.chromium.launch()
    yield browser
    await browser.close()


@pytest.fixture
async def controller(browser: Browser) -> AsyncGenerator[PlaywrightBrowserController, None]:
    """Provide a controller limited to two open pages per browser; it is closed after the test."""
    controller = PlaywrightBrowserController(browser, max_open_pages_per_browser=2)
    yield controller
    await controller.close()


async def test_initial_state(browser:
Browser) -> None:
    controller = PlaywrightBrowserController(browser)

    # Test initial state
    assert controller.pages == []
    assert controller.pages_count == 0
    assert isinstance(controller.last_page_opened_at, datetime)
    assert controller.idle_time < timedelta(seconds=1)
    assert controller.has_free_capacity


@pytest.mark.run_alone
async def test_open_and_close_page(controller: PlaywrightBrowserController, server_url: URL) -> None:
    """A page is tracked by the controller while it is open and dropped once it is closed."""
    page = await controller.new_page()
    await page.goto(str(server_url))

    assert page in controller.pages
    assert controller.pages_count == 1
    assert controller.last_page_opened_at <= datetime.now(timezone.utc)

    await page.close()

    assert page not in controller.pages
    assert controller.pages_count == 0


async def test_max_open_pages_limit(controller: PlaywrightBrowserController) -> None:
    """Opening a third page fails with the two-page limit; closing a page frees a slot again."""
    page1 = await controller.new_page()
    assert controller.pages_count == 1

    page2 = await controller.new_page()
    assert controller.pages_count == 2

    with pytest.raises(ValueError, match=r'Cannot open more pages in this browser.'):
        await controller.new_page()

    # The rejected attempt must not have changed the page count.
    assert controller.pages_count == 2

    await page1.close()
    assert controller.pages_count == 1

    page3 = await controller.new_page()
    assert controller.pages_count == 2

    await page2.close()
    await page3.close()

    assert controller.pages == []
    assert controller.pages_count == 0


async def test_idle_time(controller: PlaywrightBrowserController) -> None:
    """`idle_time` grows while no page is opened."""
    idle_time_before = controller.idle_time
    await asyncio.sleep(1)  # Simulate waiting
    idle_time_after = controller.idle_time
    assert idle_time_after > idle_time_before


async def test_close_browser_with_open_pages(browser: Browser) -> None:
    """`close()` refuses to run while pages are open unless `force=True` is passed."""
    controller = PlaywrightBrowserController(browser, max_open_pages_per_browser=2)
    _ = await controller.new_page()

    with pytest.raises(ValueError, match=r'Cannot close the browser while there are open pages.'):
        await controller.close()

    assert controller.pages_count == 1
    assert controller.is_browser_connected

    await controller.close(force=True)

    assert
controller.pages_count == 0
    assert not controller.is_browser_connected


async def test_memory_leak_on_concurrent_context_creation() -> None:
    """Test that only one browser context is created when multiple pages are opened concurrently."""

    # Prepare mocked browser with relevant methods and attributes
    mocked_browser = AsyncMock()
    mocked_context_launcher = AsyncMock()
    mocked_context = AsyncMock(spec=BrowserContext)

    mocked_context_launcher.return_value = mocked_context
    mocked_context.new_page.return_value = AsyncMock(spec=Page)

    async def delayed_launch_persistent_context(*args: Any, **kwargs: Any) -> Any:
        """Ensure that both calls to create context overlap in time."""
        await asyncio.sleep(5)  # Simulate delay in creation to make sure race condition happens
        return await mocked_context_launcher(*args, **kwargs)

    mocked_browser.launch_persistent_context = delayed_launch_persistent_context

    # Create minimal instance of PlaywrightBrowserController with mocked browser
    controller = PlaywrightBrowserController(
        PlaywrightPersistentBrowser(mocked_browser, None, {}), header_generator=None, fingerprint_generator=None
    )

    # Both calls will try to create browser context at the same time, but only one context should be created.
    await asyncio.gather(controller.new_page(), controller.new_page())

    # The launcher mock counts how many times a persistent context was actually requested.
    assert mocked_context_launcher.call_count == 1


async def test_max_open_pages_limit_on_concurrent_creation(controller: PlaywrightBrowserController) -> None:
    """Two pages requested concurrently both open when the limit is two."""
    pages = await asyncio.gather(controller.new_page(), controller.new_page())
    assert controller.pages_count == 2
    for page in pages:
        await page.close()


async def test_max_open_pages_limit_error_on_concurrent_creation(controller: PlaywrightBrowserController) -> None:
    """Test that max open pages limit is respected during concurrent page creation."""
    with pytest.raises(ValueError, match=r'Cannot open more pages in this browser.'):
        await asyncio.gather(controller.new_page(), controller.new_page(), controller.new_page())


async def test_browser_with_pre_existing_context(tmp_path: Path) -> None:
    """Test that using `Browser` with pre-existing active context re-uses such context."""
    async with async_playwright() as pw:
        persistent_context = await pw.firefox.launch_persistent_context(
            user_data_dir=str(tmp_path),
            headless=True,
        )
        browser = persistent_context.browser
        # `browser` may be None per its type; the controller below needs a real instance.
        assert browser
        controller = PlaywrightBrowserController(browser=browser)
        page_1 = await controller.new_page()
        page_2 = await controller.new_page()
        assert page_1.context == page_2.context == persistent_context


================================================
FILE: tests/unit/browsers/test_playwright_browser_plugin.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from crawlee.browsers import PlaywrightBrowserPlugin

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from yarl import URL


@pytest.fixture
async def plugin() -> AsyncGenerator[PlaywrightBrowserPlugin, None]:
    """Provide a default `PlaywrightBrowserPlugin` entered as an async context manager."""
    async with PlaywrightBrowserPlugin() as plugin:
        yield plugin


async def test_initial_state() -> None:
    plugin = PlaywrightBrowserPlugin(
        browser_type='chromium',
        browser_launch_options={'headless': False},
browser_new_context_options={'viewport': {'width': 1920, 'height': 1080}},
        max_open_pages_per_browser=10,
    )

    # Test initial state
    assert plugin.browser_type == 'chromium'
    assert 'headless' in plugin.browser_launch_options
    assert plugin.browser_launch_options['headless'] is False
    assert plugin.browser_new_context_options == {'viewport': {'width': 1920, 'height': 1080}}
    assert plugin.max_open_pages_per_browser == 10


async def test_new_browser(plugin: PlaywrightBrowserPlugin, server_url: URL) -> None:
    """A browser created by the plugin is connected, can open a page, and disconnects on close."""
    browser_controller = await plugin.new_browser()

    assert browser_controller.is_browser_connected

    page = await browser_controller.new_page()
    await page.goto(str(server_url))

    await page.close()
    await browser_controller.close()

    assert not browser_controller.is_browser_connected


async def test_multiple_new_browsers(plugin: PlaywrightBrowserPlugin) -> None:
    """Each `new_browser()` call returns a distinct controller object."""
    browser_controller_1 = await plugin.new_browser()
    browser_controller_2 = await plugin.new_browser()

    assert browser_controller_1 is not browser_controller_2


async def test_methods_raise_error_when_not_active() -> None:
    """`new_browser()` needs an active plugin, and the plugin cannot be entered twice at once."""
    plugin = PlaywrightBrowserPlugin()

    assert plugin.active is False

    with pytest.raises(RuntimeError, match=r'Plugin is not active'):
        await plugin.new_browser()

    with pytest.raises(RuntimeError, match=r'Plugin is already active.'):
        # Entering the same plugin twice in one `async with` triggers the "already active" error.
        async with plugin, plugin:
            pass

    async with plugin:
        assert plugin.active is True


# NOTE(review): this coroutine's name lacks the `test_` prefix, so under pytest's default
# `python_functions` discovery pattern it is presumably never collected or run. Confirm against the
# project's pytest configuration; if it is meant to execute, rename it and re-check that the expected
# message below still matches the error raised by `PlaywrightBrowserPlugin`.
async def raise_error_if_chrome_and_executable_path() -> None:
    with pytest.raises(
        ValueError, match=r'Cannot use `use_chrome` with `Configuration.default_browser_path` or `executable_path` set.'
): PlaywrightBrowserPlugin( browser_type='chrome', browser_launch_options={'executable_path': '/path/to/chrome'}, ) ================================================ FILE: tests/unit/conftest.py ================================================ from __future__ import annotations import logging import os import warnings from typing import TYPE_CHECKING, Any, cast import pytest from curl_cffi import CurlHttpVersion from fakeredis import FakeAsyncRedis from proxy import Proxy from uvicorn.config import Config from crawlee import service_locator from crawlee.crawlers import BasicCrawler from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_network from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient, ImpitHttpClient from crawlee.proxy_configuration import ProxyInfo from crawlee.statistics import Statistics from crawlee.storages import KeyValueStore from tests.unit.server import TestServer, app, serve_in_thread if TYPE_CHECKING: from collections.abc import AsyncGenerator, Callable, Iterator from pathlib import Path from yarl import URL from crawlee.http_clients._base import HttpClient @pytest.fixture(autouse=True) async def suppress_user_warning() -> AsyncGenerator[None, None]: """Suppress user warnings during tests. Mostly to suppress warnings about the experimental status of the SqlStorageClient. """ with warnings.catch_warnings(): warnings.simplefilter('ignore', UserWarning) yield @pytest.fixture def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callable[[], None]: """Prepare the testing environment by resetting the global state before each test. This fixture ensures that the global state of the package is reset to a known baseline before each test runs. It also configures a temporary storage directory for test isolation. Args: monkeypatch: Test utility provided by pytest for patching. tmp_path: A unique temporary directory path provided by pytest for test isolation. 
Returns: A callable that prepares the test environment. """ def _prepare_test_env() -> None: # Disable the browser sandbox by setting the environment variable. This is required for running # Playwright tests in the CI environment, where the sandbox is not supported. monkeypatch.setenv('CRAWLEE_DISABLE_BROWSER_SANDBOX', 'true') # Set the environment variable for the local storage directory to the temporary path. monkeypatch.setenv('CRAWLEE_STORAGE_DIR', str(tmp_path)) # Reset the services in the service locator. service_locator._configuration = None service_locator._event_manager = None service_locator._storage_client = None service_locator.storage_instance_manager.clear_cache() # Verify that the test environment was set up correctly. assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path) # Reset global class variables to ensure test isolation. KeyValueStore._autosaved_values = {} Statistics._Statistics__next_id = 0 # type:ignore[attr-defined] # Mangled attribute BasicCrawler._BasicCrawler__next_id = 0 # type:ignore[attr-defined] # Mangled attribute return _prepare_test_env @pytest.fixture(autouse=True) def _isolate_test_environment(prepare_test_env: Callable[[], None]) -> None: """Isolate the testing environment by resetting global state before and after each test. This fixture ensures that each test starts with a clean slate and that any modifications during the test do not affect subsequent tests. It runs automatically for all tests. Args: prepare_test_env: Fixture to prepare the environment before each test. 
""" prepare_test_env() @pytest.fixture(autouse=True) def _set_crawler_log_level(pytestconfig: pytest.Config, monkeypatch: pytest.MonkeyPatch) -> None: from crawlee import _log_config # noqa: PLC0415 loglevel = cast('str | None', pytestconfig.getoption('--log-level')) if loglevel is not None: monkeypatch.setattr(_log_config, 'get_configured_log_level', lambda: getattr(logging, loglevel.upper())) @pytest.fixture async def proxy_info(unused_tcp_port: int) -> ProxyInfo: username = 'user' password = 'pass' return ProxyInfo( url=f'http://{username}:{password}@127.0.0.1:{unused_tcp_port}', scheme='http', hostname='127.0.0.1', port=unused_tcp_port, username=username, password=password, ) @pytest.fixture async def proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyInfo, None]: with Proxy( [ '--hostname', proxy_info.hostname, '--port', str(proxy_info.port), '--basic-auth', f'{proxy_info.username}:{proxy_info.password}', ] ): yield proxy_info @pytest.fixture async def disabled_proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyInfo, None]: with Proxy( [ '--hostname', proxy_info.hostname, '--port', str(proxy_info.port), '--basic-auth', f'{proxy_info.username}:{proxy_info.password}', '--disable-http-proxy', ] ): yield proxy_info @pytest.fixture(scope='session') def header_network() -> dict: return get_available_header_network() @pytest.fixture async def key_value_store() -> AsyncGenerator[KeyValueStore, None]: kvs = await KeyValueStore.open() yield kvs await kvs.drop() @pytest.fixture(scope='session') def http_server(unused_tcp_port_factory: Callable[[], int]) -> Iterator[TestServer]: """Create and start an HTTP test server.""" config = Config(app=app, lifespan='off', loop='asyncio', port=unused_tcp_port_factory()) server = TestServer(config=config) yield from serve_in_thread(server) @pytest.fixture(scope='session') def server_url(http_server: TestServer) -> URL: """Provide the base URL of the test server.""" return http_server.url # It is needed only in some tests, so we use 
the standard `scope=function` @pytest.fixture def redirect_http_server(unused_tcp_port_factory: Callable[[], int]) -> Iterator[TestServer]: """Create and start an HTTP test server.""" config = Config( app=app, lifespan='off', loop='asyncio', port=unused_tcp_port_factory(), limit_max_requests=100, timeout_graceful_shutdown=10, log_level='error', access_log=False, ws='websockets-sansio', ) server = TestServer(config=config) yield from serve_in_thread(server) @pytest.fixture def redirect_server_url(redirect_http_server: TestServer) -> URL: """Provide the base URL of the test server.""" return redirect_http_server.url @pytest.fixture( params=[ pytest.param('httpx', id='httpx'), pytest.param('impit', id='impit'), pytest.param('curl', id='curl'), ] ) async def http_client(request: pytest.FixtureRequest) -> AsyncGenerator[HttpClient, None]: class_client: type[HttpClient] kwargs: dict[str, Any] if request.param == 'curl': class_client = CurlImpersonateHttpClient kwargs = {'http_version': CurlHttpVersion.V1_1} elif request.param == 'impit': class_client = ImpitHttpClient kwargs = {'http3': False} else: class_client = HttpxHttpClient kwargs = {'http2': True} async with class_client(**kwargs) as client: yield client @pytest.fixture def redis_client() -> FakeAsyncRedis: return FakeAsyncRedis() ================================================ FILE: tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py ================================================ from __future__ import annotations import asyncio import logging from dataclasses import dataclass from datetime import timedelta from itertools import cycle from typing import TYPE_CHECKING, cast from unittest.mock import Mock, call, patch import pytest from bs4 import Tag from parsel import Selector from typing_extensions import override from crawlee import Request from crawlee.crawlers import ( AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightPostNavCrawlingContext, 
AdaptivePlaywrightPreNavCrawlingContext, BasicCrawler, RenderingType, RenderingTypePrediction, RenderingTypePredictor, ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( AdaptivePlaywrightCrawlerStatisticState, ) from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import ( AdaptiveContextError, ) from crawlee.sessions import SessionPool from crawlee.statistics import Statistics from crawlee.storage_clients import SqlStorageClient from crawlee.storages import KeyValueStore, RequestQueue if TYPE_CHECKING: from collections.abc import AsyncGenerator, Iterator from pathlib import Path from yarl import URL _H1_TEXT = 'Static' _H2_TEXT = 'Only in browser' _H3_CHANGED_TEXT = 'Changed by JS' _INJECTED_JS_DELAY_MS = 100 _PAGE_CONTENT_STATIC = f"""

{_H1_TEXT}

Initial text

""" @pytest.fixture def test_urls(server_url: URL) -> list[str]: """Example pages used in the test are mocked for static requests.""" return [ str(server_url.with_path('echo_content').with_query(content=_PAGE_CONTENT_STATIC)), str(server_url.with_path('echo_content').with_query(id='test2', content=_PAGE_CONTENT_STATIC)), ] @pytest.fixture async def key_value_store() -> AsyncGenerator[KeyValueStore, None]: kvs = await KeyValueStore.open() yield kvs await kvs.drop() class _SimpleRenderingTypePredictor(RenderingTypePredictor): """Simplified predictor for tests.""" def __init__( self, rendering_types: Iterator[RenderingType] | None = None, detection_probability_recommendation: None | Iterator[float] = None, ) -> None: super().__init__() self._rendering_types = rendering_types or cycle(['static']) self._detection_probability_recommendation = detection_probability_recommendation or cycle([1]) @override def predict(self, request: Request) -> RenderingTypePrediction: return RenderingTypePrediction(next(self._rendering_types), next(self._detection_probability_recommendation)) @override def store_result(self, request: Request, rendering_type: RenderingType) -> None: pass @dataclass(frozen=True) class TestInput: __test__ = False expected_pw_count: int expected_static_count: int rendering_types: Iterator[RenderingType] detection_probability_recommendation: Iterator[float] @pytest.mark.parametrize( 'test_input', [ pytest.param( TestInput( expected_pw_count=0, expected_static_count=2, # Lack of ty support, see https://github.com/astral-sh/ty/issues/2348. 
rendering_types=cycle(['static']),
                detection_probability_recommendation=cycle([0]),
            ),
            id='Static only',
        ),
        pytest.param(
            TestInput(
                expected_pw_count=2,
                expected_static_count=0,
                rendering_types=cycle(['client only']),
                detection_probability_recommendation=cycle([0]),
            ),
            id='Client only',
        ),
        pytest.param(
            TestInput(
                expected_pw_count=1,
                expected_static_count=1,
                rendering_types=cycle(['static', 'client only']),
                detection_probability_recommendation=cycle([0]),
            ),
            id='Mixed',
        ),
        pytest.param(
            TestInput(
                expected_pw_count=2,
                expected_static_count=2,
                rendering_types=cycle(['static', 'client only']),
                detection_probability_recommendation=cycle([1]),
            ),
            id='Enforced rendering type detection',
        ),
    ],
)
async def test_adaptive_crawling(
    test_input: TestInput,
    test_urls: list[str],
) -> None:
    """Tests correct routing to pre-nav hooks and correct handling through proper handler."""
    predictor = _SimpleRenderingTypePredictor(
        rendering_types=test_input.rendering_types,
        detection_probability_recommendation=test_input.detection_probability_recommendation,
    )

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=predictor,
    )

    # Counters split by which kind of context reached the handler / the hook.
    pw_handler_count = 0
    static_handler_count = 0

    pw_hook_count = 0
    static_hook_count = 0

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        nonlocal pw_handler_count
        nonlocal static_handler_count

        try:
            # page is available only if it was crawled by PlaywrightCrawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
            pw_handler_count += 1
        except AdaptiveContextError:
            static_handler_count += 1

    @crawler.pre_navigation_hook
    async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # `context` is only probed for its `page` attribute to tell which kind of context this is.
        nonlocal static_hook_count
        nonlocal pw_hook_count

        try:
            # page is available only if it was crawled by PlaywrightCrawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
            pw_hook_count += 1
        except AdaptiveContextError:
            static_hook_count += 1

    await crawler.run(test_urls)

    assert pw_handler_count == test_input.expected_pw_count
    assert pw_hook_count == test_input.expected_pw_count

    assert static_handler_count == test_input.expected_static_count
    assert static_hook_count == test_input.expected_static_count


async def test_adaptive_crawling_parsel(test_urls: list[str]) -> None:
    """Top level test for parsel. Only one argument combination. (The rest of code is tested with bs variant.)"""
    predictor = _SimpleRenderingTypePredictor(
        rendering_types=cycle(['static', 'client only']),
        detection_probability_recommendation=cycle([0]),
    )

    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        rendering_type_predictor=predictor,
    )

    pw_handler_count = 0
    static_handler_count = 0

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        nonlocal pw_handler_count
        nonlocal static_handler_count

        try:
            # page is available only if it was crawled by PlaywrightCrawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
            pw_handler_count += 1
        except AdaptiveContextError:
            static_handler_count += 1

    await crawler.run(test_urls)

    # The predictor alternates 'static' / 'client only', so each of the two URLs takes a different path.
    assert pw_handler_count == 1
    assert static_handler_count == 1


async def test_adaptive_crawling_pre_nav_change_to_context(test_urls: list[str]) -> None:
    """Tests that context can be modified in pre-navigation hooks."""
    static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor()

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        rendering_type_predictor=static_only_predictor_enforce_detection,
    )
    user_data_in_pre_nav_hook = []
    user_data_in_handler = []

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        user_data_in_handler.append(context.request.user_data.get('data', None))

    @crawler.pre_navigation_hook
    async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # Record what the hook sees first, then tag the request with the kind of context that ran it.
        user_data_in_pre_nav_hook.append(context.request.user_data.get('data', None))
        try:
            # page is available only if it was crawled by PlaywrightCrawler.
            context.page  # noqa:B018 Intentionally "useless expression". Can trigger exception.
            context.request.user_data['data'] = 'pw'
        except AdaptiveContextError:
            context.request.user_data['data'] = 'bs'

    await crawler.run(test_urls[:1])
    # Check that repeated pre nav hook invocations do not influence each other while probing
    assert user_data_in_pre_nav_hook == [None, None]
    # Check that the request handler sees changes to user data done by pre nav hooks
    assert user_data_in_handler == ['pw', 'bs']


async def test_playwright_only_pre_navigation_hook(test_urls: list[str]) -> None:
    """Test that hook can be registered for playwright only sub crawler.

    Create a situation where one page is crawled by both sub crawlers.
One common pre navigation hook is registered and one playwright only pre navigation hook is registered.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_enforce_detection, ) pre_nav_hook_common = Mock() pre_nav_hook_playwright = Mock() @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: pass @crawler.pre_navigation_hook async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: pre_nav_hook_common(context.request.url) @crawler.pre_navigation_hook(playwright_only=True) async def pre_nav_hook_pw_only(context: AdaptivePlaywrightPreNavCrawlingContext) -> None: pre_nav_hook_playwright(context.page.url) await crawler.run(test_urls[:1]) # Default behavior. Hook is called every time, both static sub crawler and playwright sub crawler. pre_nav_hook_common.assert_has_calls([call(test_urls[0]), call(test_urls[0])]) # Hook is called only by playwright sub crawler. 
pre_nav_hook_playwright.assert_called_once_with('about:blank') async def test_adaptive_crawling_post_nav_change_to_context(test_urls: list[str]) -> None: """Tests that context can be modified in post-navigation hooks.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_enforce_detection, ) user_data_in_post_nav_hook = [] user_data_in_handler = [] @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: user_data_in_handler.append(context.request.user_data.get('data', None)) @crawler.post_navigation_hook async def post_nav_hook(context: AdaptivePlaywrightPostNavCrawlingContext) -> None: user_data_in_post_nav_hook.append(context.request.user_data.get('data', None)) try: # page is available only if it was crawled by PlaywrightCrawler. context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. context.request.user_data['data'] = 'pw' except AdaptiveContextError: context.request.user_data['data'] = 'bs' await crawler.run(test_urls[:1]) # Check that repeated post nav hook invocations do not influence each other while probing assert user_data_in_post_nav_hook == [None, None] # Check that the request handler sees changes to user data done by post nav hooks assert user_data_in_handler == ['pw', 'bs'] async def test_playwright_only_post_navigation_hook(test_urls: list[str]) -> None: """Test that hook can be registered for playwright only sub crawler. Create a situation where one page is crawled by both sub crawlers. 
One common post navigation hook is registered and one playwright only post navigation hook is registered.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_enforce_detection, ) post_nav_hook_common = Mock() post_nav_hook_playwright = Mock() @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: pass @crawler.post_navigation_hook async def post_nav_hook(context: AdaptivePlaywrightPostNavCrawlingContext) -> None: post_nav_hook_common(context.request.url) @crawler.post_navigation_hook(playwright_only=True) async def post_nav_hook_pw_only(context: AdaptivePlaywrightPostNavCrawlingContext) -> None: post_nav_hook_playwright(context.page.url) await crawler.run(test_urls[:1]) # Default behavior. Hook is called every time, both static sub crawler and playwright sub crawler. post_nav_hook_common.assert_has_calls([call(test_urls[0]), call(test_urls[0])]) # Hook is called only by playwright sub crawler. post_nav_hook_playwright.assert_called_once_with(test_urls[0]) async def test_adaptive_crawling_result(test_urls: list[str]) -> None: """Tests that result only from one sub crawler is saved. Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_enforce_detection, ) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: try: # page is available only if it was crawled by PlaywrightCrawler. context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. 
await context.push_data({'handler': 'pw'}) except AdaptiveContextError: await context.push_data({'handler': 'bs'}) await crawler.run(test_urls[:1]) # Enforced rendering type detection will trigger both sub crawlers, but only pw crawler result is saved. assert (await crawler.get_data()).items == [{'handler': 'pw'}] @pytest.mark.parametrize( ('pw_saved_data', 'static_saved_data', 'expected_result_rendering_type'), [ pytest.param({'some': 'data'}, {'some': 'data'}, 'static', id='Same results from sub crawlers'), pytest.param({'some': 'data'}, {'different': 'data'}, 'client only', id='Different results from sub crawlers'), ], ) async def test_adaptive_crawling_predictor_calls( pw_saved_data: dict[str, str], static_saved_data: dict[str, str], expected_result_rendering_type: RenderingType, test_urls: list[str], ) -> None: """Tests expected predictor calls. Same results.""" some_label = 'bla' some_url = test_urls[0] static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() requests = [Request.from_url(url=some_url, label=some_label)] crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_enforce_detection, ) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: try: # page is available only if it was crawled by PlaywrightCrawler. context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. 
await context.push_data(pw_saved_data) except AdaptiveContextError: await context.push_data(static_saved_data) with ( patch.object(static_only_predictor_enforce_detection, 'store_result', Mock()) as mocked_store_result, patch.object( static_only_predictor_enforce_detection, 'predict', Mock(return_value=RenderingTypePrediction('static', 1)) ) as mocked_predict, ): await crawler.run(requests) assert mocked_predict.call_count == 1 assert mocked_predict.call_args[0][0].url == requests[0].url # If `static` and `client only` results are same, `store_result` should be called with `static`. mocked_store_result.assert_called_once_with(mocked_predict.call_args[0][0], expected_result_rendering_type) async def test_adaptive_crawling_result_use_state_isolation( key_value_store: KeyValueStore, test_urls: list[str] ) -> None: """Tests that global state accessed through `use_state` is changed only by one sub crawler. Enforced rendering type detection to run both sub crawlers.""" static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor() crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_enforce_detection, ) await key_value_store.set_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0', {'counter': 0}) request_handler_calls = 0 @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: nonlocal request_handler_calls state = cast('dict[str, int]', await context.use_state()) request_handler_calls += 1 state['counter'] += 1 await crawler.run(test_urls[:1]) await key_value_store.persist_autosaved_values() # Request handler was called twice assert request_handler_calls == 2 # Increment of global state happened only once assert (await key_value_store.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0'))['counter'] == 1 async def test_adaptive_crawling_statistics(test_urls: list[str]) -> None: """Test adaptive crawler statistics. 
Crawler set to static crawling, but due to result_checker returning False on static crawling result it will do browser crawling instead as well. This increments all three adaptive crawling related stats.""" static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_predictor_no_detection, result_checker=lambda result: False, # noqa: ARG005 # Intentionally unused argument. ) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: pass await crawler.run(test_urls[:1]) assert crawler.statistics.state.http_only_request_handler_runs == 1 assert crawler.statistics.state.browser_request_handler_runs == 1 assert crawler.statistics.state.rendering_type_mispredictions == 1 # Despite running both sub crawlers the top crawler statistics should count this as one request finished. assert crawler.statistics.state.requests_finished == 1 assert crawler.statistics.state.requests_failed == 0 @pytest.mark.parametrize( 'error_in_pw_crawler', [ pytest.param(False, id='Error only in static sub crawler'), pytest.param(True, id='Error in both sub crawlers'), ], ) async def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_pw_crawler: bool, test_urls: list[str]) -> None: """Test that correct results are committed when exceptions are raised in sub crawlers. Exception in bs sub crawler will be logged and pw sub crawler used instead. Any result from bs sub crawler will be discarded, result from pw crawler will be saved instead. (But global state modifications through `use_state` will not be reverted!!!) Exception in pw sub crawler will prevent any result from being committed.
Even if `push_data` was called before the exception """ static_only_no_detection_predictor = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=static_only_no_detection_predictor, ) saved_data = {'some': 'data'} @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: try: # page is available only if it was crawled by PlaywrightCrawler. context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. await context.push_data(saved_data) if error_in_pw_crawler: raise RuntimeError('Some pw sub crawler related error') except AdaptiveContextError: await context.push_data({'this': 'data should not be saved'}) raise RuntimeError('Some bs sub crawler related error') from None await crawler.run(test_urls[:1]) dataset = await crawler.get_dataset() stored_results = [item async for item in dataset.iterate_items()] if error_in_pw_crawler: assert stored_results == [] else: assert stored_results == [saved_data] async def test_adaptive_playwright_crawler_statistics_in_init() -> None: """Tests that adaptive crawler uses created AdaptivePlaywrightCrawlerStatistics from inputted Statistics.""" persistence_enabled = True persist_state_kvs_name = 'some-name' persist_state_key = 'come key' log_message = 'some message' periodic_message_logger = logging.getLogger('some logger') log_interval = timedelta(minutes=2) statistics = Statistics.with_default_state( persistence_enabled=persistence_enabled, persist_state_kvs_name=persist_state_kvs_name, persist_state_key=persist_state_key, log_message=log_message, periodic_message_logger=periodic_message_logger, log_interval=log_interval, ) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(statistics=statistics) await crawler.run([]) # ensure that statistics get initialized assert type(crawler._statistics.state) is 
AdaptivePlaywrightCrawlerStatisticState assert crawler._statistics._state._persistence_enabled == persistence_enabled assert crawler._statistics._state._persist_state_key == persist_state_key assert crawler._statistics._log_message == log_message assert crawler._statistics._periodic_message_logger == periodic_message_logger async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_urls: list[str]) -> None: """Tests that timeout in static sub crawler forces fall back to browser sub crawler. Create situation where static sub crawler blocks(should time out), such error should start browser sub crawler. """ static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) # Use a generous timeout so the static pipeline has enough time to reach the handler even on slow CI. # The handler will block indefinitely, so the timeout will always fire during the handler's wait. request_handler_timeout = timedelta(seconds=10) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_request_retries=0, rendering_type_predictor=static_only_predictor_no_detection, request_handler_timeout=request_handler_timeout, ) mocked_static_handler = Mock(name='static_handler') mocked_browser_handler = Mock(name='browser_handler') @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: try: # page is available only if it was crawled by PlaywrightCrawler. context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. mocked_browser_handler() except AdaptiveContextError: mocked_static_handler() # Relax timeout for the fallback browser request to allow for slow browser startup on CI crawler._request_handler_timeout = timedelta(seconds=120) # Block indefinitely - will be cancelled when the request_handler_timeout fires. 
await asyncio.Event().wait() await crawler.run(test_urls[:1]) mocked_static_handler.assert_called_once_with() # Browser handler was capable of running despite static handler blocking longer than the handler timeout. mocked_browser_handler.assert_called_once_with() async def test_adaptive_playwright_crawler_default_predictor(test_urls: list[str]) -> None: """Test default rendering type predictor integration into crawler.""" crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser() mocked_static_handler = Mock() mocked_browser_handler = Mock() @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: try: # page is available only if it was crawled by PlaywrightCrawler. context.page # noqa:B018 Intentionally "useless expression". Can trigger exception. mocked_browser_handler() except AdaptiveContextError: mocked_static_handler() await crawler.run(test_urls[:1]) # First prediction should trigger rendering type detection as the predictor does not have any data for prediction. mocked_static_handler.assert_called_once_with() mocked_browser_handler.assert_called_once_with() async def test_adaptive_context_query_selector_beautiful_soup(test_urls: list[str]) -> None: """Test that `context.query_selector_one` works regardless of the crawl type for BeautifulSoup variant. Handler tries to locate two elements h1 and h2. h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html. Create situation where page is crawled with static sub crawler first. Static sub crawler should be able to locate only h1. It will try to wait for h2, trying to wait for h2 will trigger `AdaptiveContextError` which will force the adaptive crawler to try playwright sub crawler instead. 
Playwright sub crawler is able to wait for the h2 element.""" # Get page with injected JS code that will add some element after timeout static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_request_retries=1, rendering_type_predictor=static_only_predictor_no_detection, ) mocked_h1_handler = Mock() mocked_h2_handler = Mock() @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: h1 = await context.query_selector_one('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) mocked_h1_handler(h1) h2 = await context.query_selector_one('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)) mocked_h2_handler(h2) await crawler.run(test_urls[:1]) expected_h1_tag = Tag(name='h1') expected_h1_tag.append(_H1_TEXT) expected_h2_tag = Tag(name='h2') expected_h2_tag.append(_H2_TEXT) # Called by both sub crawlers mocked_h1_handler.assert_has_calls([call(expected_h1_tag), call(expected_h1_tag)]) # Called only by pw sub crawler mocked_h2_handler.assert_has_calls([call(expected_h2_tag)]) @pytest.mark.flaky( rerun=3, reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1650.', ) async def test_adaptive_context_query_selector_parsel(test_urls: list[str]) -> None: """Test that `context.query_selector_one` works regardless of the crawl type for Parsel variant. Handler tries to locate two elements h1 and h2. h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html. Create situation where page is crawled with static sub crawler first. Static sub crawler should be able to locate only h1. It will try to wait for h2, trying to wait for h2 will trigger `AdaptiveContextError` which will force the adaptive crawler to try playwright sub crawler instead. 
Playwright sub crawler is able to wait for the h2 element.""" # Get page with injected JS code that will add some element after timeout static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) expected_h1_tag = f'

<h1>{_H1_TEXT}</h1>

' expected_h2_tag = f'

<h2>{_H2_TEXT}</h2>

' crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( max_request_retries=1, rendering_type_predictor=static_only_predictor_no_detection, ) mocked_h1_handler = Mock() mocked_h2_handler = Mock() @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: if h1 := await context.query_selector_one('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)): mocked_h1_handler(type(h1), h1.get()) if h2 := await context.query_selector_one('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)): mocked_h2_handler(type(h2), h2.get()) await crawler.run(test_urls[:1]) # Called by both sub crawlers mocked_h1_handler.assert_has_calls([call(Selector, expected_h1_tag), call(Selector, expected_h1_tag)]) # Called only by pw sub crawler mocked_h2_handler.assert_has_calls([call(Selector, expected_h2_tag)]) async def test_adaptive_context_parse_with_static_parser_parsel(test_urls: list[str]) -> None: """Test `context.parse_with_static_parser` works regardless of the crawl type for Parsel variant. (Test covers also `context.wait_for_selector`, which is called by `context.parse_with_static_parser`) """ static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) expected_h2_tag = f'

<h2>{_H2_TEXT}</h2>

' crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( max_request_retries=1, rendering_type_predictor=static_only_predictor_no_detection, ) mocked_h2_handler = Mock() @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: h2_static = context.parsed_content.css('h2') # Should not find anything mocked_h2_handler(h2_static) # Reparse whole page after h2 appears parsed_content_after_h2_appeared = await context.parse_with_static_parser( selector='h2', timeout=timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2) ) mocked_h2_handler(parsed_content_after_h2_appeared.css('h2')[0].get()) await crawler.run(test_urls[:1]) mocked_h2_handler.assert_has_calls( [ call([]), # Static sub crawler tried and did not find h2. call([]), # Playwright sub crawler tried and did not find h2 without waiting. call(expected_h2_tag), # Playwright waited for h2 to appear. ] ) async def test_adaptive_context_helpers_on_changed_selector(test_urls: list[str]) -> None: """Test that context helpers work on latest version of the page. Scenario where page is changed after a while. H2 element is added and text of H3 element is modified. Test that context helpers automatically work on latest version of the page by reading H3 element and expecting its dynamically changed text instead of the original static text. """ browser_only_predictor_no_detection = _SimpleRenderingTypePredictor( rendering_types=cycle(['client only']), detection_probability_recommendation=cycle([0]), ) expected_h3_tag = f'

<h3>{_H3_CHANGED_TEXT}</h3>

' crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( max_request_retries=1, rendering_type_predictor=browser_only_predictor_no_detection, ) mocked_h3_handler = Mock() @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: await context.query_selector_one('h2') # Wait for change that is indicated by appearance of h2 element. if h3 := await context.query_selector_one('h3'): mocked_h3_handler(h3.get()) # Get updated h3 element. await crawler.run(test_urls[:1]) mocked_h3_handler.assert_called_once_with(expected_h3_tag) async def test_adaptive_context_query_non_existing_element(test_urls: list[str]) -> None: """Test that querying non-existing selector returns `None`""" browser_only_predictor_no_detection = _SimpleRenderingTypePredictor( rendering_types=cycle(['client only']), detection_probability_recommendation=cycle([0]), ) crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( max_request_retries=1, rendering_type_predictor=browser_only_predictor_no_detection, ) mocked_h3_handler = Mock() @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: mocked_h3_handler(await context.query_selector_one('non sense selector', timeout=timedelta(milliseconds=1))) await crawler.run(test_urls[:1]) mocked_h3_handler.assert_called_once_with(None) @pytest.mark.parametrize( 'test_input', [ pytest.param( TestInput( expected_pw_count=0, expected_static_count=2, rendering_types=cycle(['static']), detection_probability_recommendation=cycle([0]), ), id='Static only', ), pytest.param( TestInput( expected_pw_count=2, expected_static_count=0, rendering_types=cycle(['client only']), detection_probability_recommendation=cycle([0]), ), id='Client only', ), pytest.param( TestInput( expected_pw_count=2, expected_static_count=2, rendering_types=cycle(['static', 'client only']), detection_probability_recommendation=cycle([1]), ), id='Enforced rendering type detection', ), 
], ) async def test_change_context_state_after_handling(test_input: TestInput, server_url: URL) -> None: """Test that context state is saved after handling the request.""" predictor = _SimpleRenderingTypePredictor( rendering_types=test_input.rendering_types, detection_probability_recommendation=test_input.detection_probability_recommendation, ) request_queue = await RequestQueue.open(name='state-test') used_session_id = None async with SessionPool() as session_pool: crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( rendering_type_predictor=predictor, session_pool=session_pool, request_manager=request_queue, ) @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: nonlocal used_session_id if context.session is not None: used_session_id = context.session.id context.session.user_data['session_state'] = True if isinstance(context.request.user_data['request_state'], list): context.request.user_data['request_state'].append('handler') request = Request.from_url(str(server_url), user_data={'request_state': ['initial']}) await crawler.run([request]) assert used_session_id is not None session = await session_pool.get_session_by_id(used_session_id) check_request = await request_queue.get_request(request.unique_key) assert session is not None assert check_request is not None assert session.user_data.get('session_state') is True # Check that request user data was updated in the handler and only once.
assert check_request.user_data.get('request_state') == ['initial', 'handler'] await request_queue.drop() async def test_adaptive_playwright_crawler_with_sql_storage(test_urls: list[str], tmp_path: Path) -> None: """Tests that AdaptivePlaywrightCrawler can be initialized with SqlStorageClient.""" storage_dir = tmp_path / 'test_table.db' async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client: crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( storage_client=storage_client, ) mocked_handler = Mock() @crawler.router.default_handler async def request_handler(_context: AdaptivePlaywrightCrawlingContext) -> None: mocked_handler() await crawler.run(test_urls[:1]) mocked_handler.assert_called() ================================================ FILE: tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py ================================================ from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import ( AdaptivePlaywrightCrawlerStatisticState, ) from crawlee.statistics import Statistics async def test_predictor_state_persistence() -> None: """Test that adaptive statistics can be correctly persisted and initialized from persisted values.""" async with Statistics( state_model=AdaptivePlaywrightCrawlerStatisticState, persistence_enabled=True ) as adaptive_statistics: adaptive_statistics.state.browser_request_handler_runs = 1 adaptive_statistics.state.rendering_type_mispredictions = 2 adaptive_statistics.state.http_only_request_handler_runs = 3 persistence_state_key = adaptive_statistics._state._persist_state_key # adaptive_statistics are persisted after leaving the context # new_adaptive_statistics are initialized from the persisted values. 
async with Statistics( state_model=AdaptivePlaywrightCrawlerStatisticState, persistence_enabled=True, persist_state_key=persistence_state_key, ) as new_adaptive_statistics: pass assert new_adaptive_statistics.state.browser_request_handler_runs == 1 assert new_adaptive_statistics.state.rendering_type_mispredictions == 2 assert new_adaptive_statistics.state.http_only_request_handler_runs == 3 ================================================ FILE: tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py ================================================ ================================================ FILE: tests/unit/crawlers/_adaptive_playwright/test_predictor.py ================================================ from __future__ import annotations import pytest from crawlee import Request from crawlee.crawlers._adaptive_playwright._rendering_type_predictor import ( DefaultRenderingTypePredictor, RenderingType, calculate_url_similarity, get_url_components, ) from crawlee.storages import KeyValueStore @pytest.mark.parametrize('label', ['some label', None]) @pytest.mark.parametrize( ('url', 'expected_prediction'), [ ('http://www.aaa.com/some/stuff/extra', 'static'), ('http://www.aab.com/some/otherstuff', 'static'), ('http://www.aac.com/some', 'static'), ('http://www.ddd.com/some/stuff/extra', 'client only'), ('http://www.dde.com/some/otherstuff', 'client only'), ('http://www.ddf.com/some', 'client only'), ], ) async def test_predictor_same_label(url: str, expected_prediction: RenderingType, label: str | None) -> None: async with DefaultRenderingTypePredictor() as predictor: learning_inputs: tuple[tuple[str, RenderingType], ...] 
= ( ('http://www.aaa.com/some/stuff', 'static'), ('http://www.aab.com/some/stuff', 'static'), ('http://www.aac.com/some/stuff', 'static'), ('http://www.ddd.com/some/stuff', 'client only'), ('http://www.dde.com/some/stuff', 'client only'), ('http://www.ddf.com/some/stuff', 'client only'), ) # Learn from small set for learned_url, rendering_type in learning_inputs: predictor.store_result(Request.from_url(url=learned_url, label=label), rendering_type=rendering_type) assert predictor.predict(Request.from_url(url=url, label=label)).rendering_type == expected_prediction async def test_predictor_new_label_increased_detection_probability_recommendation() -> None: """Test that urls of uncommon labels have increased detection recommendation. This increase should gradually drop as the predictor learns more data with this label.""" detection_ratio = 0.01 label = 'some label' async with DefaultRenderingTypePredictor(detection_ratio=detection_ratio) as predictor: # Learn first prediction of this label predictor.store_result( Request.from_url(url='http://www.aaa.com/some/stuff', label=label), rendering_type='static' ) # Increased detection_probability_recommendation prediction = predictor.predict(Request.from_url(url='http://www.aaa.com/some/stuffa', label=label)) assert prediction.rendering_type == 'static' assert prediction.detection_probability_recommendation == detection_ratio * 4 # Learn second prediction of this label predictor.store_result( Request.from_url(url='http://www.aaa.com/some/stuffe', label=label), rendering_type='static' ) # Increased detection_probability_recommendation prediction = predictor.predict(Request.from_url(url='http://www.aaa.com/some/stuffa', label=label)) assert prediction.rendering_type == 'static' assert prediction.detection_probability_recommendation == detection_ratio * 3 # Learn third prediction of this label predictor.store_result( Request.from_url(url='http://www.aaa.com/some/stuffi', label=label), rendering_type='static' ) # Increased 
detection_probability_recommendation prediction = predictor.predict(Request.from_url(url='http://www.aaa.com/some/stuffa', label=label)) assert prediction.rendering_type == 'static' assert prediction.detection_probability_recommendation == detection_ratio * 2 # Learn fourth prediction of this label. predictor.store_result( Request.from_url(url='http://www.aaa.com/some/stuffo', label=label), rendering_type='static' ) # Label considered stable now. There should be no increase of detection_probability_recommendation. prediction = predictor.predict(Request.from_url(url='http://www.aaa.com/some/stuffa', label=label)) assert prediction.rendering_type == 'static' assert prediction.detection_probability_recommendation == detection_ratio async def test_unreliable_prediction() -> None: """Test that detection_probability_recommendation for unreliable predictions is 1. Create a situation where no learning data of the new label is available for the predictor. Its first prediction is not reliable as both options have a 50% chance, so it should set the maximum detection_probability_recommendation.""" learnt_label = 'some label' async with DefaultRenderingTypePredictor() as predictor: # Learn two predictions of some label. One of each to make predictor very uncertain. predictor.store_result( Request.from_url(url='http://www.aaa.com/some/stuff', label=learnt_label), rendering_type='static' ) predictor.store_result( Request.from_url(url='http://www.aaa.com/some/otherstuff', label=learnt_label), rendering_type='client only' ) # Predict for new label. Predictor does not have enough information to give any reliable guess and should make # it clear by setting detection_probability_recommendation=1 probability = predictor.predict( Request.from_url(url='http://www.unknown.com', label='new label') ).detection_probability_recommendation assert probability == 1 async def test_no_learning_data_prediction() -> None: """Test that predictor can predict even if it never learnt anything before.
It should give some prediction, but it has to set detection_probability_recommendation=1""" async with DefaultRenderingTypePredictor() as predictor: probability = predictor.predict( Request.from_url(url='http://www.unknown.com', label='new label') ).detection_probability_recommendation assert probability == 1 async def test_persistent_no_learning_data_prediction() -> None: """Test that the model is saved after initialisation in KeyValueStore.""" persist_key = 'test-no_learning-state' async with DefaultRenderingTypePredictor(persistence_enabled=True, persist_state_key=persist_key) as _predictor: pass kvs = await KeyValueStore.open() persisted_data = await kvs.get_value(persist_key) assert persisted_data is not None assert persisted_data['model']['is_fitted'] is False async def test_persistent_prediction() -> None: """Test that the model and resources are saved after training in KeyValueStore.""" persist_key = 'test-persistent-state' async with DefaultRenderingTypePredictor(persistence_enabled=True, persist_state_key=persist_key) as predictor: # Learn some data predictor.store_result( Request.from_url(url='http://www.aaa.com/some/stuff', label='some label'), rendering_type='static' ) kvs = await KeyValueStore.open() persisted_data = await kvs.get_value(persist_key) assert persisted_data is not None assert persisted_data['model']['is_fitted'] is True @pytest.mark.parametrize( ('persistence_enabled', 'same_result'), [ pytest.param(True, True, id='with persistence'), pytest.param(False, False, id='without persistence'), ], ) async def test_persistent_prediction_recovery(*, persistence_enabled: bool, same_result: bool) -> None: """Test that the model and resources are recovered from KeyValueStore.""" persist_key = 'test-persistent-state-recovery' async with DefaultRenderingTypePredictor( detection_ratio=0.01, persistence_enabled=persistence_enabled, persist_state_key=persist_key ) as predictor: # Learn some data predictor.store_result(
Request.from_url(url='http://www.aaa.com/some/stuff', label='some label'), rendering_type='static' ) before_recover_prediction = predictor.predict( Request.from_url(url='http://www.aaa.com/some/stuff', label='some label') ) # Recover predictor async with DefaultRenderingTypePredictor( detection_ratio=0.01, persistence_enabled=True, persist_state_key=persist_key ) as recover_predictor: after_recover_prediction = recover_predictor.predict( Request.from_url(url='http://www.aaa.com/some/stuff', label='some label') ) # If persistence is enabled, the predicted results must be the same. if same_result: assert ( before_recover_prediction.detection_probability_recommendation == after_recover_prediction.detection_probability_recommendation ) else: assert ( before_recover_prediction.detection_probability_recommendation != after_recover_prediction.detection_probability_recommendation ) @pytest.mark.parametrize( ('url_1', 'url_2', 'expected_rounded_similarity'), [ ( 'https://docs.python.org/3/library/itertools.html#itertools.zip_longest', 'https://docs.python.org/3.7/library/itertools.html#itertools.zip_longest', 0.67, ), ('https://differente.com/same', 'https://differenta.com/same', 0), ('https://same.com/almost_the_same', 'https://same.com/almost_the_sama', 1), ('https://same.com/same/extra', 'https://same.com/same', 0.5), ], ) def test_url_similarity(url_1: str, url_2: str, expected_rounded_similarity: float) -> None: assert ( round(calculate_url_similarity(url_1=get_url_components(url_1), url_2=get_url_components(url_2)), 2) == expected_rounded_similarity ) ================================================ FILE: tests/unit/crawlers/_basic/test_basic_crawler.py ================================================ # ruff: noqa: ARG001 from __future__ import annotations import asyncio import json import logging import os import re import sys import time from asyncio import Future from collections import Counter from concurrent.futures import ProcessPoolExecutor from dataclasses 
import dataclass from datetime import timedelta from itertools import product from typing import TYPE_CHECKING, Any, Literal, cast from unittest.mock import AsyncMock, Mock, call, patch import pytest from crawlee import ConcurrencySettings, Glob, service_locator from crawlee._request import Request, RequestState from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod from crawlee._utils.robots import RobotsTxtFile from crawlee.configuration import Configuration from crawlee.crawlers import BasicCrawler from crawlee.errors import RequestCollisionError, SessionError, UserDefinedErrorHandlerError from crawlee.events import Event, EventCrawlerStatusData from crawlee.events._local_event_manager import LocalEventManager from crawlee.request_loaders import RequestList, RequestManagerTandem from crawlee.sessions import Session, SessionPool from crawlee.statistics import FinalStatistics from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient from crawlee.storages import Dataset, KeyValueStore, RequestQueue if TYPE_CHECKING: from collections.abc import Callable, Sequence from pathlib import Path from yarl import URL from crawlee._types import JsonSerializable from crawlee.statistics import StatisticsState async def test_processes_requests_from_explicit_queue() -> None: queue = await RequestQueue.open() await queue.add_requests(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) crawler = BasicCrawler(request_manager=queue) calls = list[str]() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) await crawler.run() assert calls == ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] async def test_processes_requests_from_request_source_tandem() -> None: request_queue = await RequestQueue.open() await request_queue.add_requests( ['https://a.placeholder.com', 'https://b.placeholder.com', 
'https://c.placeholder.com'] ) request_list = RequestList(['https://a.placeholder.com', 'https://d.placeholder.com', 'https://e.placeholder.com']) crawler = BasicCrawler(request_manager=RequestManagerTandem(request_list, request_queue)) calls = set[str]() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: calls.add(context.request.url) await crawler.run() assert calls == { 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', 'https://d.placeholder.com', 'https://e.placeholder.com', } async def test_processes_requests_from_run_args() -> None: crawler = BasicCrawler() calls = list[str]() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) assert calls == ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] async def test_allows_multiple_run_calls() -> None: crawler = BasicCrawler() calls = list[str]() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) assert calls == [ 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', ] async def test_retries_failed_requests() -> None: crawler = BasicCrawler() calls = list[str]() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') await 
crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) assert calls == [ 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', 'https://b.placeholder.com', 'https://b.placeholder.com', 'https://b.placeholder.com', ] async def test_respects_no_retry() -> None: crawler = BasicCrawler(max_request_retries=2) calls = list[str]() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) raise RuntimeError('Arbitrary crash for testing purposes') await crawler.run( [ 'https://a.placeholder.com', 'https://b.placeholder.com', Request.from_url(url='https://c.placeholder.com', no_retry=True), ] ) assert calls == [ 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://a.placeholder.com', 'https://b.placeholder.com', ] async def test_respects_request_specific_max_retries() -> None: crawler = BasicCrawler(max_request_retries=0) calls = list[str]() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) raise RuntimeError('Arbitrary crash for testing purposes') await crawler.run( [ 'https://a.placeholder.com', 'https://b.placeholder.com', Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 1}}), ] ) assert calls == [ 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', 'https://c.placeholder.com', ] async def test_calls_error_handler() -> None: # Data structure to better track the calls to the error handler. @dataclass(frozen=True) class Call: url: str error: Exception # List to store the information of calls to the error handler. 
calls = list[Call]() crawler = BasicCrawler(max_request_retries=2) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.error_handler async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request: # Append the current call information. calls.append(Call(context.request.url, error)) return context.request await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) # Verify that the error handler was called twice assert len(calls) == 2 # Check calls for error_call in calls: assert error_call.url == 'https://b.placeholder.com' assert isinstance(error_call.error, RuntimeError) async def test_calls_error_handler_for_session_errors() -> None: crawler = BasicCrawler( max_session_rotations=1, ) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: raise SessionError('Arbitrary session error for testing purposes') error_handler_mock = AsyncMock() @crawler.error_handler async def error_handler(context: BasicCrawlingContext, error: Exception) -> None: await error_handler_mock(context, error) await crawler.run(['https://crawlee.dev']) assert error_handler_mock.call_count == 1 async def test_handles_error_in_error_handler() -> None: crawler = BasicCrawler(max_request_retries=3) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.error_handler async def error_handler(context: BasicCrawlingContext, error: Exception) -> None: raise RuntimeError('Crash in error handler') with pytest.raises(UserDefinedErrorHandlerError): await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) async def 
test_calls_failed_request_handler() -> None: crawler = BasicCrawler(max_request_retries=3) calls = list[tuple[BasicCrawlingContext, Exception]]() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.failed_request_handler async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: calls.append((context, error)) await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) assert len(calls) == 1 assert calls[0][0].request.url == 'https://b.placeholder.com' assert isinstance(calls[0][1], RuntimeError) @pytest.mark.parametrize('handler', ['failed_request_handler', 'error_handler']) async def test_handlers_use_context_helpers(tmp_path: Path, handler: str) -> None: """Test that context helpers used in `failed_request_handler` and in `error_handler` have effect.""" # Prepare crawler storage_client = FileSystemStorageClient() crawler = BasicCrawler( max_request_retries=1, storage_client=storage_client, configuration=Configuration(storage_dir=str(tmp_path)) ) # Test data rq_alias = 'other' test_data = {'some': 'data'} test_key = 'key' test_value = 'value' test_request = Request.from_url('https://d.placeholder.com') # Request handler with injected error @crawler.router.default_handler async def request_handler(context: BasicCrawlingContext) -> None: raise RuntimeError('Arbitrary crash for testing purposes') # Apply one of the handlers @getattr(crawler, handler) async def handler_implementation(context: BasicCrawlingContext, error: Exception) -> None: await context.push_data(test_data) await context.add_requests(requests=[test_request], rq_alias=rq_alias) kvs = await context.get_key_value_store() await kvs.set_value(test_key, test_value) await crawler.run(['https://b.placeholder.com']) # Verify that the context helpers used in handlers had 
effect on used storages dataset = await Dataset.open(storage_client=storage_client) kvs = await KeyValueStore.open(storage_client=storage_client) rq = await RequestQueue.open(alias=rq_alias, storage_client=storage_client) assert test_value == await kvs.get_value(test_key) assert [test_data] == (await dataset.get_data()).items assert test_request == await rq.fetch_next_request() async def test_handles_error_in_failed_request_handler() -> None: crawler = BasicCrawler(max_request_retries=3) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.failed_request_handler async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: raise RuntimeError('Crash in failed request handler') with pytest.raises(UserDefinedErrorHandlerError): await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) @pytest.mark.parametrize( ('method', 'path', 'payload'), [ pytest.param('GET', 'get', None, id='get send_request'), pytest.param('POST', 'post', b'Hello, world!', id='post send_request'), ], ) async def test_send_request_works(server_url: URL, method: HttpMethod, path: str, payload: None | bytes) -> None: response_data: dict[str, Any] = {} crawler = BasicCrawler(max_request_retries=3) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: response = await context.send_request(str(server_url / path), method=method, payload=payload) response_data['body'] = json.loads(await response.read()) response_data['headers'] = response.headers await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) response_body = response_data.get('body') assert response_body is not None assert response_body.get('data') == (payload.decode() if payload else None) response_headers = 
response_data.get('headers') assert response_headers is not None content_type = response_headers.get('content-type') assert content_type is not None assert content_type == 'application/json' @dataclass class AddRequestsTestInput: start_url: str loaded_url: str requests: Sequence[str | Request] expected_urls: Sequence[str] kwargs: EnqueueLinksKwargs STRATEGY_TEST_URLS = ( 'https://someplace.com/', 'http://someplace.com/index.html', 'https://blog.someplace.com/index.html', 'https://redirect.someplace.com', 'https://other.place.com/index.html', 'https://someplace.jp/', ) INCLUDE_TEST_URLS = ( 'https://someplace.com/', 'https://someplace.com/blog/category/cats', 'https://someplace.com/blog/category/boots', 'https://someplace.com/blog/archive/index.html', 'https://someplace.com/blog/archive/cats', ) @pytest.mark.parametrize( 'test_input', argvalues=[ # Basic use case pytest.param( AddRequestsTestInput( start_url='https://a.placeholder.com', loaded_url='https://a.placeholder.com', requests=[ 'https://a.placeholder.com', Request.from_url('https://b.placeholder.com'), 'https://c.placeholder.com', ], kwargs={}, expected_urls=['https://b.placeholder.com', 'https://c.placeholder.com'], ), id='basic', ), # Enqueue strategy pytest.param( AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[0], loaded_url=STRATEGY_TEST_URLS[0], requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(), expected_urls=STRATEGY_TEST_URLS[1:], ), id='enqueue_strategy_default', ), pytest.param( AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[0], loaded_url=STRATEGY_TEST_URLS[0], requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='all'), expected_urls=STRATEGY_TEST_URLS[1:], ), id='enqueue_strategy_all', ), pytest.param( AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[0], loaded_url=STRATEGY_TEST_URLS[0], requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='same-domain'), expected_urls=STRATEGY_TEST_URLS[1:4], ), id='enqueue_strategy_same_domain', ), pytest.param( 
AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[0], loaded_url=STRATEGY_TEST_URLS[0], requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='same-hostname'), expected_urls=[STRATEGY_TEST_URLS[1]], ), id='enqueue_strategy_same_hostname', ), pytest.param( AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[0], loaded_url=STRATEGY_TEST_URLS[0], requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='same-origin'), expected_urls=[], ), id='enqueue_strategy_same_origin', ), # Enqueue strategy with redirect pytest.param( AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[3], loaded_url=STRATEGY_TEST_URLS[0], requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(), expected_urls=STRATEGY_TEST_URLS[:3] + STRATEGY_TEST_URLS[4:], ), id='redirect_enqueue_strategy_default', ), pytest.param( AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[3], loaded_url=STRATEGY_TEST_URLS[0], requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='all'), expected_urls=STRATEGY_TEST_URLS[:3] + STRATEGY_TEST_URLS[4:], ), id='redirect_enqueue_strategy_all', ), pytest.param( AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[3], loaded_url=STRATEGY_TEST_URLS[0], requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='same-domain'), expected_urls=STRATEGY_TEST_URLS[:3], ), id='redirect_enqueue_strategy_same_domain', ), pytest.param( AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[3], loaded_url=STRATEGY_TEST_URLS[0], requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='same-hostname'), expected_urls=[], ), id='redirect_enqueue_strategy_same_hostname', ), pytest.param( AddRequestsTestInput( start_url=STRATEGY_TEST_URLS[3], loaded_url=STRATEGY_TEST_URLS[0], requests=STRATEGY_TEST_URLS, kwargs=EnqueueLinksKwargs(strategy='same-origin'), expected_urls=[], ), id='redirect_enqueue_strategy_same_origin', ), # Include/exclude pytest.param( AddRequestsTestInput( start_url=INCLUDE_TEST_URLS[0], loaded_url=INCLUDE_TEST_URLS[0], 
requests=INCLUDE_TEST_URLS, kwargs=EnqueueLinksKwargs(include=[Glob('https://someplace.com/**/cats')]), expected_urls=[INCLUDE_TEST_URLS[1], INCLUDE_TEST_URLS[4]], ), id='include_exclude_1', ), pytest.param( AddRequestsTestInput( start_url=INCLUDE_TEST_URLS[0], loaded_url=INCLUDE_TEST_URLS[0], requests=INCLUDE_TEST_URLS, kwargs=EnqueueLinksKwargs(exclude=[Glob('https://someplace.com/**/cats')]), expected_urls=[INCLUDE_TEST_URLS[2], INCLUDE_TEST_URLS[3]], ), id='include_exclude_2', ), pytest.param( AddRequestsTestInput( start_url=INCLUDE_TEST_URLS[0], loaded_url=INCLUDE_TEST_URLS[0], requests=INCLUDE_TEST_URLS, kwargs=EnqueueLinksKwargs( include=[Glob('https://someplace.com/**/cats')], exclude=[Glob('https://**/archive/**')] ), expected_urls=[INCLUDE_TEST_URLS[1]], ), id='include_exclude_3', ), ], ) async def test_enqueue_strategy(test_input: AddRequestsTestInput) -> None: visit = Mock() crawler = BasicCrawler() @crawler.router.handler('start') async def start_handler(context: BasicCrawlingContext) -> None: # Assign test value to loaded_url - BasicCrawler does not do any navigation by itself context.request.loaded_url = test_input.loaded_url await context.add_requests( test_input.requests, **test_input.kwargs, ) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: visit(context.request.url) await crawler.run([Request.from_url(test_input.start_url, label='start')]) visited = {call[0][0] for call in visit.call_args_list} assert visited == set(test_input.expected_urls) async def test_session_rotation(server_url: URL) -> None: session_ids: list[str | None] = [] crawler = BasicCrawler( max_session_rotations=7, max_request_retries=1, ) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: session_ids.append(context.session.id if context.session else None) raise SessionError('Test error') await crawler.run([str(server_url)]) # exactly 7 handler calls happened assert len(session_ids) == 7 # all session 
ids are not None assert None not in session_ids # and each was a different session assert len(set(session_ids)) == 7 async def test_final_statistics() -> None: crawler = BasicCrawler(max_request_retries=2) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: id_param = context.request.get_query_param_from_url('id') assert id_param is not None id = int(id_param) await asyncio.sleep(0.001) if context.request.retry_count == 0 and id % 2 == 0: raise RuntimeError('First crash') if context.request.retry_count == 1 and id % 3 == 0: raise RuntimeError('Second crash') if context.request.retry_count == 2 and id % 4 == 0: raise RuntimeError('Third crash') final_statistics = await crawler.run( [Request.from_url(f'https://someplace.com/?id={id}', label='start') for id in range(50)] ) assert final_statistics.requests_total == 50 assert final_statistics.requests_finished == 45 assert final_statistics.requests_failed == 5 assert final_statistics.retry_histogram == [25, 16, 9] assert final_statistics.request_avg_finished_duration is not None assert final_statistics.request_avg_finished_duration > timedelta() assert final_statistics.request_avg_failed_duration is not None assert final_statistics.request_avg_failed_duration > timedelta() assert final_statistics.request_total_duration > timedelta() assert final_statistics.crawler_runtime > timedelta() assert final_statistics.requests_finished_per_minute > 0 assert final_statistics.requests_failed_per_minute > 0 async def test_crawler_get_storages() -> None: crawler = BasicCrawler() rp = await crawler.get_request_manager() assert isinstance(rp, RequestQueue) dataset = await crawler.get_dataset() assert isinstance(dataset, Dataset) kvs = await crawler.get_key_value_store() assert isinstance(kvs, KeyValueStore) async def test_crawler_run_requests() -> None: crawler = BasicCrawler() seen_urls = list[str]() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: 
seen_urls.append(context.request.url) start_urls = [ 'http://test.io/1', 'http://test.io/2', 'http://test.io/3', ] stats = await crawler.run(start_urls) assert seen_urls == start_urls assert stats.requests_total == 3 assert stats.requests_finished == 3 async def test_context_push_and_get_data() -> None: crawler = BasicCrawler() dataset = await Dataset.open() await dataset.push_data({'a': 1}) assert (await crawler.get_data()).items == [{'a': 1}] @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: await context.push_data({'b': 2}) await dataset.push_data({'c': 3}) assert (await crawler.get_data()).items == [{'a': 1}, {'c': 3}] stats = await crawler.run(['http://test.io/1']) assert (await crawler.get_data()).items == [{'a': 1}, {'c': 3}, {'b': 2}] assert stats.requests_total == 1 assert stats.requests_finished == 1 async def test_context_push_and_get_data_handler_error() -> None: crawler = BasicCrawler() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: await context.push_data({'b': 2}) raise RuntimeError('Watch me crash') stats = await crawler.run(['https://a.placeholder.com']) assert (await crawler.get_data()).items == [] assert stats.requests_total == 1 assert stats.requests_finished == 0 assert stats.requests_failed == 1 async def test_crawler_push_and_export_data(tmp_path: Path) -> None: crawler = BasicCrawler() dataset = await Dataset.open() await dataset.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}]) await dataset.push_data({'id': 2, 'test': 'test'}) await crawler.export_data(path=tmp_path / 'dataset.json') await crawler.export_data(path=tmp_path / 'dataset.csv') assert json.load((tmp_path / 'dataset.json').open()) == [ {'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}, {'id': 2, 'test': 'test'}, ] # On Windows, text mode file writes convert \n to \r\n, resulting in \r\n line endings. # On Unix/Linux, \n remains as \n. 
if sys.platform == 'win32': assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n' else: assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\n0,test\n1,test\n2,test\n' async def test_crawler_export_data_additional_kwargs(tmp_path: Path) -> None: crawler = BasicCrawler() dataset = await Dataset.open() await dataset.push_data({'z': 1, 'a': 2}) json_path = tmp_path / 'dataset.json' csv_path = tmp_path / 'dataset.csv' await crawler.export_data(path=json_path, sort_keys=True, separators=(',', ':')) await crawler.export_data(path=csv_path, delimiter=';', lineterminator='\n') assert json_path.read_text() == '[{"a":2,"z":1}]' assert csv_path.read_text() == 'z;a\n1;2\n' async def test_context_push_and_export_data(tmp_path: Path) -> None: crawler = BasicCrawler() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}]) await context.push_data({'id': 2, 'test': 'test'}) await crawler.run(['http://test.io/1']) await crawler.export_data(path=tmp_path / 'dataset.json') await crawler.export_data(path=tmp_path / 'dataset.csv') assert json.load((tmp_path / 'dataset.json').open()) == [ {'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}, {'id': 2, 'test': 'test'}, ] # On Windows, text mode file writes convert \n to \r\n, resulting in \r\n line endings. # On Unix/Linux, \n remains as \n. 
if sys.platform == 'win32': assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n' else: assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\n0,test\n1,test\n2,test\n' async def test_context_update_kv_store() -> None: crawler = BasicCrawler() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: store = await context.get_key_value_store() await store.set_value('foo', 'bar') await crawler.run(['https://hello.world']) store = await crawler.get_key_value_store() assert (await store.get_value('foo')) == 'bar' async def test_context_use_state() -> None: crawler = BasicCrawler() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: await context.use_state({'hello': 'world'}) await crawler.run(['https://hello.world']) kvs = await crawler.get_key_value_store() value = await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0') assert value == {'hello': 'world'} async def test_crawler_use_state() -> None: crawler = BasicCrawler() await crawler.use_state({'hello': 'world'}) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: # The state set by the crawler must be available in the context of the request handler state = await context.use_state() assert state == {'hello': 'world'} await crawler.run(['https://hello.world']) async def test_context_use_state_crawlers_share_state() -> None: async def handler(context: BasicCrawlingContext) -> None: state = await context.use_state({'urls': []}) assert isinstance(state['urls'], list) state['urls'].append(context.request.url) crawler_1 = BasicCrawler(id=0, request_handler=handler) crawler_2 = BasicCrawler(id=0, request_handler=handler) await crawler_1.run(['https://a.com']) await crawler_2.run(['https://b.com']) kvs = await KeyValueStore.open() assert crawler_1._id == crawler_2._id == 0 assert await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_{crawler_1._id}') == { 'urls': 
['https://a.com', 'https://b.com'] } async def test_crawlers_share_stats() -> None: async def handler(context: BasicCrawlingContext) -> None: await context.use_state({'urls': []}) crawler_1 = BasicCrawler(id=0, request_handler=handler) crawler_2 = BasicCrawler(id=0, request_handler=handler, statistics=crawler_1.statistics) result1 = await crawler_1.run(['https://a.com']) result2 = await crawler_2.run(['https://b.com']) assert crawler_1.statistics == crawler_2.statistics assert result1.requests_finished == 1 assert result2.requests_finished == 2 async def test_context_use_state_crawlers_own_state() -> None: async def handler(context: BasicCrawlingContext) -> None: state = await context.use_state({'urls': []}) assert isinstance(state['urls'], list) state['urls'].append(context.request.url) crawler_1 = BasicCrawler(request_handler=handler) crawler_2 = BasicCrawler(request_handler=handler) await crawler_1.run(['https://a.com']) await crawler_2.run(['https://b.com']) kvs = await KeyValueStore.open() assert await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0') == {'urls': ['https://a.com']} assert await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_1') == {'urls': ['https://b.com']} async def test_context_handlers_use_state(key_value_store: KeyValueStore) -> None: state_in_handler_one: dict[str, JsonSerializable] = {} state_in_handler_two: dict[str, JsonSerializable] = {} state_in_handler_three: dict[str, JsonSerializable] = {} crawler = BasicCrawler() @crawler.router.handler('one') async def handler_one(context: BasicCrawlingContext) -> None: state = await context.use_state({'hello': 'world'}) state_in_handler_one.update(state) state['hello'] = 'new_world' await context.add_requests([Request.from_url('https://crawlee.dev/docs/quick-start', label='two')]) @crawler.router.handler('two') async def handler_two(context: BasicCrawlingContext) -> None: state = await context.use_state({'hello': 'world'}) state_in_handler_two.update(state) state['hello'] = 'last_world' 
@crawler.router.handler('three') async def handler_three(context: BasicCrawlingContext) -> None: state = await context.use_state({'hello': 'world'}) state_in_handler_three.update(state) await crawler.run([Request.from_url('https://crawlee.dev/', label='one')]) await crawler.run([Request.from_url('https://crawlee.dev/docs/examples', label='three')]) # The state in handler_one must match the default state assert state_in_handler_one == {'hello': 'world'} # The state in handler_two must match the state updated in handler_one assert state_in_handler_two == {'hello': 'new_world'} # The state in handler_three must match the final state updated in previous run assert state_in_handler_three == {'hello': 'last_world'} store = await crawler.get_key_value_store() # The state in the KVS must match with the last set state assert (await store.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')) == {'hello': 'last_world'} @pytest.mark.parametrize( 'use_failed_requests', [pytest.param(True, id='failed requests'), pytest.param(False, id='finished requests')] ) async def test_max_requests_per_crawl(*, use_failed_requests: bool) -> None: start_urls = [ 'http://test.io/1', 'http://test.io/2', 'http://test.io/3', 'http://test.io/4', 'http://test.io/5', ] processed_urls = [] # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately crawler = BasicCrawler( concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1), max_requests_per_crawl=3, ) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: if use_failed_requests: raise RuntimeError('Arbitrary crash for testing purposes') processed_urls.append(context.request.url) stats = await crawler.run(start_urls) # Verify that only 3 out of the 5 provided URLs were made if not use_failed_requests: assert len(processed_urls) == 3 assert stats.requests_finished == 3 assert stats.requests_total == 3 async def test_max_crawl_depth() -> None: processed_urls = [] # Set 
max_concurrency to 1 to ensure testing max_requests_per_crawl accurately crawler = BasicCrawler( concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1), max_crawl_depth=2, ) @crawler.router.handler('start') async def start_handler(context: BasicCrawlingContext) -> None: processed_urls.append(context.request.url) await context.add_requests(['https://someplace.com/too-deep']) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: processed_urls.append(context.request.url) start_request = Request.from_url('https://someplace.com/', label='start') start_request.crawl_depth = 2 stats = await crawler.run([start_request]) assert len(processed_urls) == 1 assert stats.requests_total == 1 assert stats.requests_finished == 1 @pytest.mark.parametrize( ('total_requests', 'fail_at_request', 'expected_starts', 'expected_finished'), [ (3, None, 3, 3), (3, 2, 2, 1), ], ids=[ 'all_requests_successful', 'abort_on_second_request', ], ) async def test_abort_on_error( total_requests: int, fail_at_request: int | None, expected_starts: int, expected_finished: int ) -> None: starts_urls = [] crawler = BasicCrawler( concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1), abort_on_error=True, ) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: starts_urls.append(context.request.url) if context.request.user_data.get('n_request') == fail_at_request: raise ValueError('Error request') stats = await crawler.run( [ Request.from_url('https://crawlee.dev', always_enqueue=True, user_data={'n_request': i + 1}) for i in range(total_requests) ] ) assert len(starts_urls) == expected_starts assert stats.requests_finished == expected_finished def test_crawler_log() -> None: crawler = BasicCrawler() assert isinstance(crawler.log, logging.Logger) crawler.log.info('Test log message') async def test_consecutive_runs_purge_request_queue() -> None: crawler = BasicCrawler() visit = 
Mock() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: visit(context.request.url) await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) counter = Counter(args[0][0] for args in visit.call_args_list) assert counter == { 'https://a.placeholder.com': 3, 'https://b.placeholder.com': 3, 'https://c.placeholder.com': 3, } @pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI') @pytest.mark.parametrize( ('statistics_log_format'), [ pytest.param('table', id='With table for logs'), pytest.param('inline', id='With inline logs'), ], ) async def test_logs_final_statistics( monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture, statistics_log_format: Literal['table', 'inline'] ) -> None: # Set the log level to INFO to capture the final statistics log. 
caplog.set_level(logging.INFO) crawler = BasicCrawler(configure_logging=False, statistics_log_format=statistics_log_format) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: await context.push_data({'something': 'something'}) fake_statistics = FinalStatistics( requests_finished=4, requests_failed=33, retry_histogram=[1, 4, 8], request_avg_failed_duration=timedelta(seconds=99), request_avg_finished_duration=timedelta(milliseconds=483), requests_finished_per_minute=0.33, requests_failed_per_minute=0.1, request_total_duration=timedelta(minutes=12), requests_total=37, crawler_runtime=timedelta(minutes=5), ) monkeypatch.setattr(crawler._statistics, 'calculate', lambda: fake_statistics) result = await crawler.run() assert result is fake_statistics final_statistics = next( (record for record in caplog.records if record.msg.startswith('Final')), None, ) assert final_statistics is not None if statistics_log_format == 'table': assert final_statistics.msg.splitlines() == [ 'Final request statistics:', '┌───────────────────────────────┬────────────┐', '│ requests_finished │ 4 │', '│ requests_failed │ 33 │', '│ retry_histogram │ [1, 4, 8] │', '│ request_avg_failed_duration │ 1min 39.0s │', '│ request_avg_finished_duration │ 483.0ms │', '│ requests_finished_per_minute │ 0.33 │', '│ requests_failed_per_minute │ 0.1 │', '│ request_total_duration │ 12min │', '│ requests_total │ 37 │', '│ crawler_runtime │ 5min │', '└───────────────────────────────┴────────────┘', ] else: assert final_statistics.msg == 'Final request statistics:' # `extra` parameters are not defined on `LogRecord`, so we cast to `Any` to access them. 
record = cast('Any', final_statistics) assert record.requests_finished == 4 assert record.requests_failed == 33 assert record.retry_histogram == [1, 4, 8] assert record.request_avg_failed_duration == 99.0 assert record.request_avg_finished_duration == 0.483 assert record.requests_finished_per_minute == 0.33 assert record.requests_failed_per_minute == 0.1 assert record.request_total_duration == 720.0 assert record.requests_total == 37 assert record.crawler_runtime == 300.0 async def test_crawler_manual_stop() -> None: """Test that no new requests are handled after crawler.stop() is called.""" start_urls = [ 'http://test.io/1', 'http://test.io/2', 'http://test.io/3', ] processed_urls = [] # Set max_concurrency to 1 to ensure testing urls are visited one by one in order. crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1)) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: processed_urls.append(context.request.url) if context.request.url == start_urls[1]: crawler.stop() stats = await crawler.run(start_urls) # Verify that only 2 out of the 3 provided URLs were made assert len(processed_urls) == 2 assert stats.requests_total == 2 assert stats.requests_finished == 2 @pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.') async def test_crawler_multiple_stops_in_parallel() -> None: """Test that no new requests are handled after crawler.stop() is called, but ongoing requests can still finish.""" start_urls = [ 'http://test.io/1', 'http://test.io/2', 'http://test.io/3', ] processed_urls = [] # Set concurrency to 2 to ensure two urls are being visited in parallel. crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=2, max_concurrency=2)) both_handlers_started = asyncio.Barrier(2) # type:ignore[attr-defined] # Test is skipped in older Python versions. 
only_one_handler_at_a_time = asyncio.Semaphore(1) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: await both_handlers_started.wait() # Block until both handlers are started. async with only_one_handler_at_a_time: # Reliably create situation where one handler called `crawler.stop()`, while other handler is still running. crawler.stop(reason=f'Stop called on {context.request.url}') processed_urls.append(context.request.url) stats = await crawler.run(start_urls) # Verify that only 2 out of the 3 provided URLs were made assert len(processed_urls) == 2 assert stats.requests_total == 2 assert stats.requests_finished == 2 async def test_services_no_side_effect_on_crawler_init() -> None: custom_configuration = Configuration() custom_event_manager = LocalEventManager.from_config(custom_configuration) custom_storage_client = MemoryStorageClient() _ = BasicCrawler( configuration=custom_configuration, event_manager=custom_event_manager, storage_client=custom_storage_client, ) assert service_locator.get_configuration() is not custom_configuration assert service_locator.get_event_manager() is not custom_event_manager assert service_locator.get_storage_client() is not custom_storage_client async def test_crawler_uses_default_services() -> None: custom_configuration = Configuration() service_locator.set_configuration(custom_configuration) custom_event_manager = LocalEventManager.from_config(custom_configuration) service_locator.set_event_manager(custom_event_manager) custom_storage_client = MemoryStorageClient() service_locator.set_storage_client(custom_storage_client) basic_crawler = BasicCrawler() assert basic_crawler._service_locator.get_configuration() is custom_configuration assert basic_crawler._service_locator.get_event_manager() is custom_event_manager assert basic_crawler._service_locator.get_storage_client() is custom_storage_client async def test_services_crawlers_can_use_different_services() -> None: custom_configuration_1 = 
Configuration() custom_event_manager_1 = LocalEventManager.from_config(custom_configuration_1) custom_storage_client_1 = MemoryStorageClient() custom_configuration_2 = Configuration() custom_event_manager_2 = LocalEventManager.from_config(custom_configuration_2) custom_storage_client_2 = MemoryStorageClient() _ = BasicCrawler( configuration=custom_configuration_1, event_manager=custom_event_manager_1, storage_client=custom_storage_client_1, ) _ = BasicCrawler( configuration=custom_configuration_2, event_manager=custom_event_manager_2, storage_client=custom_storage_client_2, ) async def test_crawler_uses_default_storages(tmp_path: Path) -> None: configuration = Configuration( storage_dir=str(tmp_path), purge_on_start=True, ) service_locator.set_configuration(configuration) dataset = await Dataset.open() kvs = await KeyValueStore.open() rq = await RequestQueue.open() crawler = BasicCrawler() assert dataset is await crawler.get_dataset() assert kvs is await crawler.get_key_value_store() assert rq is await crawler.get_request_manager() async def test_crawler_can_use_other_storages(tmp_path: Path) -> None: configuration = Configuration( storage_dir=str(tmp_path), purge_on_start=True, ) service_locator.set_configuration(configuration) dataset = await Dataset.open() kvs = await KeyValueStore.open() rq = await RequestQueue.open() crawler = BasicCrawler(storage_client=MemoryStorageClient()) assert dataset is not await crawler.get_dataset() assert kvs is not await crawler.get_key_value_store() assert rq is not await crawler.get_request_manager() async def test_crawler_can_use_other_storages_of_same_type(tmp_path: Path) -> None: """Test that crawler can use non-global storage of the same type as global storage without conflicts""" a_path = tmp_path / 'a' b_path = tmp_path / 'b' a_path.mkdir() b_path.mkdir() expected_paths = { path / storage for path, storage in product({a_path, b_path}, {'datasets', 'key_value_stores', 'request_queues'}) } configuration_a = Configuration( 
storage_dir=str(a_path), purge_on_start=True, ) configuration_b = Configuration( storage_dir=str(b_path), purge_on_start=True, ) # Set global configuration service_locator.set_configuration(configuration_a) service_locator.set_storage_client(FileSystemStorageClient()) # Create storages based on the global services dataset = await Dataset.open() kvs = await KeyValueStore.open() rq = await RequestQueue.open() # Set the crawler to use different storage client crawler = BasicCrawler(storage_client=FileSystemStorageClient(), configuration=configuration_b) # Assert that the storages are different assert dataset is not await crawler.get_dataset() assert kvs is not await crawler.get_key_value_store() assert rq is not await crawler.get_request_manager() # Assert that all storages exists on the filesystem for path in expected_paths: assert path.is_dir() async def test_allows_storage_client_overwrite_before_run(monkeypatch: pytest.MonkeyPatch) -> None: custom_storage_client = MemoryStorageClient() crawler = BasicCrawler( storage_client=custom_storage_client, ) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: await context.push_data({'foo': 'bar'}) other_storage_client = MemoryStorageClient() service_locator.set_storage_client(other_storage_client) with monkeypatch.context() as monkey: spy = Mock(wraps=service_locator.get_storage_client) monkey.setattr(service_locator, 'get_storage_client', spy) await crawler.run(['https://does-not-matter.com']) assert spy.call_count >= 1 dataset = await crawler.get_dataset() data = await dataset.get_data() assert data.items == [{'foo': 'bar'}] @pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.') async def test_context_use_state_race_condition_in_handlers(key_value_store: KeyValueStore) -> None: """Two parallel handlers increment global variable obtained by `use_state` method. Result should be incremented by 2. 
Method `use_state` must be implemented in a way that prevents race conditions in such scenario.""" # Test is skipped in older Python versions. from asyncio import Barrier # type:ignore[attr-defined] # noqa: PLC0415 crawler = BasicCrawler() store = await crawler.get_key_value_store() await store.set_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0', {'counter': 0}) handler_barrier = Barrier(2) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: state = cast('dict[str, int]', await context.use_state()) await handler_barrier.wait() # Block until both handlers get the state. state['counter'] += 1 await handler_barrier.wait() # Block until both handlers increment the state. await crawler.run(['https://crawlee.dev/', 'https://crawlee.dev/docs/quick-start']) store = await crawler.get_key_value_store() # Ensure that local state is pushed back to kvs. await store.persist_autosaved_values() assert (await store.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0'))['counter'] == 2 @pytest.mark.run_alone @pytest.mark.flaky( reruns=3, reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1652.' ) @pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.timeout was introduced in Python 3.11.') @pytest.mark.parametrize( 'sleep_type', [ pytest.param('async_sleep'), pytest.param('sync_sleep', marks=pytest.mark.skip(reason='https://github.com/apify/crawlee-python/issues/908')), ], ) async def test_timeout_in_handler(sleep_type: str) -> None: """Test that timeout from request handler is treated the same way as exception thrown in request handler. Handler should be able to time out even if the code causing the timeout is blocking sync code. Crawler should attempt to retry it. This test creates situation where the request handler times out twice, on third retry it does not time out.""" # Test is skipped in older Python versions. 
from asyncio import timeout # type:ignore[attr-defined] # noqa: PLC0415 non_realtime_system_coefficient = 10 handler_timeout = timedelta(seconds=1) max_request_retries = 3 double_handler_timeout_s = handler_timeout.total_seconds() * 2 handler_sleep = iter([double_handler_timeout_s, double_handler_timeout_s, 0]) crawler = BasicCrawler( request_handler_timeout=handler_timeout, max_request_retries=max_request_retries, storage_client=MemoryStorageClient(), ) mocked_handler_before_sleep = Mock() mocked_handler_after_sleep = Mock() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: mocked_handler_before_sleep() if sleep_type == 'async_sleep': await asyncio.sleep(next(handler_sleep)) else: time.sleep(next(handler_sleep)) # noqa:ASYNC251 # Using blocking sleep in async function is the test. # This will not execute if timeout happens. mocked_handler_after_sleep() # Timeout in pytest, because previous implementation would run crawler until following: # "The request queue seems to be stuck for 300.0s, resetting internal state." 
async with timeout(max_request_retries * double_handler_timeout_s * non_realtime_system_coefficient): await crawler.run(['https://a.placeholder.com']) assert crawler.statistics.state.requests_finished == 1 assert mocked_handler_before_sleep.call_count == max_request_retries assert mocked_handler_after_sleep.call_count == 1 @pytest.mark.flaky( reruns=3, reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1649.', ) @pytest.mark.parametrize( ('keep_alive', 'max_requests_per_crawl', 'expected_handled_requests_count'), [ pytest.param(True, 2, 2, id='keep_alive, 2 requests'), pytest.param(True, 1, 1, id='keep_alive, but max_requests_per_crawl achieved after 1 request'), pytest.param(False, 2, 0, id='Crawler without keep_alive (default), crawler finished before adding requests'), ], ) async def test_keep_alive( *, keep_alive: bool, max_requests_per_crawl: int, expected_handled_requests_count: int ) -> None: """Test that crawler can be kept alive without any requests and stopped with `crawler.stop()`. Crawler should stop if `max_requests_per_crawl` is reached regardless of the `keep_alive` flag.""" additional_urls = ['https://a.placeholder.com', 'https://b.placeholder.com'] expected_handler_calls = [call(url) for url in additional_urls[:expected_handled_requests_count]] crawler = BasicCrawler( keep_alive=keep_alive, max_requests_per_crawl=max_requests_per_crawl, # If more request can run in parallel, then max_requests_per_crawl is not deterministic. concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1), storage_client=MemoryStorageClient(), ) mocked_handler = Mock() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: mocked_handler(context.request.url) if context.request == additional_urls[-1]: crawler.stop() crawler_run_task = asyncio.create_task(crawler.run()) # Give some time to crawler to finish(or be in keep_alive state) and add new request. 
# TODO: Replace sleep time by waiting for specific crawler state. # https://github.com/apify/crawlee-python/issues/925 await asyncio.sleep(1) assert crawler_run_task.done() != keep_alive add_request_task = asyncio.create_task(crawler.add_requests(additional_urls)) await asyncio.gather(crawler_run_task, add_request_task) mocked_handler.assert_has_calls(expected_handler_calls) @pytest.mark.parametrize( ('retire'), [ pytest.param(False, id='without retire'), pytest.param(True, id='with retire'), ], ) async def test_session_retire_in_user_handler(*, retire: bool) -> None: crawler = BasicCrawler(session_pool=SessionPool(max_pool_size=1)) sessions = list[str]() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: if context.session: sessions.append(context.session.id) context.session.retire() if retire else None await context.add_requests(['https://b.placeholder.com']) await crawler.run(['https://a.placeholder.com']) # The session should differ if `retire` was called and match otherwise since pool size == 1 if retire: assert sessions[1] != sessions[0] else: assert sessions[1] == sessions[0] async def test_bound_session_to_request() -> None: async with SessionPool() as session_pool: check_session: Session = await session_pool.get_session() used_sessions = list[str]() crawler = BasicCrawler(session_pool=session_pool) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: if context.session: used_sessions.append(context.session.id) requests = [ Request.from_url('https://a.placeholder.com', session_id=check_session.id, always_enqueue=True) for _ in range(10) ] await crawler.run(requests) assert len(used_sessions) == 10 assert set(used_sessions) == {check_session.id} async def test_bound_sessions_to_same_request() -> None: # Use a custom function to avoid errors due to random Session retrieval def create_session_function() -> Callable[[], Session]: counter = -1 def create_session() -> Session: nonlocal 
counter counter += 1 return Session(id=str(counter)) return create_session check_sessions = [str(session_id) for session_id in range(10)] used_sessions = list[str]() crawler = BasicCrawler(session_pool=SessionPool(create_session_function=create_session_function())) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: if context.session: used_sessions.append(context.session.id) requests = [ Request.from_url('https://a.placeholder.com', session_id=str(session_id), use_extended_unique_key=True) for session_id in range(10) ] await crawler.run(requests) assert len(used_sessions) == 10 assert set(used_sessions) == set(check_sessions) async def test_error_bound_session_to_request() -> None: crawler = BasicCrawler(request_handler=AsyncMock()) requests = [Request.from_url('https://a.placeholder.com', session_id='1', always_enqueue=True) for _ in range(10)] stats = await crawler.run(requests) assert stats.requests_total == 10 assert stats.requests_failed == 10 assert stats.retry_histogram == [10] async def test_handle_error_bound_session_to_request() -> None: error_handler_mock = AsyncMock() crawler = BasicCrawler(request_handler=AsyncMock()) @crawler.failed_request_handler async def error_req_hook(context: BasicCrawlingContext, error: Exception) -> None: if isinstance(error, RequestCollisionError): await error_handler_mock(context, error) requests = [Request.from_url('https://a.placeholder.com', session_id='1')] await crawler.run(requests) assert error_handler_mock.call_count == 1 async def test_handles_session_error_in_failed_request_handler() -> None: crawler = BasicCrawler(max_session_rotations=1) handler_requests = set() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: raise SessionError('blocked') @crawler.failed_request_handler async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: handler_requests.add(context.request.url) requests = 
['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] await crawler.run(requests) assert set(requests) == handler_requests async def test_lock_with_get_robots_txt_file_for_url(server_url: URL) -> None: crawler = BasicCrawler(respect_robots_txt_file=True) with patch('crawlee.crawlers._basic._basic_crawler.RobotsTxtFile.find', wraps=RobotsTxtFile.find) as spy: await asyncio.gather( *[asyncio.create_task(crawler._get_robots_txt_file_for_url(str(server_url))) for _ in range(10)] ) # Check that the lock was acquired only once assert spy.call_count == 1 async def test_reduced_logs_from_timed_out_request_handler(caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.INFO) crawler = BasicCrawler( configure_logging=False, max_request_retries=1, request_handler_timeout=timedelta(seconds=1), ) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: # Intentionally add a delay longer than the timeout to trigger the timeout mechanism await asyncio.sleep(10) # INJECTED DELAY # Capture all logs from the 'crawlee' logger at INFO level or higher with caplog.at_level(logging.INFO, logger='crawlee'): await crawler.run([Request.from_url('https://a.placeholder.com')]) # Check for the timeout message in any of the logs found_timeout_message = False for record in caplog.records: if record.message and 'timed out after 1.0 seconds' in record.message: full_message = (record.message or '') + (record.exc_text or '') assert '\n' not in full_message assert '# INJECTED DELAY' in full_message found_timeout_message = True break assert found_timeout_message, 'Expected log message about request handler error was not found.' 
async def test_reduced_logs_from_time_out_in_request_handler(caplog: pytest.LogCaptureFixture) -> None: crawler = BasicCrawler(configure_logging=False, max_request_retries=1) @crawler.router.default_handler async def default_handler(_: BasicCrawlingContext) -> None: await asyncio.wait_for(Future(), timeout=1) # Capture all logs from the 'crawlee' logger at INFO level or higher with caplog.at_level(logging.INFO, logger='crawlee'): await crawler.run([Request.from_url('https://a.placeholder.com')]) # Check for 1 line summary message found_timeout_message = False for record in caplog.records: if re.match( r'Retrying request to .* due to: Timeout raised by user defined handler\. File .*, line .*,' r' in default_handler, await asyncio.wait_for\(Future\(\), timeout=1\)', record.message, ): found_timeout_message = True break assert found_timeout_message, 'Expected log message about request handler error was not found.' async def test_status_message_callback() -> None: """Test that status message callback is called with the correct message.""" status_message_callback = AsyncMock() states: list[dict[str, StatisticsState | None]] = [] async def status_callback( state: StatisticsState, previous_state: StatisticsState | None, message: str ) -> str | None: await status_message_callback(message) states.append({'state': state, 'previous_state': previous_state}) return message crawler = BasicCrawler( status_message_callback=status_callback, status_message_logging_interval=timedelta(seconds=0.01) ) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: await asyncio.sleep(0.1) # Simulate some processing time await crawler.run(['https://a.placeholder.com']) assert status_message_callback.called assert len(states) > 1 first_call = states[0] second_call = states[1] # For the first call, `previous_state` is None assert first_call['state'] is not None assert first_call['previous_state'] is None # For second call, `previous_state` is the first state 
assert second_call['state'] is not None assert second_call['previous_state'] is not None assert second_call['previous_state'] == first_call['state'] async def test_status_message_emit() -> None: event_manager = service_locator.get_event_manager() status_message_listener = Mock() def listener(event_data: EventCrawlerStatusData) -> None: status_message_listener(event_data) event_manager.on(event=Event.CRAWLER_STATUS, listener=listener) crawler = BasicCrawler(request_handler=AsyncMock()) await crawler.run(['https://a.placeholder.com']) event_manager.off(event=Event.CRAWLER_STATUS, listener=listener) assert status_message_listener.called @pytest.mark.parametrize( ('queue_name', 'queue_alias', 'by_id'), [ pytest.param('named-queue', None, False, id='with rq_name'), pytest.param(None, 'alias-queue', False, id='with rq_alias'), pytest.param('id-queue', None, True, id='with rq_id'), ], ) async def test_add_requests_with_rq_param(queue_name: str | None, queue_alias: str | None, *, by_id: bool) -> None: crawler = BasicCrawler() rq = await RequestQueue.open(name=queue_name, alias=queue_alias) if by_id: queue_id = rq.id queue_name = None else: queue_id = None visit_urls = set() check_requests = [ Request.from_url('https://a.placeholder.com'), Request.from_url('https://b.placeholder.com'), Request.from_url('https://c.placeholder.com'), ] @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: visit_urls.add(context.request.url) await context.add_requests(check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run(['https://start.placeholder.com']) requests_from_queue = [] while request := await rq.fetch_next_request(): requests_from_queue.append(request) assert requests_from_queue == check_requests assert visit_urls == {'https://start.placeholder.com'} await rq.drop() @pytest.mark.parametrize( ('queue_name', 'queue_alias', 'queue_id'), [ pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'), 
pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'), pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'), pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'), ], ) async def test_add_requests_error_with_multi_params( queue_id: str | None, queue_name: str | None, queue_alias: str | None ) -> None: crawler = BasicCrawler() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: with pytest.raises(ValueError, match='Only one of `rq_id`, `rq_name` or `rq_alias` can be set'): await context.add_requests( [Request.from_url('https://a.placeholder.com')], rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias, ) await crawler.run(['https://start.placeholder.com']) async def test_crawler_purge_request_queue_uses_same_storage_client() -> None: """Make sure that purge on start does not replace the storage client in the underlying storage manager""" # Set some different storage_client globally and different for Crawlee. service_locator.set_storage_client(FileSystemStorageClient()) unrelated_rq = await RequestQueue.open() unrelated_request = Request.from_url('https://x.placeholder.com') await unrelated_rq.add_request(unrelated_request) crawler = BasicCrawler(storage_client=MemoryStorageClient()) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: context.log.info(context.request.url) for _ in (1, 2): await crawler.run(requests=[Request.from_url('https://a.placeholder.com')], purge_request_queue=True) assert crawler.statistics.state.requests_finished == 1 # Crawler should not fall back to the default storage after the purge assert await unrelated_rq.fetch_next_request() == unrelated_request async def _run_crawler(crawler_id: int | None, requests: list[str], storage_dir: str) -> StatisticsState: """Run crawler and return its statistics state. 
Must be defined like this to be pickable for ProcessPoolExecutor.""" async def request_handler(context: BasicCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Add visited url to crawler state and use it to verify state persistence. state = await context.use_state({'urls': []}) state['urls'] = state.get('urls') assert isinstance(state['urls'], list) state['urls'].append(context.request.url) context.log.info(f'State {state}') crawler = BasicCrawler( id=crawler_id, request_handler=request_handler, concurrency_settings=ConcurrencySettings(max_concurrency=1, desired_concurrency=1), configuration=Configuration( storage_dir=storage_dir, purge_on_start=False, ), ) await crawler.run(requests) return crawler.statistics.state @dataclass class _CrawlerInput: requests: list[str] id: None | int = None def _process_run_crawlers(crawler_inputs: list[_CrawlerInput], storage_dir: str) -> list[StatisticsState]: return [ asyncio.run(_run_crawler(crawler_id=crawler_input.id, requests=crawler_input.requests, storage_dir=storage_dir)) for crawler_input in crawler_inputs ] async def test_crawler_state_persistence(tmp_path: Path) -> None: """Test that crawler statistics and state persist and are loaded correctly. This test simulates starting the crawler process twice, and checks that the statistics include first run.""" state_kvs = await KeyValueStore.open( storage_client=FileSystemStorageClient(), configuration=Configuration(storage_dir=str(tmp_path)) ) with ProcessPoolExecutor() as executor: # Crawl 2 requests in the first run and automatically persist the state. 
first_run_state = executor.submit( _process_run_crawlers, crawler_inputs=[_CrawlerInput(requests=['https://a.placeholder.com', 'https://b.placeholder.com'])], storage_dir=str(tmp_path), ).result()[0] # Expected state after first crawler run assert first_run_state.requests_finished == 2 state = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0') assert state.get('urls') == ['https://a.placeholder.com', 'https://b.placeholder.com'] # Do not reuse the executor to simulate a fresh process to avoid modified class attributes. with ProcessPoolExecutor() as executor: # Crawl 1 additional requests in the second run, but use previously automatically persisted state. second_run_state = executor.submit( _process_run_crawlers, crawler_inputs=[_CrawlerInput(requests=['https://c.placeholder.com'])], storage_dir=str(tmp_path), ).result()[0] # Expected state after second crawler run # 2 requests from first run and 1 request from second run. assert second_run_state.requests_finished == 3 state = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0') assert state.get('urls') == [ 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', ] assert first_run_state.crawler_started_at == second_run_state.crawler_started_at assert first_run_state.crawler_finished_at assert second_run_state.crawler_finished_at assert first_run_state.crawler_finished_at < second_run_state.crawler_finished_at assert first_run_state.crawler_runtime < second_run_state.crawler_runtime async def test_crawler_state_persistence_2_crawlers_with_migration(tmp_path: Path) -> None: """Test that crawler statistics and state persist and are loaded correctly. This test simulates starting the crawler process twice, and checks that the statistics include first run. 
Each time two distinct crawlers are running, and they should keep using their own statistics and state.""" state_kvs = await KeyValueStore.open( storage_client=FileSystemStorageClient(), configuration=Configuration(storage_dir=str(tmp_path)) ) with ProcessPoolExecutor() as executor: # Run 2 crawler, each crawl 1 request in and automatically persist the state. first_run_states = executor.submit( _process_run_crawlers, crawler_inputs=[ _CrawlerInput(requests=['https://a.placeholder.com']), _CrawlerInput(requests=['https://c.placeholder.com']), ], storage_dir=str(tmp_path), ).result() # Expected state after first crawler run assert first_run_states[0].requests_finished == 1 assert first_run_states[1].requests_finished == 1 state_0 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0') assert state_0.get('urls') == ['https://a.placeholder.com'] state_1 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_1') assert state_1.get('urls') == ['https://c.placeholder.com'] with ProcessPoolExecutor() as executor: # Run 2 crawler, each crawl 1 request in and automatically persist the state. 
second_run_states = executor.submit( _process_run_crawlers, crawler_inputs=[ _CrawlerInput(requests=['https://b.placeholder.com']), _CrawlerInput(requests=['https://d.placeholder.com']), ], storage_dir=str(tmp_path), ).result() # Expected state after first crawler run assert second_run_states[0].requests_finished == 2 assert second_run_states[1].requests_finished == 2 state_0 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0') assert state_0.get('urls') == ['https://a.placeholder.com', 'https://b.placeholder.com'] state_1 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_1') assert state_1.get('urls') == ['https://c.placeholder.com', 'https://d.placeholder.com'] async def test_crawler_intermediate_statistics() -> None: """Test that crawler statistics are correctly updating total runtime on every calculate call.""" crawler = BasicCrawler() check_time = timedelta(seconds=0.1) async def wait_for_statistics_initialization() -> None: while not crawler.statistics.active: # noqa: ASYNC110 # It is ok for tests. await asyncio.sleep(0.1) @crawler.router.default_handler async def handler(_: BasicCrawlingContext) -> None: await asyncio.sleep(check_time.total_seconds() * 5) # Start crawler and wait until statistics are initialized. crawler_task = asyncio.create_task(crawler.run(['https://a.placeholder.com'])) await wait_for_statistics_initialization() # Wait some time and check that runtime is updated. 
await asyncio.sleep(check_time.total_seconds()) crawler.statistics.calculate() assert crawler.statistics.state.crawler_runtime >= check_time # Wait for crawler to finish await crawler_task async def test_protect_request_in_run_handlers() -> None: """Test that request in crawling context are protected in run handlers.""" request_queue = await RequestQueue.open(name='state-test') request = Request.from_url('https://test.url/', user_data={'request_state': ['initial']}) crawler = BasicCrawler(request_manager=request_queue, max_request_retries=0) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: if isinstance(context.request.user_data['request_state'], list): context.request.user_data['request_state'].append('modified') raise ValueError('Simulated error after modifying request') await crawler.run([request]) check_request = await request_queue.get_request(request.unique_key) assert check_request is not None assert check_request.user_data['request_state'] == ['initial'] await request_queue.drop() async def test_new_request_error_handler() -> None: """Test that error in new_request_handler is handled properly.""" queue = await RequestQueue.open() crawler = BasicCrawler( request_manager=queue, ) request = Request.from_url('https://a.placeholder.com') @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: if '|test' in context.request.unique_key: return raise ValueError('This error should not be handled by error handler') @crawler.error_handler async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request | None: return Request.from_url( context.request.url, unique_key=f'{context.request.unique_key}|test', ) await crawler.run([request]) original_request = await queue.get_request(request.unique_key) error_request = await queue.get_request(f'{request.unique_key}|test') assert original_request is not None assert original_request.state == RequestState.ERROR_HANDLER assert 
original_request.was_already_handled assert error_request is not None assert error_request.state == RequestState.DONE assert error_request.was_already_handled ================================================ FILE: tests/unit/crawlers/_basic/test_context_pipeline.py ================================================ from __future__ import annotations import logging from dataclasses import dataclass from typing import TYPE_CHECKING from unittest.mock import AsyncMock import pytest from crawlee import Request from crawlee._types import BasicCrawlingContext from crawlee.crawlers import ContextPipeline from crawlee.errors import ContextPipelineFinalizationError, ContextPipelineInitializationError, RequestHandlerError from crawlee.sessions._session import Session if TYPE_CHECKING: from collections.abc import AsyncGenerator @dataclass(frozen=True) class EnhancedCrawlingContext(BasicCrawlingContext): foo: str @dataclass(frozen=True) class MoreEnhancedCrawlingContext(EnhancedCrawlingContext): bar: int async def test_calls_consumer_without_middleware() -> None: consumer = AsyncMock() pipeline = ContextPipeline() context = BasicCrawlingContext( request=Request.from_url(url='https://test.io/'), send_request=AsyncMock(), add_requests=AsyncMock(), session=Session(), proxy_info=AsyncMock(), push_data=AsyncMock(), use_state=AsyncMock(), get_key_value_store=AsyncMock(), log=logging.getLogger(), ) await pipeline(context, consumer) consumer.assert_called_once_with(context) async def test_calls_consumers_and_middlewares() -> None: events = list[str]() async def consumer(context: MoreEnhancedCrawlingContext) -> None: events.append('consumer_called') assert context.bar == 4 async def middleware_a(context: BasicCrawlingContext) -> AsyncGenerator[EnhancedCrawlingContext, None]: events.append('middleware_a_in') yield EnhancedCrawlingContext( request=context.request, foo='foo', send_request=AsyncMock(), add_requests=AsyncMock(), session=context.session, proxy_info=AsyncMock(), 
push_data=AsyncMock(), use_state=AsyncMock(), get_key_value_store=AsyncMock(), log=logging.getLogger(), ) events.append('middleware_a_out') async def middleware_b(context: EnhancedCrawlingContext) -> AsyncGenerator[MoreEnhancedCrawlingContext, None]: events.append('middleware_b_in') yield MoreEnhancedCrawlingContext( request=context.request, foo=context.foo, bar=4, send_request=AsyncMock(), add_requests=AsyncMock(), session=context.session, proxy_info=AsyncMock(), push_data=AsyncMock(), use_state=AsyncMock(), get_key_value_store=AsyncMock(), log=logging.getLogger(), ) events.append('middleware_b_out') pipeline = ContextPipeline[BasicCrawlingContext]().compose(middleware_a).compose(middleware_b) context = BasicCrawlingContext( request=Request.from_url(url='https://test.io/'), send_request=AsyncMock(), add_requests=AsyncMock(), session=Session(), proxy_info=AsyncMock(), push_data=AsyncMock(), use_state=AsyncMock(), get_key_value_store=AsyncMock(), log=logging.getLogger(), ) await pipeline(context, consumer) assert events == [ 'middleware_a_in', 'middleware_b_in', 'consumer_called', 'middleware_b_out', 'middleware_a_out', ] async def test_wraps_consumer_errors() -> None: consumer = AsyncMock(side_effect=RuntimeError('Arbitrary crash for testing purposes')) pipeline = ContextPipeline() context = BasicCrawlingContext( request=Request.from_url(url='https://test.io/'), send_request=AsyncMock(), add_requests=AsyncMock(), session=Session(), proxy_info=AsyncMock(), push_data=AsyncMock(), use_state=AsyncMock(), get_key_value_store=AsyncMock(), log=logging.getLogger(), ) with pytest.raises(RequestHandlerError): await pipeline(context, consumer) async def test_handles_exceptions_in_middleware_initialization() -> None: consumer = AsyncMock() cleanup = AsyncMock() async def step_1(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]: yield context await cleanup() async def step_2(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]: 
raise RuntimeError('Crash during middleware initialization') yield context pipeline = ContextPipeline().compose(step_1).compose(step_2) context = BasicCrawlingContext( request=Request.from_url(url='https://test.io/'), send_request=AsyncMock(), add_requests=AsyncMock(), session=Session(), proxy_info=AsyncMock(), push_data=AsyncMock(), use_state=AsyncMock(), get_key_value_store=AsyncMock(), log=logging.getLogger(), ) with pytest.raises(ContextPipelineInitializationError): await pipeline(context, consumer) assert not consumer.called assert cleanup.called async def test_handles_exceptions_in_middleware_finalization() -> None: consumer = AsyncMock() cleanup = AsyncMock() async def step_1(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]: yield context await cleanup() async def step_2(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]: yield context raise RuntimeError('Crash during middleware finalization') pipeline = ContextPipeline().compose(step_1).compose(step_2) context = BasicCrawlingContext( request=Request.from_url(url='https://test.io/'), send_request=AsyncMock(), add_requests=AsyncMock(), session=Session(), proxy_info=AsyncMock(), push_data=AsyncMock(), use_state=AsyncMock(), get_key_value_store=AsyncMock(), log=logging.getLogger(), ) with pytest.raises(ContextPipelineFinalizationError): await pipeline(context, consumer) assert consumer.called assert not cleanup.called ================================================ FILE: tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py ================================================ from __future__ import annotations import asyncio from datetime import timedelta from typing import TYPE_CHECKING from unittest import mock import pytest from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction, SkippedReason from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext from 
crawlee.storages import RequestQueue if TYPE_CHECKING: from yarl import URL from crawlee._request import RequestOptions from crawlee.http_clients._base import HttpClient async def test_basic(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) handler = mock.AsyncMock() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: links = context.soup.find_all('a') await handler(links) await crawler.run([str(server_url / 'start_enqueue')]) assert handler.called # The handler should find three links assert len(handler.call_args[0][0]) == 3 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None: redirect_target = str(server_url / 'start_enqueue') redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) requests = [redirect_url] crawler = BeautifulSoupCrawler(http_client=http_client) visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links() await crawler.run(requests) expected_visit_calls = [ mock.call(redirect_url), mock.call(str(server_url / 'sub_index')), mock.call(str(server_url / 'page_1')), mock.call(str(server_url / 'page_2')), mock.call(str(server_url / 'page_3')), mock.call(str(server_url / 'page_4')), mock.call(str(server_url / 'base_page')), mock.call(str(server_url / 'base_subpath/page_5')), ] assert visit.mock_calls[0] == expected_visit_calls[0] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None: redirect_target = str(server_url / 'start_enqueue_non_href') redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) requests = [redirect_url] crawler = BeautifulSoupCrawler(http_client=http_client) 
visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links(selector='img', attribute='src') await crawler.run(requests) expected_visit_calls = [ mock.call(redirect_url), mock.call(str(server_url / 'base_subpath/image_1')), mock.call(str(server_url / 'image_2')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links(selector='a.foo') await crawler.run([str(server_url / 'start_enqueue')]) expected_visit_calls = [ mock.call(str(server_url / 'start_enqueue')), mock.call(str(server_url / 'sub_index')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None: start_urls = [str(server_url / 'start_enqueue')] processed_urls = [] # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately crawler = BeautifulSoupCrawler( concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1), max_requests_per_crawl=3, http_client=http_client, ) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await context.enqueue_links() processed_urls.append(context.request.url) stats = await crawler.run(start_urls) # Verify that only 3 out of the possible 5 requests were made assert len(processed_urls) == 3 assert stats.requests_total == 3 assert stats.requests_finished == 3 async def test_enqueue_links_with_transform_request_function(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) visit = 
mock.Mock() headers = [] def test_transform_request_function( request_options: RequestOptions, ) -> RequestOptions | RequestTransformAction: if 'page_3' in request_options['url']: return 'skip' request_options['headers'] = HttpHeaders({'transform-header': 'my-header'}) return request_options @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: visit(context.request.url) headers.append(context.request.headers) await context.enqueue_links(transform_request_function=test_transform_request_function) await crawler.run([str(server_url / 'start_enqueue')]) # url /page_3 should not be visited expected_visit_calls = [ mock.call(str(server_url / 'start_enqueue')), mock.call(str(server_url / 'sub_index')), mock.call(str(server_url / 'page_1')), mock.call(str(server_url / 'page_2')), mock.call(str(server_url / 'base_page')), mock.call(str(server_url / 'page_4')), mock.call(str(server_url / 'base_subpath/page_5')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) # # all urls added to `enqueue_links` must have a custom header assert headers[1]['transform-header'] == 'my-header' assert headers[2]['transform-header'] == 'my-header' assert headers[3]['transform-header'] == 'my-header' async def test_handle_blocked_request(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler(max_session_rotations=1, http_client=http_client) stats = await crawler.run([str(server_url / 'incapsula')]) assert stats.requests_failed == 1 def test_default_logger() -> None: assert BeautifulSoupCrawler().log.name == 'BeautifulSoupCrawler' async def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True) visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links() await 
crawler.run([str(server_url / 'start_enqueue')]) expected_visit_calls = [ mock.call(str(server_url / 'start_enqueue')), mock.call(str(server_url / 'sub_index')), mock.call(str(server_url / 'base_page')), mock.call(str(server_url / 'base_subpath/page_5')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None: """Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt.""" visit = mock.Mock() fail = mock.Mock() crawler = BeautifulSoupCrawler( http_client=http_client, respect_robots_txt_file=True, max_request_retries=0, ) @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links(strategy='all') @crawler.failed_request_handler async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None: fail(context.request.url) await crawler.run([str(server_url / 'problematic_links')]) # Email must be skipped # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler. expected_visit_calls = [ mock.call(str(server_url / 'problematic_links')), mock.call('https://avatars.githubusercontent.com/apify'), ] visit.assert_has_calls(expected_visit_calls, any_order=True) # The budplaceholder.com does not exist. 
expected_fail_calls = [ mock.call('https://budplaceholder.com/'), ] fail.assert_has_calls(expected_fail_calls, any_order=True) async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True) skip = mock.Mock() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await context.enqueue_links() @crawler.on_skipped_request async def skipped_hook(url: str, _reason: SkippedReason) -> None: skip(url) await crawler.run([str(server_url / 'start_enqueue')]) expected_skip_calls = [ mock.call(str(server_url / 'page_1')), mock.call(str(server_url / 'page_2')), mock.call(str(server_url / 'page_3')), mock.call(str(server_url / 'page_4')), ] skip.assert_has_calls(expected_skip_calls, any_order=True) async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) extracted_links: list[str] = [] @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: links = await context.extract_links(exclude=[Glob(f'{server_url}sub_index')]) extracted_links.extend(request.url for request in links) await crawler.run([str(server_url / 'start_enqueue')]) assert len(extracted_links) == 1 assert extracted_links[0] == str(server_url / 'page_1') async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) extracted_links: list[str] = [] @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: links = await context.extract_links(selector='li', attribute='data-href') extracted_links.extend(request.url for request in links) await crawler.run([str(server_url / 'non_href_links')]) assert len(extracted_links) == 1 assert extracted_links[0] == str(server_url / 'page_2') @pytest.mark.parametrize( 
('queue_name', 'queue_alias', 'by_id'), [ pytest.param('named-queue', None, False, id='with rq_name'), pytest.param(None, 'alias-queue', False, id='with rq_alias'), pytest.param('id-queue', None, True, id='with rq_id'), ], ) async def test_enqueue_links_with_rq_param( server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool ) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) rq = await RequestQueue.open(name=queue_name, alias=queue_alias) if by_id: queue_name = None queue_id = rq.id else: queue_id = None visit_urls: set[str] = set() @crawler.router.default_handler async def handler(context: BeautifulSoupCrawlingContext) -> None: visit_urls.add(context.request.url) await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run([str(server_url / 'start_enqueue')]) requests_from_queue: list[str] = [] while request := await rq.fetch_next_request(): requests_from_queue.append(request.url) assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')} assert visit_urls == {str(server_url / 'start_enqueue')} await rq.drop() @pytest.mark.parametrize( ('queue_name', 'queue_alias', 'by_id'), [ pytest.param('named-queue', None, False, id='with rq_name'), pytest.param(None, 'alias-queue', False, id='with rq_alias'), pytest.param('id-queue', None, True, id='with rq_id'), ], ) async def test_enqueue_links_requests_with_rq_param( server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool ) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) rq = await RequestQueue.open(name=queue_name, alias=queue_alias) if by_id: queue_name = None queue_id = rq.id else: queue_id = None visit_urls: set[str] = set() check_requests: list[str] = [ 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', ] @crawler.router.default_handler async def handler(context: 
BeautifulSoupCrawlingContext) -> None: visit_urls.add(context.request.url) await context.enqueue_links( requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id, strategy='all' ) await crawler.run([str(server_url / 'start_enqueue')]) requests_from_queue: list[str] = [] while request := await rq.fetch_next_request(): requests_from_queue.append(request.url) assert set(requests_from_queue) == set(check_requests) assert visit_urls == {str(server_url / 'start_enqueue')} await rq.drop() @pytest.mark.parametrize( ('queue_id', 'queue_name', 'queue_alias'), [ pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'), pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'), pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'), pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'), ], ) async def test_enqueue_links_error_with_multi_params( server_url: URL, http_client: HttpClient, queue_id: str | None, queue_name: str | None, queue_alias: str | None ) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) @crawler.router.default_handler async def handler(context: BeautifulSoupCrawlingContext) -> None: with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run([str(server_url / 'start_enqueue')]) async def test_navigation_timeout_on_slow_request(server_url: URL, http_client: HttpClient) -> None: """Test that navigation_timeout causes TimeoutError on slow HTTP requests.""" crawler = BeautifulSoupCrawler( http_client=http_client, navigation_timeout=timedelta(seconds=1), max_request_retries=0, ) failed_request_handler = mock.AsyncMock() crawler.failed_request_handler(failed_request_handler) request_handler = mock.AsyncMock() crawler.router.default_handler(request_handler) # Request endpoint that delays 5 seconds - should timeout at 1 
second await crawler.run([str(server_url.with_path('/slow').with_query(delay=5))]) assert failed_request_handler.call_count == 1 assert isinstance(failed_request_handler.call_args[0][1], asyncio.TimeoutError) async def test_navigation_timeout_applies_to_hooks(server_url: URL) -> None: crawler = BeautifulSoupCrawler( navigation_timeout=timedelta(seconds=1), max_request_retries=0, ) request_handler = mock.AsyncMock() crawler.router.default_handler(request_handler) crawler.pre_navigation_hook(lambda _: asyncio.sleep(1)) # Pre-navigation hook takes 1 second (exceeds navigation timeout), so the URL will not be handled result = await crawler.run([str(server_url)]) assert result.requests_failed == 1 assert result.requests_finished == 0 assert request_handler.call_count == 0 async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler( http_client=http_client, request_handler_timeout=timedelta(seconds=0.5), max_request_retries=0, ) request_handler = mock.AsyncMock() crawler.router.default_handler(request_handler) # Navigation takes 1 second (exceeds handler timeout), but should still succeed result = await crawler.run([str(server_url.with_path('/slow').with_query(delay=1))]) assert result.requests_failed == 0 assert result.requests_finished == 1 assert request_handler.call_count == 1 async def test_enqueue_strategy_after_redirect(server_url: URL, redirect_server_url: URL) -> None: crawler = BeautifulSoupCrawler() handler_calls = mock.AsyncMock() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await handler_calls(context.request.url) target_url = str(server_url.with_path('redirect').with_query(url=str(redirect_server_url))) await context.enqueue_links(requests=[Request.from_url(target_url)], strategy='same-origin') await crawler.run([str(server_url)]) assert handler_calls.called assert handler_calls.call_count == 1 async def 
test_enqueue_links_with_limit(server_url: URL, http_client: HttpClient) -> None: start_url = str(server_url / 'sub_index') requests = [start_url] crawler = BeautifulSoupCrawler(http_client=http_client) visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links(limit=1) await crawler.run(requests) # Only one link should be enqueued from sub_index due to the limit expected_visit_calls = [ mock.call(start_url), mock.call(str(server_url / 'page_3')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) ================================================ FILE: tests/unit/crawlers/_http/test_http_crawler.py ================================================ from __future__ import annotations import json from typing import TYPE_CHECKING from unittest.mock import AsyncMock, Mock from urllib.parse import parse_qs, urlencode import pytest from crawlee import ConcurrencySettings, Request, RequestState from crawlee.crawlers import HttpCrawler from crawlee.sessions import SessionPool from crawlee.statistics import Statistics from crawlee.storages import RequestQueue from tests.unit.server_endpoints import HELLO_WORLD if TYPE_CHECKING: from collections.abc import Awaitable, Callable from yarl import URL from crawlee._types import BasicCrawlingContext from crawlee.crawlers import HttpCrawlingContext from crawlee.http_clients._base import HttpClient # Payload, e.g. data for a form submission. 
PAYLOAD = { 'custname': 'John Doe', 'custtel': '1234567890', 'custemail': 'johndoe@example.com', 'size': 'large', 'topping': '["bacon", "cheese", "mushroom"]', 'delivery': '13:00', 'comments': 'Please ring the doorbell upon arrival.', } @pytest.fixture async def mock_request_handler() -> Callable[[HttpCrawlingContext], Awaitable[None]] | AsyncMock: return AsyncMock() @pytest.fixture async def crawler( http_client: HttpClient, mock_request_handler: Callable[[HttpCrawlingContext], Awaitable[None]] ) -> HttpCrawler: return HttpCrawler(http_client=http_client, request_handler=mock_request_handler) @pytest.fixture async def crawler_without_retries( mock_request_handler: Callable[[HttpCrawlingContext], Awaitable[None]], ) -> HttpCrawler: return HttpCrawler( request_handler=mock_request_handler, retry_on_blocked=False, max_request_retries=0, ) async def test_fetches_html( crawler: HttpCrawler, mock_request_handler: AsyncMock, server_url: URL, ) -> None: await crawler.add_requests([str(server_url)]) await crawler.run() mock_request_handler.assert_called_once() assert mock_request_handler.call_args[0][0].request.url == str(server_url) async def test_handles_redirects(crawler: HttpCrawler, mock_request_handler: AsyncMock, server_url: URL) -> None: redirect_target = str(server_url) redirect_url = str(server_url.with_path('redirect').with_query(url=redirect_target)) await crawler.add_requests([redirect_url]) await crawler.run() mock_request_handler.assert_called_once() assert mock_request_handler.call_args[0][0].request.loaded_url == redirect_target assert mock_request_handler.call_args[0][0].request.url == redirect_url @pytest.mark.parametrize( ('additional_http_error_status_codes', 'ignore_http_error_status_codes', 'expected_number_error'), [ # error without retry for all 4xx statuses pytest.param([], [], 1, id='default_behavior'), # make retry for codes in `additional_http_error_status_codes` list pytest.param([402], [], 3, id='additional_status_codes'), # take as 
successful status codes from the `ignore_http_error_status_codes` list pytest.param([], [402], 0, id='ignore_error_status_codes'), # check precedence for `additional_http_error_status_codes` pytest.param([402], [402], 3, id='additional_and_ignore'), ], ) async def test_handles_client_errors( additional_http_error_status_codes: list[int], ignore_http_error_status_codes: list[int], expected_number_error: int, mock_request_handler: AsyncMock, server_url: URL, ) -> None: crawler = HttpCrawler( request_handler=mock_request_handler, additional_http_error_status_codes=additional_http_error_status_codes, ignore_http_error_status_codes=ignore_http_error_status_codes, max_request_retries=2, ) await crawler.add_requests([str(server_url / 'status/402')]) await crawler.run() assert crawler.statistics.error_tracker.total == expected_number_error # Request handler should not be called for error status codes. if expected_number_error: mock_request_handler.assert_not_called() else: mock_request_handler.assert_called() @pytest.mark.parametrize( ('ignore_http_error_status_codes', 'use_session_pool', 'expected_session_rotate', 'expected_number_error'), [ # change session and retry for no block 4xx statuses pytest.param([], True, 4, 1, id='default_behavior'), # error without retry for all 4xx statuses pytest.param([], False, 0, 1, id='default_behavior_without_session_pool'), # take as successful status codes from the `ignore_http_error_status_codes` list with Session Pool pytest.param([403], True, 0, 0, id='ignore_error_status_codes'), # take as successful status codes from the `ignore_http_error_status_codes` list without Session Pool pytest.param([403], False, 0, 0, id='ignore_error_status_codes_without_session_pool'), ], ) async def test_handles_session_block_errors( *, ignore_http_error_status_codes: list[int], use_session_pool: bool, expected_session_rotate: int, expected_number_error: int, mock_request_handler: AsyncMock, server_url: URL, ) -> None: crawler = HttpCrawler( 
request_handler=mock_request_handler, ignore_http_error_status_codes=ignore_http_error_status_codes, max_request_retries=3, max_session_rotations=5, use_session_pool=use_session_pool, ) await crawler.add_requests([str(server_url / 'status/403')]) await crawler.run() assert crawler.statistics.error_tracker.total == expected_number_error assert crawler.statistics.error_tracker_retry.total == expected_session_rotate # Request handler should not be called for error status codes. if expected_number_error: mock_request_handler.assert_not_called() else: mock_request_handler.assert_called() async def test_handles_server_error(crawler: HttpCrawler, mock_request_handler: AsyncMock, server_url: URL) -> None: await crawler.add_requests([str(server_url / 'status/500')]) await crawler.run() mock_request_handler.assert_not_called() async def test_stores_cookies(http_client: HttpClient, server_url: URL) -> None: visit = Mock() track_session_usage = Mock() async with SessionPool(max_pool_size=1) as session_pool: crawler = HttpCrawler( # /cookies/set might redirect us to a page that we can't access - no problem, we only care about cookies ignore_http_error_status_codes=[401], session_pool=session_pool, http_client=http_client, ) @crawler.router.default_handler async def handler(context: HttpCrawlingContext) -> None: visit(context.request.url) track_session_usage(context.session.id if context.session else None) await crawler.run( [ str(server_url.with_path('set_cookies').extend_query(a=1)), str(server_url.with_path('set_cookies').extend_query(b=2)), str(server_url.with_path('set_cookies').extend_query(c=3)), ] ) visited = {call[0][0] for call in visit.call_args_list} assert len(visited) == 3 session_ids = {call[0][0] for call in track_session_usage.call_args_list} assert len(session_ids) == 1 session = await session_pool.get_session_by_id(session_ids.pop()) assert session is not None assert {cookie['name']: cookie['value'] for cookie in session.cookies.get_cookies_as_dicts()} == { 
'a': '1', 'b': '2', 'c': '3', } async def test_do_not_retry_on_client_errors(crawler: HttpCrawler, server_url: URL) -> None: await crawler.add_requests([str(server_url / 'status/400')]) stats = await crawler.run() # by default, client errors are not retried assert stats.requests_failed == 1 assert stats.retry_histogram == [1] assert stats.requests_total == 1 async def test_http_status_statistics(crawler: HttpCrawler, server_url: URL) -> None: await crawler.add_requests([str(server_url.with_path('status/500').with_query(id=i)) for i in range(10)]) await crawler.add_requests([str(server_url.with_path('status/402').with_query(id=i)) for i in range(10)]) await crawler.add_requests([str(server_url.with_path('status/403').with_query(id=i)) for i in range(10)]) await crawler.add_requests([str(server_url.with_query(id=i)) for i in range(10)]) await crawler.run() assert crawler.statistics.state.requests_with_status_code == { '200': 10, '403': 100, # block errors change session and retry '402': 10, # client errors are not retried by default '500': 40, # server errors are retried by default } async def test_sending_payload_as_raw_data(http_client: HttpClient, server_url: URL) -> None: crawler = HttpCrawler(http_client=http_client) responses = [] @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: response = json.loads(await context.http_response.read()) # The post endpoint returns the provided payload in the response. responses.append(response) encoded_payload = urlencode(PAYLOAD).encode() request = Request.from_url( url=str(server_url / 'post'), method='POST', payload=encoded_payload, ) await crawler.run([request]) assert len(responses) == 1, 'Request handler should be called exactly once.' assert responses[0]['data'].encode() == encoded_payload, 'Response payload data does not match the sent payload.' # The reconstructed payload data should match the original payload. 
We have to flatten the values, because # parse_qs returns a list of values for each key. response_data = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(responses[0]['data']).items()} assert response_data == PAYLOAD, 'The reconstructed payload data does not match the sent payload.' assert responses[0]['json'] is None, 'Response JSON data should be empty when only raw data is sent.' assert responses[0]['form'] == {}, 'Response form data should be empty when only raw data is sent.' async def test_sending_payload_as_form_data(http_client: HttpClient, server_url: URL) -> None: crawler = HttpCrawler(http_client=http_client) responses = [] @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: response = json.loads(await context.http_response.read()) # The /post endpoint returns the provided payload in the response. responses.append(response) request = Request.from_url( url=str(server_url / 'post'), method='POST', headers={'content-type': 'application/x-www-form-urlencoded'}, payload=urlencode(PAYLOAD).encode(), ) await crawler.run([request]) assert len(responses) == 1, 'Request handler should be called exactly once.' assert responses[0]['form'] == PAYLOAD, 'Form data in response does not match the sent payload.' assert responses[0]['json'] is None, 'Response JSON data should be empty when only form data is sent.' assert responses[0]['data'] == '', 'Response raw data should be empty when only form data is sent.' async def test_sending_payload_as_json(http_client: HttpClient, server_url: URL) -> None: crawler = HttpCrawler(http_client=http_client) responses = [] @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: response = json.loads(await context.http_response.read()) # The /post endpoint returns the provided payload in the response. 
responses.append(response) json_payload = json.dumps(PAYLOAD).encode() request = Request.from_url( url=str(server_url / 'post'), method='POST', payload=json_payload, headers={'content-type': 'application/json'}, ) await crawler.run([request]) assert len(responses) == 1, 'Request handler should be called exactly once.' assert responses[0]['data'].encode() == json_payload, 'Response raw JSON data does not match the sent payload.' assert responses[0]['json'] == PAYLOAD, 'Response JSON data does not match the sent payload.' assert responses[0]['form'] == {}, 'Response form data should be empty when only JSON data is sent.' async def test_sending_url_query_params(http_client: HttpClient, server_url: URL) -> None: crawler = HttpCrawler(http_client=http_client) responses = [] @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: response = json.loads(await context.http_response.read()) # The /get endpoint returns the provided query parameters in the response. responses.append(response) base_url = server_url / 'get' query_params = {'param1': 'value1', 'param2': 'value2'} request = Request.from_url(url=str(base_url.extend_query(query_params))) await crawler.run([request]) assert len(responses) == 1, 'Request handler should be called exactly once.' response_args = responses[0]['args'] assert response_args == query_params, 'Reconstructed query params must match the original query params.' async def test_http_crawler_pre_navigation_hook_execution(server_url: URL) -> None: """Test that pre-navigation hooks are executed.""" crawler = HttpCrawler(request_handler=AsyncMock()) call_mock = AsyncMock() # Register pre navigation hook. @crawler.pre_navigation_hook async def pre_nav_hook(context: BasicCrawlingContext) -> None: await call_mock(context.request.loaded_url) await crawler.run([str(server_url)]) # `pre_navigation_hook` is called before the request is made, so the loaded URL should be None. 
call_mock.assert_called_once_with(None) async def test_http_crawler_post_navigation_hook_execution(server_url: URL) -> None: """Test that post-navigation hooks are executed.""" crawler = HttpCrawler(request_handler=AsyncMock()) call_mock = AsyncMock() # Register post navigation hook. @crawler.post_navigation_hook async def post_nav_hook(context: HttpCrawlingContext) -> None: await call_mock(context.request.loaded_url) await crawler.run([str(server_url)]) # `post_navigation_hook` is called after the request is made, so the loaded URL should be the result URL. call_mock.assert_called_once_with(str(server_url)) async def test_http_crawler_navigation_hooks_order(server_url: URL) -> None: """Test that post-navigation hooks are executed in correct order.""" execution_order = [] crawler = HttpCrawler() # Register final context handler. @crawler.router.default_handler async def default_request_handler(_context: HttpCrawlingContext) -> None: execution_order.append('final handler') # Register pre navigation hook. @crawler.pre_navigation_hook async def pre_nav_hook_1(_context: BasicCrawlingContext) -> None: execution_order.append('pre-navigation-hook 1') # Register pre navigation hook. @crawler.pre_navigation_hook async def pre_nav_hook(_context: BasicCrawlingContext) -> None: execution_order.append('pre-navigation-hook 2') # Register post navigation hook. @crawler.post_navigation_hook async def post_nav_hook_1(_context: HttpCrawlingContext) -> None: execution_order.append('post-navigation-hook 1') # Register post navigation hook. 
@crawler.post_navigation_hook async def post_nav_hook_2(_context: HttpCrawlingContext) -> None: execution_order.append('post-navigation-hook 2') await crawler.run([str(server_url)]) assert execution_order == [ 'pre-navigation-hook 1', 'pre-navigation-hook 2', 'post-navigation-hook 1', 'post-navigation-hook 2', 'final handler', ] async def test_isolation_cookies(http_client: HttpClient, server_url: URL) -> None: """Test isolation cookies for Session with curl""" sessions_ids: list[str] = [] sessions_cookies: dict[str, dict[str, str]] = {} response_cookies: dict[str, dict[str, str]] = {} crawler = HttpCrawler( session_pool=SessionPool( max_pool_size=1, create_session_settings={ 'max_error_score': 50, }, ), http_client=http_client, max_request_retries=10, concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1), ) @crawler.router.default_handler async def handler(context: HttpCrawlingContext) -> None: if not context.session: return sessions_ids.append(context.session.id) if context.request.unique_key not in {'1', '2'}: return sessions_cookies[context.session.id] = { cookie['name']: cookie['value'] for cookie in context.session.cookies.get_cookies_as_dicts() } response_data = json.loads(await context.http_response.read()) response_cookies[context.session.id] = response_data.get('cookies') if context.request.user_data.get('retire_session'): context.session.retire() await crawler.run( [ # The first request sets the cookie in the session str(server_url.with_path('set_cookies').extend_query(a=1)), # With the second request, we check the cookies in the session and set retire Request.from_url(str(server_url.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}), # The third request is made with a new session to make sure it does not use another session's cookies Request.from_url(str(server_url.with_path('/cookies')), unique_key='2'), ] ) assert len(sessions_cookies) == 2 assert len(response_cookies) == 2 assert sessions_ids[0] 
== sessions_ids[1] cookie_session_id = sessions_ids[0] clean_session_id = sessions_ids[2] assert cookie_session_id != clean_session_id # The initiated cookies must match in both the response and the session store assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'} # For a clean session, the cookie should not be in the session store or in the response # This way we can be sure that no cookies are being leaked through the http client assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {} async def test_store_complex_cookies(server_url: URL) -> None: visit = Mock() track_session_usage = Mock() async with SessionPool(max_pool_size=1) as session_pool: crawler = HttpCrawler(session_pool=session_pool) @crawler.router.default_handler async def handler(context: HttpCrawlingContext) -> None: visit(context.request.url) track_session_usage(context.session.id if context.session else None) await crawler.run([str(server_url / 'set_complex_cookies')]) visited = {call[0][0] for call in visit.call_args_list} assert len(visited) == 1 session_ids = {call[0][0] for call in track_session_usage.call_args_list} assert len(session_ids) == 1 session = await session_pool.get_session_by_id(session_ids.pop()) assert session is not None session_cookies_dict = {cookie['name']: cookie for cookie in session.cookies.get_cookies_as_dicts()} assert len(session_cookies_dict) == 6 # cookie string: 'basic=1; Path=/; HttpOnly; SameSite=Lax' assert session_cookies_dict['basic'] == { 'name': 'basic', 'value': '1', 'domain': server_url.host, 'path': '/', 'secure': False, 'http_only': True, 'same_site': 'Lax', } # cookie string: 'withpath=2; Path=/html; SameSite=None' assert session_cookies_dict['withpath'] == { 'name': 'withpath', 'value': '2', 'domain': server_url.host, 'path': '/html', 'secure': False, 'http_only': False, 'same_site': 'None', } # cookie string: 'strict=3; Path=/; SameSite=Strict' assert 
session_cookies_dict['strict'] == { 'name': 'strict', 'value': '3', 'domain': server_url.host, 'path': '/', 'secure': False, 'http_only': False, 'same_site': 'Strict', } # cookie string: 'secure=4; Path=/; HttpOnly; Secure; SameSite=Strict' assert session_cookies_dict['secure'] == { 'name': 'secure', 'value': '4', 'domain': server_url.host, 'path': '/', 'secure': True, 'http_only': True, 'same_site': 'Strict', } # cookie string: 'short=5; Path=/;' assert session_cookies_dict['short'] == { 'name': 'short', 'value': '5', 'domain': server_url.host, 'path': '/', 'secure': False, 'http_only': False, } # Some clients may ignore `.` at the beginning of the domain # https://www.rfc-editor.org/rfc/rfc6265#section-4.1.2.3 assert session_cookies_dict['domain'] == { 'name': 'domain', 'value': '6', 'domain': {server_url.host}, 'path': '/', 'secure': False, 'http_only': False, } or { 'name': 'domain', 'value': '6', 'domain': f'.{server_url.host}', 'path': '/', 'secure': False, 'http_only': False, } def test_default_logger() -> None: assert HttpCrawler().log.name == 'HttpCrawler' async def test_get_snapshot(server_url: URL) -> None: crawler = HttpCrawler() snapshot = None @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: nonlocal snapshot snapshot = await context.get_snapshot() await crawler.run([str(server_url)]) assert snapshot is not None assert snapshot.html is not None assert snapshot.html == HELLO_WORLD.decode('utf8') async def test_error_snapshot_through_statistics(server_url: URL) -> None: statistics = Statistics.with_default_state(save_error_snapshots=True) crawler = HttpCrawler(statistics=statistics) @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: raise RuntimeError(rf'Exception /\ with file name unfriendly symbols in {context.request.url}') await crawler.run([str(server_url)]) kvs = await crawler.get_key_value_store() kvs_content = {} async for key_info in 
kvs.iterate_keys(): # Skip any non-error snapshot keys, e.g. __RQ_STATE_. if 'ERROR_SNAPSHOT' not in key_info.key: continue kvs_content[key_info.key] = await kvs.get_value(key_info.key) # One error, three time retried. content_key = next(iter(kvs_content)) assert crawler.statistics.error_tracker.total == 4 assert crawler.statistics.error_tracker.unique_error_count == 1 assert len(kvs_content) == 1 assert content_key.endswith('.html') assert kvs_content[content_key] == HELLO_WORLD.decode('utf8') async def test_request_state(server_url: URL) -> None: queue = await RequestQueue.open(alias='http_request_state') crawler = HttpCrawler(request_manager=queue) success_request = Request.from_url(str(server_url)) assert success_request.state == RequestState.UNPROCESSED error_request = Request.from_url(str(server_url / 'error'), user_data={'cause_error': True}) requests_states: dict[str, dict[str, RequestState]] = {success_request.unique_key: {}, error_request.unique_key: {}} @crawler.pre_navigation_hook async def pre_navigation_hook(context: BasicCrawlingContext) -> None: requests_states[context.request.unique_key]['pre_navigation'] = context.request.state @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: if context.request.user_data.get('cause_error'): raise ValueError('Caused error as requested') requests_states[context.request.unique_key]['request_handler'] = context.request.state @crawler.error_handler async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None: requests_states[context.request.unique_key]['error_handler'] = context.request.state @crawler.failed_request_handler async def failed_request_handler(context: BasicCrawlingContext, _error: Exception) -> None: requests_states[context.request.unique_key]['failed_request_handler'] = context.request.state await crawler.run([success_request, error_request]) handled_success_request = await queue.get_request(success_request.unique_key) assert 
handled_success_request is not None assert handled_success_request.state == RequestState.DONE assert requests_states[success_request.unique_key] == { 'pre_navigation': RequestState.BEFORE_NAV, 'request_handler': RequestState.REQUEST_HANDLER, } handled_error_request = await queue.get_request(error_request.unique_key) assert handled_error_request is not None assert handled_error_request.state == RequestState.ERROR assert requests_states[error_request.unique_key] == { 'pre_navigation': RequestState.BEFORE_NAV, 'error_handler': RequestState.ERROR_HANDLER, 'failed_request_handler': RequestState.ERROR, } await queue.drop() ================================================ FILE: tests/unit/crawlers/_parsel/test_parsel_crawler.py ================================================ from __future__ import annotations import sys from typing import TYPE_CHECKING from unittest import mock import pytest from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction, SkippedReason from crawlee.crawlers import ParselCrawler from crawlee.storages import RequestQueue if TYPE_CHECKING: from yarl import URL from crawlee._request import RequestOptions from crawlee.crawlers import BasicCrawlingContext, ParselCrawlingContext from crawlee.http_clients._base import HttpClient async def test_basic(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(http_client=http_client) handler = mock.AsyncMock() @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: links = context.selector.css('a::attr(href)').getall() await handler(links) await crawler.run([str(server_url / 'start_enqueue')]) assert handler.called # The handler should find three links assert len(handler.call_args[0][0]) == 3 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None: redirect_target = str(server_url / 'start_enqueue') redirect_url = 
str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) requests = [redirect_url] crawler = ParselCrawler(http_client=http_client) visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: url = str(context.request.url) visit(url) await context.enqueue_links() await crawler.run(requests) expected_visit_calls = [ mock.call(redirect_url), mock.call(str(server_url / 'sub_index')), mock.call(str(server_url / 'page_1')), mock.call(str(server_url / 'page_2')), mock.call(str(server_url / 'page_3')), mock.call(str(server_url / 'page_4')), mock.call(str(server_url / 'base_page')), mock.call(str(server_url / 'base_subpath/page_5')), ] assert visit.mock_calls[0] == expected_visit_calls[0] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None: redirect_target = str(server_url / 'start_enqueue_non_href') redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) requests = [redirect_url] crawler = ParselCrawler(http_client=http_client) visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links(selector='img', attribute='src') await crawler.run(requests) expected_visit_calls = [ mock.call(redirect_url), mock.call(str(server_url / 'base_subpath/image_1')), mock.call(str(server_url / 'image_2')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None: """Call `enqueue_links` with arguments that can't be used together.""" crawler = ParselCrawler(max_request_retries=1) exceptions = [] @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: try: # Testing runtime enforcement of the overloads. 
await context.enqueue_links(requests=[Request.from_url(str(server_url / 'start_enqueue'))], selector='a') except Exception as e: exceptions.append(e) await crawler.run([str(server_url)]) assert len(exceptions) == 1 assert type(exceptions[0]) is ValueError async def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(http_client=http_client) visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links(selector='a.foo') await crawler.run([str(server_url / 'start_enqueue')]) expected_visit_calls = [ mock.call(str(server_url / 'start_enqueue')), mock.call(str(server_url / 'sub_index')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None: start_urls = [str(server_url / 'start_enqueue')] processed_urls = [] # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately crawler = ParselCrawler( concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1), max_requests_per_crawl=3, http_client=http_client, ) @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: await context.enqueue_links() processed_urls.append(context.request.url) stats = await crawler.run(start_urls) # Verify that only 3 out of the possible 5 requests were made assert len(processed_urls) == 3 assert stats.requests_total == 3 assert stats.requests_finished == 3 async def test_enqueue_links_with_transform_request_function(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(http_client=http_client) visit = mock.Mock() headers = [] def test_transform_request_function( request_options: RequestOptions, ) -> RequestOptions | RequestTransformAction: if 'page_3' in request_options['url']: return 'skip' request_options['headers'] = 
HttpHeaders({'transform-header': 'my-header'}) return request_options @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: visit(context.request.url) headers.append(context.request.headers) await context.enqueue_links(transform_request_function=test_transform_request_function, label='test') await crawler.run([str(server_url / 'start_enqueue')]) # url /page_3 should not be visited expected_visit_calls = [ mock.call(str(server_url / 'start_enqueue')), mock.call(str(server_url / 'sub_index')), mock.call(str(server_url / 'page_1')), mock.call(str(server_url / 'page_2')), mock.call(str(server_url / 'page_4')), mock.call(str(server_url / 'base_page')), mock.call(str(server_url / 'base_subpath/page_5')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) # all urls added to `enqueue_links` must have a custom header assert headers[1]['transform-header'] == 'my-header' assert headers[2]['transform-header'] == 'my-header' assert headers[3]['transform-header'] == 'my-header' async def test_handle_blocked_request(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(max_session_rotations=1, http_client=http_client) stats = await crawler.run([str(server_url / 'incapsula')]) assert stats.requests_failed == 1 async def test_handle_blocked_status_code(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(max_session_rotations=1, http_client=http_client) # Patch internal calls and run crawler with ( mock.patch.object( crawler._statistics, 'record_request_processing_failure', wraps=crawler._statistics.record_request_processing_failure, ) as record_request_processing_failure, mock.patch.object( crawler._statistics.error_tracker, 'add', wraps=crawler._statistics.error_tracker.add ) as error_tracker_add, ): stats = await crawler.run([str(server_url / 'status/403')]) assert stats.requests_failed == 1 assert record_request_processing_failure.called assert error_tracker_add.called assert 
crawler._statistics.error_tracker.total == 1 # TODO: Remove the skip mark when the test is fixed: # https://github.com/apify/crawlee-python/issues/838 @pytest.mark.skip(reason='The test does not work with `crawlee._utils.try_import.ImportWrapper`.') def test_import_error_handled() -> None: # Simulate ImportError for parsel with mock.patch.dict('sys.modules', {'parsel': None}): # Invalidate ParselCrawler import sys.modules.pop('crawlee.crawlers', None) sys.modules.pop('crawlee.crawlers._parsel', None) with pytest.raises(ImportError) as import_error: from crawlee.crawlers import ParselCrawler # noqa: F401 PLC0415 # Check if the raised ImportError contains the expected message assert str(import_error.value) == ( "To import this, you need to install the 'parsel' extra." "For example, if you use pip, run `pip install 'crawlee[parsel]'`." ) async def test_json(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(http_client=http_client) handler = mock.AsyncMock() @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: result = context.selector.jmespath('hello').getall() await handler(result) await crawler.run([str(server_url / 'json')]) assert handler.called assert handler.call_args[0][0] == ['world'] async def test_xml(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(http_client=http_client) handler = mock.AsyncMock() @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: result = context.selector.css('hello').getall() await handler(result) await crawler.run([str(server_url / 'xml')]) assert handler.called assert handler.call_args[0][0] == ['world'] def test_default_logger() -> None: assert ParselCrawler().log.name == 'ParselCrawler' async def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True) visit = mock.Mock() 
@crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links() await crawler.run([str(server_url / 'start_enqueue')]) expected_visit_calls = [ mock.call(str(server_url / 'start_enqueue')), mock.call(str(server_url / 'sub_index')), mock.call(str(server_url / 'base_page')), mock.call(str(server_url / 'base_subpath/page_5')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None: """Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt.""" visit = mock.Mock() fail = mock.Mock() crawler = ParselCrawler( http_client=http_client, respect_robots_txt_file=True, max_request_retries=0, ) @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links(strategy='all') @crawler.failed_request_handler async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None: fail(context.request.url) await crawler.run([str(server_url / 'problematic_links')]) # Email must be skipped # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler. expected_visit_calls = [ mock.call(str(server_url / 'problematic_links')), mock.call('https://avatars.githubusercontent.com/apify'), ] visit.assert_has_calls(expected_visit_calls, any_order=True) # The budplaceholder.com does not exist. 
expected_fail_calls = [ mock.call('https://budplaceholder.com/'), ] fail.assert_has_calls(expected_fail_calls, any_order=True) async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True) skip = mock.Mock() @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: await context.enqueue_links() @crawler.on_skipped_request async def skipped_hook(url: str, _reason: SkippedReason) -> None: skip(url) await crawler.run([str(server_url / 'start_enqueue')]) expected_skip_calls = [ mock.call(str(server_url / 'page_1')), mock.call(str(server_url / 'page_2')), mock.call(str(server_url / 'page_3')), mock.call(str(server_url / 'page_4')), ] skip.assert_has_calls(expected_skip_calls, any_order=True) async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(http_client=http_client) extracted_links: list[str] = [] @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: links = await context.extract_links(exclude=[Glob(f'{server_url}sub_index')]) extracted_links.extend(request.url for request in links) await crawler.run([str(server_url / 'start_enqueue')]) assert len(extracted_links) == 1 assert extracted_links[0] == str(server_url / 'page_1') async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None: crawler = ParselCrawler(http_client=http_client) extracted_links: list[str] = [] @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: links = await context.extract_links(selector='li', attribute='data-href') extracted_links.extend(request.url for request in links) await crawler.run([str(server_url / 'non_href_links')]) assert len(extracted_links) == 1 assert extracted_links[0] == str(server_url / 'page_2') @pytest.mark.parametrize( ('queue_name', 'queue_alias', 'by_id'), [ 
pytest.param('named-queue', None, False, id='with rq_name'), pytest.param(None, 'alias-queue', False, id='with rq_alias'), pytest.param('id-queue', None, True, id='with rq_id'), ], ) async def test_enqueue_links_with_rq_param( server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool ) -> None: crawler = ParselCrawler(http_client=http_client) rq = await RequestQueue.open(name=queue_name, alias=queue_alias) if by_id: queue_name = None queue_id = rq.id else: queue_id = None visit_urls: set[str] = set() @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: visit_urls.add(context.request.url) await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run([str(server_url / 'start_enqueue')]) requests_from_queue: list[str] = [] while request := await rq.fetch_next_request(): requests_from_queue.append(request.url) assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')} assert visit_urls == {str(server_url / 'start_enqueue')} await rq.drop() @pytest.mark.parametrize( ('queue_name', 'queue_alias', 'by_id'), [ pytest.param('named-queue', None, False, id='with rq_name'), pytest.param(None, 'alias-queue', False, id='with rq_alias'), pytest.param('id-queue', None, True, id='with rq_id'), ], ) async def test_enqueue_links_requests_with_rq_param( server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool ) -> None: crawler = ParselCrawler(http_client=http_client) rq = await RequestQueue.open(name=queue_name, alias=queue_alias) if by_id: queue_name = None queue_id = rq.id else: queue_id = None visit_urls: set[str] = set() check_requests: list[str] = [ 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', ] @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: visit_urls.add(context.request.url) await 
context.enqueue_links( requests=check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias, strategy='all' ) await crawler.run([str(server_url / 'start_enqueue')]) requests_from_queue: list[str] = [] while request := await rq.fetch_next_request(): requests_from_queue.append(request.url) assert set(requests_from_queue) == set(check_requests) assert visit_urls == {str(server_url / 'start_enqueue')} await rq.drop() @pytest.mark.parametrize( ('queue_id', 'queue_name', 'queue_alias'), [ pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'), pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'), pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'), pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'), ], ) async def test_enqueue_links_error_with_multi_params( server_url: URL, http_client: HttpClient, queue_id: str | None, queue_name: str | None, queue_alias: str | None ) -> None: crawler = ParselCrawler(http_client=http_client) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run([str(server_url / 'start_enqueue')]) async def test_enqueue_links_with_limit(server_url: URL, http_client: HttpClient) -> None: start_url = str(server_url / 'sub_index') requests = [start_url] crawler = ParselCrawler(http_client=http_client) visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links(limit=1) await crawler.run(requests) # Only one link should be enqueued from sub_index due to the limit expected_visit_calls = [ mock.call(start_url), mock.call(str(server_url / 'page_3')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) 
================================================ FILE: tests/unit/crawlers/_playwright/test_playwright_crawler.py ================================================ from __future__ import annotations import asyncio import json import logging from datetime import timedelta from typing import TYPE_CHECKING, Any, Literal from unittest import mock from unittest.mock import AsyncMock, Mock import pytest from crawlee import ( ConcurrencySettings, Glob, HttpHeaders, Request, RequestState, RequestTransformAction, SkippedReason, service_locator, ) from crawlee.configuration import Configuration from crawlee.crawlers import PlaywrightCrawler from crawlee.fingerprint_suite import ( DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions, ScreenOptions, ) from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type from crawlee.http_clients import ImpitHttpClient from crawlee.proxy_configuration import ProxyConfiguration from crawlee.sessions import Session, SessionPool from crawlee.statistics import Statistics from crawlee.statistics._error_snapshotter import ErrorSnapshotter from crawlee.storages import RequestQueue from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD if TYPE_CHECKING: from pathlib import Path from yarl import URL from crawlee._request import RequestOptions from crawlee._types import HttpMethod, HttpPayload from crawlee.browsers._types import BrowserType from crawlee.crawlers import ( BasicCrawlingContext, PlaywrightCrawlingContext, PlaywrightPostNavCrawlingContext, PlaywrightPreNavCrawlingContext, ) @pytest.mark.parametrize( ('method', 'path', 'payload'), [ pytest.param('GET', 'get', None, id='get request'), pytest.param('POST', 'post', None, id='post request'), pytest.param('POST', 'post', b'Hello, world!', id='post 
request with payload'), ], ) async def test_basic_request(method: HttpMethod, path: str, payload: HttpPayload, server_url: URL) -> None: requests = [Request.from_url(str(server_url / path), method=method, payload=payload)] crawler = PlaywrightCrawler() result: dict = {} @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: assert context.page is not None result['request_url'] = context.request.url result['page_url'] = context.page.url result['page_content'] = await context.page.content() await crawler.run(requests) assert result.get('request_url') == result.get('page_url') == requests[0].url assert (payload.decode() if payload else '') in result.get('page_content', '') async def test_enqueue_links(redirect_server_url: URL, server_url: URL) -> None: redirect_target = str(server_url / 'start_enqueue') redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) requests = [redirect_url] crawler = PlaywrightCrawler() visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links() await crawler.run(requests) expected_visit_calls = [ mock.call(redirect_url), mock.call(str(server_url / 'sub_index')), mock.call(str(server_url / 'page_1')), mock.call(str(server_url / 'page_2')), mock.call(str(server_url / 'page_3')), mock.call(str(server_url / 'page_4')), mock.call(str(server_url / 'base_page')), mock.call(str(server_url / 'base_subpath/page_5')), ] assert visit.mock_calls[0] == expected_visit_calls[0] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL) -> None: redirect_target = str(server_url / 'start_enqueue_non_href') redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) requests = [redirect_url] crawler = PlaywrightCrawler() visit = mock.Mock() 
@crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links(selector='img', attribute='src') await crawler.run(requests) expected_visit_calls = [ mock.call(redirect_url), mock.call(str(server_url / 'base_subpath/image_1')), mock.call(str(server_url / 'image_2')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None: """Call `enqueue_links` with arguments that can't be used together.""" crawler = PlaywrightCrawler(max_request_retries=1) exceptions = [] @crawler.pre_navigation_hook async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None: await context.page.route('**/*', lambda route: route.fulfill(status=200)) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: try: # Testing runtime enforcement of the overloads. await context.enqueue_links(requests=[Request.from_url('https://www.whatever.com')], selector='a') except Exception as e: exceptions.append(e) await crawler.run([str(server_url)]) assert len(exceptions) == 1 assert type(exceptions[0]) is ValueError async def test_enqueue_links_with_transform_request_function(server_url: URL) -> None: crawler = PlaywrightCrawler() visit = mock.Mock() headers = [] def test_transform_request_function(request: RequestOptions) -> RequestOptions | RequestTransformAction: if request['url'] == str(server_url / 'sub_index'): request['headers'] = HttpHeaders({'transform-header': 'my-header'}) return request return 'skip' @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: visit(context.request.url) headers.append(context.request.headers) await context.enqueue_links(transform_request_function=test_transform_request_function) await crawler.run([str(server_url / 'start_enqueue')]) expected_visit_calls = [ 
mock.call(str(server_url / 'start_enqueue')), mock.call(str(server_url / 'sub_index')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) # all urls added to `enqueue_links` must have a custom header assert headers[1]['transform-header'] == 'my-header' async def test_nonexistent_url_invokes_error_handler() -> None: crawler = PlaywrightCrawler(max_request_retries=3, request_handler=mock.AsyncMock()) error_handler = mock.AsyncMock(return_value=None) crawler.error_handler(error_handler) failed_handler = mock.AsyncMock(return_value=None) crawler.failed_request_handler(failed_handler) await crawler.run(['https://this-does-not-exist-22343434.com']) assert error_handler.call_count == 3 assert failed_handler.call_count == 1 async def test_redirect_handling(server_url: URL, redirect_server_url: URL) -> None: # Set up a dummy crawler that tracks visited URLs crawler = PlaywrightCrawler() handled_urls = set[str]() redirect_target = str(server_url / 'start_enqueue') redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target)) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: handled_urls.add(context.request.loaded_url or '') # Request with redirects request = Request.from_url(url=redirect_url) # Ensure that the request uses the same origin strategy - `redirect_target` will be considered out of scope request.crawlee_data.enqueue_strategy = 'same-origin' # No URLs should be visited in the run await crawler.run([request]) assert handled_urls == set() @pytest.mark.parametrize( 'fingerprint_generator', [ pytest.param(None, id='No fingerprint generator. 
Headers generated by header generator.'), pytest.param( DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chrome'])), id='Explicitly passed fingerprint generator.', ), pytest.param('default', id='Default fingerprint generator.'), ], ) async def test_chromium_headless_headers( header_network: dict, fingerprint_generator: None | FingerprintGenerator | Literal['default'], server_url: URL ) -> None: browser_type: BrowserType = 'chromium' crawler = PlaywrightCrawler(headless=True, browser_type=browser_type, fingerprint_generator=fingerprint_generator) headers = dict[str, str]() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: response = await context.response.text() response_headers = json.loads(response) for key, val in response_headers.items(): headers[key] = val await crawler.run([str(server_url / 'headers')]) user_agent = headers.get('user-agent') assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'}), user_agent assert any( keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[fingerprint_browser_type_from_playwright_browser_type(browser_type)] ), user_agent assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua') assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile') assert headers.get('sec-ch-ua-platform') in get_available_header_values(header_network, 'sec-ch-ua-platform') assert 'headless' not in headers['sec-ch-ua'].lower() assert 'headless' not in headers['user-agent'].lower() @pytest.mark.flaky(reruns=3, reason='Test is flaky.') async def test_firefox_headless_headers(header_network: dict, server_url: URL) -> None: browser_type: BrowserType = 'firefox' crawler = PlaywrightCrawler(headless=True, browser_type=browser_type) headers = dict[str, str]() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> 
None: response = await context.response.text() response_headers = json.loads(response) for key, val in response_headers.items(): headers[key] = val await crawler.run([str(server_url / 'headers')]) assert 'user-agent' in headers assert 'sec-ch-ua' not in headers assert 'sec-ch-ua-mobile' not in headers assert 'sec-ch-ua-platform' not in headers assert 'headless' not in headers['user-agent'].lower() user_agent = headers.get('user-agent') assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'}) assert any( keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[fingerprint_browser_type_from_playwright_browser_type(browser_type)] ) async def test_custom_headers(server_url: URL) -> None: crawler = PlaywrightCrawler() response_headers = dict[str, str]() request_headers = {'Power-Header': 'ring', 'Library': 'storm', 'My-Test-Header': 'fuzz'} @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: response = await context.response.text() context_response_headers = json.loads(response) for key, val in context_response_headers.items(): response_headers[key] = val await crawler.run([Request.from_url(str(server_url / 'headers'), headers=request_headers)]) assert response_headers.get('power-header') == request_headers['Power-Header'] assert response_headers.get('library') == request_headers['Library'] assert response_headers.get('my-test-header') == request_headers['My-Test-Header'] async def test_pre_navigation_hook() -> None: crawler = PlaywrightCrawler(request_handler=mock.AsyncMock()) visit = mock.Mock() @crawler.pre_navigation_hook async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None: visit() await context.page.route('**/*', lambda route: route.fulfill(status=200)) await crawler.run(['https://test.com', 'https://test.io']) assert visit.call_count == 2 async def test_proxy_set() -> None: # Configure crawler with proxy settings proxy_value = 'http://1111:1111' 
crawler = PlaywrightCrawler(proxy_configuration=ProxyConfiguration(proxy_urls=[proxy_value])) handler_data = {} mock_handler = mock.AsyncMock(return_value=None) crawler.router.default_handler(mock_handler) # Use pre_navigation_hook to verify proxy and configure playwright route @crawler.pre_navigation_hook async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None: if context.proxy_info: # Store information about the used proxy handler_data['proxy'] = context.proxy_info.url # Emulate server response to prevent Playwright from making real requests await context.page.route('**/*', lambda route: route.fulfill(status=200)) await crawler.run(['https://test.com']) assert handler_data.get('proxy') == proxy_value @pytest.mark.run_alone @pytest.mark.parametrize( 'use_incognito_pages', [ pytest.param(False, id='without use_incognito_pages'), pytest.param(True, id='with use_incognito_pages'), ], ) async def test_isolation_cookies(*, use_incognito_pages: bool, server_url: URL) -> None: sessions_ids: list[str] = [] sessions: dict[str, Session] = {} sessions_cookies: dict[str, dict[str, str]] = {} response_cookies: dict[str, dict[str, str]] = {} crawler = PlaywrightCrawler( session_pool=SessionPool(max_pool_size=1), use_incognito_pages=use_incognito_pages, concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1), ) @crawler.router.default_handler async def handler(context: PlaywrightCrawlingContext) -> None: if not context.session: return sessions_ids.append(context.session.id) sessions[context.session.id] = context.session if context.request.unique_key == '1': # With the second request, we check the cookies in the session and set retire await context.add_requests( [ Request.from_url( str(server_url.with_path('/cookies')), unique_key='2', user_data={'retire_session': True} ) ] ) return response_data = json.loads(await context.response.text()) response_cookies[context.session.id] = response_data.get('cookies') if 
context.request.user_data.get('retire_session'): context.session.retire() if context.request.unique_key == '2': # The third request is made with a new session to make sure it does not use another session's cookies await context.add_requests([Request.from_url(str(server_url.with_path('/cookies')), unique_key='3')]) await crawler.run( [ # The first request sets the cookie in the session Request.from_url(str(server_url.with_path('set_cookies').extend_query(a=1)), unique_key='1'), ] ) assert len(response_cookies) == 2 assert len(sessions) == 2 assert sessions_ids[0] == sessions_ids[1] sessions_cookies = { sessions_id: { cookie['name']: cookie['value'] for cookie in sessions[sessions_id].cookies.get_cookies_as_dicts() } for sessions_id in sessions_ids } assert len(sessions_cookies) == 2 cookie_session_id = sessions_ids[0] clean_session_id = sessions_ids[2] assert cookie_session_id != clean_session_id # When using `use_incognito_pages` there should be full cookie isolation if use_incognito_pages: # The initiated cookies must match in both the response and the session store assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'} # For a clean session, the cookie should not be in the sesstion store or in the response # This way we can be sure that no cookies are being leaked through the http client assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {} # Without `use_incognito_pages` we will have access to the session cookie, # but there will be a cookie leak via PlaywrightContext else: # The initiated cookies must match in both the response and the session store assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'} # PlaywrightContext makes cookies shared by all sessions that work with it. 
# So in this case a clean session contains the same cookies assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {'a': '1'} async def test_save_cookies_after_handler_processing(server_url: URL) -> None: """Test that cookies are saved correctly.""" async with SessionPool(max_pool_size=1) as session_pool: crawler = PlaywrightCrawler(session_pool=session_pool) session_ids = [] @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: # Simulate cookies installed from an external source in the browser await context.page.context.add_cookies([{'name': 'check', 'value': 'test', 'url': str(server_url)}]) if context.session: session_ids.append(context.session.id) await crawler.run([str(server_url)]) assert len(session_ids) == 1 check_session = await session_pool.get_session() assert check_session.id == session_ids[0] session_cookies = {cookie['name']: cookie['value'] for cookie in check_session.cookies.get_cookies_as_dicts()} assert session_cookies == {'check': 'test'} async def test_read_write_cookies(server_url: URL) -> None: """Test that cookies are reloaded correctly.""" async with SessionPool(max_pool_size=1) as session_pool: crawler = PlaywrightCrawler(session_pool=session_pool) playwright_cookies = [] session_cookies = [] # Check that no errors occur when reading and writing cookies. 
@crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: cookies = await context.page.context.cookies() playwright_cookies.extend(cookies) if context.session: context.session.cookies.set_cookies_from_playwright_format(cookies) session_cookies.extend(context.session.cookies.get_cookies_as_dicts()) await crawler.run([str(server_url / 'set_complex_cookies')]) # Check that the cookie was received with `partitionKey` assert any('partitionKey' in cookie for cookie in playwright_cookies) assert len(playwright_cookies) == len(session_cookies) async def test_custom_fingerprint_uses_generator_options(server_url: URL) -> None: min_width = 300 max_width = 600 min_height = 500 max_height = 1200 fingerprint_generator = DefaultFingerprintGenerator( header_options=HeaderGeneratorOptions(browsers=['firefox'], operating_systems=['android']), screen_options=ScreenOptions( min_width=min_width, max_width=max_width, min_height=min_height, max_height=max_height ), ) crawler = PlaywrightCrawler(headless=True, fingerprint_generator=fingerprint_generator) fingerprints = dict[str, Any]() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: for relevant_key in ( 'window.navigator.userAgent', 'window.navigator.userAgentData', 'window.screen.height', 'window.screen.width', ): fingerprints[relevant_key] = await context.page.evaluate(f'()=>{relevant_key}') await crawler.run([str(server_url)]) assert 'Firefox' in fingerprints['window.navigator.userAgent'] assert fingerprints['window.navigator.userAgentData']['platform'] == 'Android' assert min_width <= int(fingerprints['window.screen.width']) <= max_width assert min_height <= int(fingerprints['window.screen.height']) <= max_height async def test_custom_fingerprint_matches_header_user_agent(server_url: URL) -> None: """Test that generated fingerprint and header have matching user agent.""" crawler = PlaywrightCrawler(headless=True, 
fingerprint_generator=DefaultFingerprintGenerator()) response_headers = dict[str, str]() fingerprints = dict[str, str]() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: response = await context.response.text() context_response_headers = dict(json.loads(response)) response_headers['User-Agent'] = context_response_headers['user-agent'] fingerprints['window.navigator.userAgent'] = await context.page.evaluate('()=>window.navigator.userAgent') await crawler.run([str(server_url / 'headers')]) assert response_headers['User-Agent'] == fingerprints['window.navigator.userAgent'] async def test_ignore_http_error_status_codes(server_url: URL) -> None: """Test that error codes that would normally trigger session error can be ignored.""" crawler = PlaywrightCrawler(ignore_http_error_status_codes={403}) target_url = str(server_url / 'status/403') mocked_handler = Mock() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: mocked_handler(context.request.url) await crawler.run([target_url]) mocked_handler.assert_called_once_with(target_url) async def test_additional_http_error_status_codes(server_url: URL) -> None: """Test that use of `additional_http_error_status_codes` can raise error on common status code.""" crawler = PlaywrightCrawler(additional_http_error_status_codes={200}) mocked_handler = Mock() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: mocked_handler(context.request.url) await crawler.run([str(server_url)]) mocked_handler.assert_not_called() async def test_launch_with_user_data_dir(tmp_path: Path, server_url: URL) -> None: """Check that the persist context is created in the specified folder in `user_data_dir`.""" check_path = tmp_path / 'Default' crawler = PlaywrightCrawler( headless=True, user_data_dir=tmp_path, request_handler=mock.AsyncMock(return_value=None) ) assert not check_path.exists() await 
crawler.run([str(server_url)]) assert check_path.exists() async def test_launch_with_user_data_dir_and_fingerprint(tmp_path: Path, server_url: URL) -> None: """Check that the persist context works with fingerprints.""" check_path = tmp_path / 'Default' fingerprints = dict[str, str]() crawler = PlaywrightCrawler( headless=True, user_data_dir=tmp_path, request_handler=mock.AsyncMock(return_value=None), fingerprint_generator=DefaultFingerprintGenerator(), ) @crawler.pre_navigation_hook async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None: fingerprints['window.navigator.userAgent'] = await context.page.evaluate('()=>window.navigator.userAgent') assert not check_path.exists() await crawler.run([str(server_url)]) assert check_path.exists() assert fingerprints['window.navigator.userAgent'] assert 'headless' not in fingerprints['window.navigator.userAgent'].lower() async def test_get_snapshot(server_url: URL) -> None: crawler = PlaywrightCrawler() snapshot = None @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: nonlocal snapshot snapshot = await context.get_snapshot() await crawler.run([str(server_url)]) assert snapshot is not None assert snapshot.html is not None assert snapshot.screenshot is not None # Check at least jpeg start and end expected bytes. Content is not relevant for the test. assert snapshot.screenshot.startswith(b'\xff\xd8') assert snapshot.screenshot.endswith(b'\xff\xd9') assert snapshot.html == HELLO_WORLD.decode('utf-8') async def test_error_snapshot_through_statistics(server_url: URL) -> None: """Test correct use of error snapshotter by the Playwright crawler. In this test the crawler will visit 4 pages. 
- 2 x page endpoints will return the same error - homepage endpoint will return unique error - headers endpoint will return no error """ max_retries = 2 crawler = PlaywrightCrawler( statistics=Statistics.with_default_state(save_error_snapshots=True), max_request_retries=max_retries ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: if 'page' in context.request.url: raise RuntimeError('page error') if 'headers' in context.request.url: return raise RuntimeError('home error') await crawler.run( [str(server_url), str(server_url / 'page_1'), str(server_url / 'page_2'), str(server_url / 'headers')] ) kvs = await crawler.get_key_value_store() kvs_content = {} async for key_info in kvs.iterate_keys(): # Skip any non-error snapshot keys, e.g. __RQ_STATE_. if 'ERROR_SNAPSHOT' not in key_info.key: continue kvs_content[key_info.key] = await kvs.get_value(key_info.key) assert set(key_info.key).issubset(ErrorSnapshotter.ALLOWED_CHARACTERS) if key_info.key.endswith('.jpg'): # Check at least jpeg start and end expected bytes. Content is not relevant for the test. assert kvs_content[key_info.key].startswith(b'\xff\xd8') assert kvs_content[key_info.key].endswith(b'\xff\xd9') elif 'page' in key_info.key: assert kvs_content[key_info.key] == GENERIC_RESPONSE.decode('utf-8') else: assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf-8') # Three errors twice retried errors, but only 2 unique -> 4 (2 x (html and jpg)) artifacts expected. 
assert crawler.statistics.error_tracker.total == 3 * (max_retries + 1) assert crawler.statistics.error_tracker.unique_error_count == 2 assert len(list(kvs_content.keys())) == 4 async def test_respect_robots_txt(server_url: URL) -> None: crawler = PlaywrightCrawler(respect_robots_txt_file=True) visit = mock.Mock() @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links() await crawler.run([str(server_url / 'start_enqueue')]) expected_visit_calls = [ mock.call(str(server_url / 'start_enqueue')), mock.call(str(server_url / 'sub_index')), mock.call(str(server_url / 'base_page')), mock.call(str(server_url / 'base_subpath/page_5')), ] visit.assert_has_calls(expected_visit_calls, any_order=True) async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None: """Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt.""" visit = mock.Mock() fail = mock.Mock() crawler = PlaywrightCrawler( respect_robots_txt_file=True, max_request_retries=0, ) @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: visit(context.request.url) await context.enqueue_links(strategy='all') @crawler.failed_request_handler async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None: fail(context.request.url) await crawler.run([str(server_url / 'problematic_links')]) # Email must be skipped # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler. expected_visit_calls = [ mock.call(str(server_url / 'problematic_links')), mock.call('https://avatars.githubusercontent.com/apify'), ] visit.assert_has_calls(expected_visit_calls, any_order=True) # The budplaceholder.com does not exist. 
# (tail of the test function that begins on the previous line)
    expected_fail_calls = [
        mock.call('https://budplaceholder.com/'),
    ]
    fail.assert_has_calls(expected_fail_calls, any_order=True)


async def test_on_skipped_request(server_url: URL) -> None:
    """Check that the `on_skipped_request` hook is called with the `page_1`..`page_4` URLs.

    The crawler is created with `respect_robots_txt_file=True`; presumably those pages are disallowed by the test
    server's robots.txt - confirm against the server fixture.
    """
    crawler = PlaywrightCrawler(respect_robots_txt_file=True)
    skip = mock.Mock()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        await context.enqueue_links()

    @crawler.on_skipped_request
    async def skipped_hook(url: str, _reason: SkippedReason) -> None:
        skip(url)

    await crawler.run([str(server_url / 'start_enqueue')])

    expected_skip_calls = [
        mock.call(str(server_url / 'page_1')),
        mock.call(str(server_url / 'page_2')),
        mock.call(str(server_url / 'page_3')),
        mock.call(str(server_url / 'page_4')),
    ]
    skip.assert_has_calls(expected_skip_calls, any_order=True)


async def test_send_request(server_url: URL) -> None:
    """Check that `send_request` returns the same payload in both navigation hooks and in the request handler.

    Each result is also compared with the JSON body of the page response itself.
    """
    check_data: dict[str, Any] = {}

    crawler = PlaywrightCrawler()

    @crawler.pre_navigation_hook
    async def pre_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        send_request_response = await context.send_request(str(server_url / 'user-agent'))
        check_data['pre_send_request'] = dict(json.loads(await send_request_response.read()))

    @crawler.post_navigation_hook
    async def post_hook(context: PlaywrightPostNavCrawlingContext) -> None:
        send_request_response = await context.send_request(str(server_url / 'user-agent'))
        check_data['post_send_request'] = dict(json.loads(await send_request_response.read()))

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        response = await context.response.text()
        check_data['default'] = dict(json.loads(response))
        send_request_response = await context.send_request(str(server_url / 'user-agent'))
        check_data['send_request'] = dict(json.loads(await send_request_response.read()))

    await crawler.run([str(server_url / 'user-agent')])

    assert check_data['default'].get('user-agent') is not None
    assert check_data['send_request'].get('user-agent') is not None
    assert check_data['pre_send_request'] == check_data['send_request']
    assert check_data['post_send_request'] == check_data['send_request']
    assert check_data['default'] == check_data['send_request']


async def test_send_request_with_client(server_url: URL) -> None:
    """Check that `send_request` uses the HTTP client passed to the crawler.

    The client is configured with a custom User-Agent, so the `send_request` payload must carry that value and must
    differ from the payload of the page response.
    """
    check_data: dict[str, Any] = {}

    crawler = PlaywrightCrawler(http_client=ImpitHttpClient(headers={'user-agent': 'My User-Agent'}))

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        response = await context.response.text()
        check_data['default'] = dict(json.loads(response))
        send_request_response = await context.send_request(str(server_url / 'user-agent'))
        check_data['send_request'] = dict(json.loads(await send_request_response.read()))

    await crawler.run([str(server_url / 'user-agent')])

    assert check_data['default'].get('user-agent') is not None
    assert check_data['send_request']['user-agent'] == 'My User-Agent'
    assert check_data['default'] != check_data['send_request']


async def test_passing_configuration() -> None:
    """Check that a configuration can be passed to the `PlaywrightCrawler` without replacing the global one."""
    service_locator.set_configuration(Configuration(log_level='INFO'))
    configuration = Configuration(log_level='WARNING')

    crawler = PlaywrightCrawler(configuration=configuration)

    # The global locator keeps its own configuration; the crawler's locator holds the one passed in.
    assert service_locator.get_configuration().log_level == 'INFO'
    assert crawler._service_locator.get_configuration().log_level == 'WARNING'


async def test_extract_links(server_url: URL) -> None:
    """Check that `extract_links` applies the `exclude` glob and returns only the `page_1` link."""
    crawler = PlaywrightCrawler()
    extracted_links: list[str] = []

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        links = await context.extract_links(exclude=[Glob(f'{server_url}sub_index')])
        extracted_links.extend(request.url for request in links)

    await crawler.run([str(server_url / 'start_enqueue')])

    assert len(extracted_links) == 1
    assert extracted_links[0] == str(server_url / 'page_1')


async def test_extract_non_href_links(server_url: URL) -> None:
    """Check that `extract_links` can read URLs from a custom selector and attribute (`li`, `data-href`)."""
    crawler = PlaywrightCrawler()
    extracted_links: list[str] = []

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        links = await context.extract_links(selector='li', attribute='data-href')
        extracted_links.extend(request.url for request in links)

    await crawler.run([str(server_url / 'non_href_links')])

    assert len(extracted_links) == 1
    assert extracted_links[0] == str(server_url / 'page_2')


async def test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.LogCaptureFixture) -> None:
    """Check that the retry log for a failed Playwright navigation is a single-line summary."""
    caplog.set_level(logging.INFO)
    crawler = PlaywrightCrawler(configure_logging=False)
    non_existent_page = 'https://totally-non-existing-site.com/blablablba'

    # Capture all logs from the 'crawlee' logger at INFO level or higher
    with caplog.at_level(logging.INFO, logger='crawlee'):
        await crawler.run([Request.from_url(non_existent_page)])

    expected_summarized_log = (
        f'Retrying request to {non_existent_page} due to: Page.goto: net::ERR_NAME_NOT_RESOLVED at {non_existent_page}'
    )

    # Find the Playwright specific error message in the logs
    found_playwright_message = False
    for record in caplog.records:
        if record.message and expected_summarized_log in record.message:
            # Neither the message nor the attached exception text may span multiple lines.
            full_message = (record.message or '') + (record.exc_text or '')
            assert '\n' not in full_message
            found_playwright_message = True
            break

    assert found_playwright_message, 'Expected log message about request handler error was not found.'
@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'by_id'),
    [
        pytest.param('named-queue', None, False, id='with rq_name'),
        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
        pytest.param('id-queue', None, True, id='with rq_id'),
    ],
)
async def test_enqueue_links_with_rq_param(
    server_url: URL, queue_name: str | None, queue_alias: str | None, *, by_id: bool
) -> None:
    """Check that `enqueue_links` puts page links into the queue selected by `rq_id`, `rq_name` or `rq_alias`.

    The links must end up in that queue and must not be visited by the crawler itself.
    """
    crawler = PlaywrightCrawler()
    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)
    # For the `by_id` case the queue is opened by name, but addressed only through its id afterwards.
    if by_id:
        queue_name = None
        queue_id = rq.id
    else:
        queue_id = None
    visit_urls: set[str] = set()

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        visit_urls.add(context.request.url)
        await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)

    await crawler.run([str(server_url / 'start_enqueue')])

    # Drain the target queue to see what was enqueued into it.
    requests_from_queue: list[str] = []
    while request := await rq.fetch_next_request():
        requests_from_queue.append(request.url)

    assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')}
    assert visit_urls == {str(server_url / 'start_enqueue')}

    await rq.drop()


@pytest.mark.parametrize(
    ('queue_name', 'queue_alias', 'by_id'),
    [
        pytest.param('named-queue', None, False, id='with rq_name'),
        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
        pytest.param('id-queue', None, True, id='with rq_id'),
    ],
)
async def test_enqueue_links_requests_with_rq_param(
    server_url: URL, queue_name: str | None, queue_alias: str | None, *, by_id: bool
) -> None:
    """Check that `enqueue_links(requests=...)` puts the given URLs into the queue selected by the `rq_*` argument."""
    crawler = PlaywrightCrawler()
    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)
    # For the `by_id` case the queue is opened by name, but addressed only through its id afterwards.
    if by_id:
        queue_name = None
        queue_id = rq.id
    else:
        queue_id = None
    visit_urls: set[str] = set()

    check_requests: list[str] = [
        'https://a.placeholder.com',
        'https://b.placeholder.com',
        'https://c.placeholder.com',
    ]

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        visit_urls.add(context.request.url)
        await context.enqueue_links(
            requests=check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias, strategy='all'
        )

    await crawler.run([str(server_url / 'start_enqueue')])

    # Drain the target queue to see what was enqueued into it.
    requests_from_queue: list[str] = []
    while request := await rq.fetch_next_request():
        requests_from_queue.append(request.url)

    assert set(requests_from_queue) == set(check_requests)
    assert visit_urls == {str(server_url / 'start_enqueue')}

    await rq.drop()


# NOTE(review): values bind positionally to (`queue_id`, `queue_name`, `queue_alias`), so e.g. the case labelled
# 'rq_name and rq_alias' actually sets `rq_id` and `rq_name`. Confirm whether the column order or the ids are intended.
@pytest.mark.parametrize(
    ('queue_id', 'queue_name', 'queue_alias'),
    [
        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),
        pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'),
        pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),
        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'),
    ],
)
async def test_enqueue_links_error_with_multi_params(
    server_url: URL, queue_id: str | None, queue_name: str | None, queue_alias: str | None
) -> None:
    """Check that `enqueue_links` raises `ValueError` when more than one of the `rq_*` arguments is given."""
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        # NOTE(review): this check runs inside the handler; verify that a failing check really fails the test
        # instead of being handled by the crawler as an ordinary request error.
        with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'):
            await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)

    await crawler.run([str(server_url / 'start_enqueue')])


async def test_navigation_timeout_on_slow_page_load(server_url: URL) -> None:
    """Check that a page slower than `navigation_timeout` fails with `asyncio.TimeoutError` and is never handled."""
    crawler = PlaywrightCrawler(
        navigation_timeout=timedelta(seconds=1),
        max_request_retries=0,
    )

    request_handler = AsyncMock()
    crawler.router.default_handler(request_handler)

    failed_request_handler = AsyncMock()
    crawler.failed_request_handler(failed_request_handler)

    # The endpoint is asked for a delay of 2, longer than the 1 second navigation timeout.
    result = await crawler.run([str((server_url / 'slow').with_query(delay=2))])

    assert result.requests_failed == 1
    assert result.requests_finished == 0

    assert request_handler.call_count == 0

    assert failed_request_handler.call_count == 1
    # The second positional argument of the failed-request handler is the error.
    assert isinstance(failed_request_handler.call_args[0][1], asyncio.TimeoutError)


async def test_navigation_timeout_applies_to_hooks(server_url: URL) -> None:
    """Check that time spent in a pre-navigation hook counts toward `navigation_timeout`."""
    crawler = PlaywrightCrawler(
        navigation_timeout=timedelta(seconds=0.5),
        max_request_retries=0,
    )

    request_handler = AsyncMock()
    crawler.router.default_handler(request_handler)
    crawler.pre_navigation_hook(lambda _: asyncio.sleep(1))

    # Pre-navigation hook takes 1 second (exceeds navigation timeout), so the URL will not be handled
    result = await crawler.run([str(server_url)])

    assert result.requests_failed == 1
    assert result.requests_finished == 0
    assert request_handler.call_count == 0


async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url: URL) -> None:
    """Check that navigation time is not charged against `request_handler_timeout`."""
    crawler = PlaywrightCrawler(
        request_handler_timeout=timedelta(seconds=0.5),
        max_request_retries=0,
    )

    request_handler = AsyncMock()
    crawler.router.default_handler(request_handler)

    # Navigation takes 1 second (exceeds handler timeout), but should still succeed
    result = await crawler.run([str((server_url / 'slow').with_query(delay=1))])

    assert result.requests_failed == 0
    assert result.requests_finished == 1
    assert request_handler.call_count == 1


async def test_request_state(server_url: URL) -> None:
    """Check the `Request.state` value observed in each crawler callback for a successful and a failing request."""
    queue = await RequestQueue.open(alias='playwright_request_state')
    crawler = PlaywrightCrawler(request_manager=queue)

    success_request = Request.from_url(str(server_url))
    assert success_request.state == RequestState.UNPROCESSED

    error_request = Request.from_url(str(server_url / 'error'), user_data={'cause_error': True})

    # Maps request unique key -> callback name -> state seen inside that callback.
    requests_states: dict[str, dict[str, RequestState]] = {success_request.unique_key: {}, error_request.unique_key: {}}

    @crawler.pre_navigation_hook
    async def pre_navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        requests_states[context.request.unique_key]['pre_navigation'] = context.request.state

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        if context.request.user_data.get('cause_error'):
            raise ValueError('Caused error as requested')
# (continues the request handler and test function that begin on the previous line)
        requests_states[context.request.unique_key]['request_handler'] = context.request.state

    @crawler.error_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        requests_states[context.request.unique_key]['error_handler'] = context.request.state

    @crawler.failed_request_handler
    async def failed_request_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        requests_states[context.request.unique_key]['failed_request_handler'] = context.request.state

    await crawler.run([success_request, error_request])

    handled_success_request = await queue.get_request(success_request.unique_key)

    assert handled_success_request is not None
    assert handled_success_request.state == RequestState.DONE

    assert requests_states[success_request.unique_key] == {
        'pre_navigation': RequestState.BEFORE_NAV,
        'request_handler': RequestState.REQUEST_HANDLER,
    }

    handled_error_request = await queue.get_request(error_request.unique_key)
    assert handled_error_request is not None
    assert handled_error_request.state == RequestState.ERROR

    # The failing request records no 'request_handler' entry because the handler raises before storing it.
    assert requests_states[error_request.unique_key] == {
        'pre_navigation': RequestState.BEFORE_NAV,
        'error_handler': RequestState.ERROR_HANDLER,
        'failed_request_handler': RequestState.ERROR,
    }

    await queue.drop()


async def test_enqueue_links_with_limit(server_url: URL) -> None:
    """Check that the start page and `page_3` are visited when `enqueue_links` is called with `limit=1`."""
    start_url = str(server_url / 'sub_index')
    requests = [start_url]

    crawler = PlaywrightCrawler()
    visit = mock.Mock()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(limit=1)

    await crawler.run(requests)

    # Only one link should be enqueued from sub_index due to the limit
    expected_visit_calls = [
        mock.call(start_url),
        mock.call(str(server_url / 'page_3')),
    ]
    # `assert_has_calls` only verifies that these calls happened; it does not fail on additional visits.
    visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_playwright_crawler_pre_navigation_hook_execution(server_url: URL) -> None:
    """Test that pre-navigation hooks are executed."""
    crawler = PlaywrightCrawler(request_handler=AsyncMock())
    call_mock = AsyncMock()

    # Register pre navigation hook.
    @crawler.pre_navigation_hook
    async def pre_nav_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        await call_mock(context.page.url)

    await crawler.run([str(server_url)])

    # `pre_navigation_hook` is called before the request is made, so the loaded URL should be 'about:blank'.
    call_mock.assert_called_once_with('about:blank')


async def test_playwright_crawler_post_navigation_hook_execution(server_url: URL) -> None:
    """Test that post-navigation hooks are executed."""
    crawler = PlaywrightCrawler(request_handler=AsyncMock())
    call_mock = AsyncMock()

    # Register post navigation hook.
    @crawler.post_navigation_hook
    async def post_nav_hook(context: PlaywrightPostNavCrawlingContext) -> None:
        await call_mock(context.page.url)

    await crawler.run([str(server_url)])

    # `post_navigation_hook` is called after the request is made, so the loaded URL should be the result URL.
    call_mock.assert_called_once_with(str(server_url))


async def test_playwright_navigation_hooks_order(server_url: URL) -> None:
    """Test that pre- and post-navigation hooks run in registration order, followed by the final handler."""
    execution_order = []
    crawler = PlaywrightCrawler()

    # Register final context handler.
    @crawler.router.default_handler
    async def default_request_handler(_context: PlaywrightCrawlingContext) -> None:
        execution_order.append('final handler')

    # Register pre navigation hook.
    @crawler.pre_navigation_hook
    async def pre_nav_hook_1(_context: PlaywrightPreNavCrawlingContext) -> None:
        execution_order.append('pre-navigation-hook 1')

    # Register pre navigation hook.
    @crawler.pre_navigation_hook
    async def pre_nav_hook(_context: PlaywrightPreNavCrawlingContext) -> None:
        execution_order.append('pre-navigation-hook 2')

    # Register post navigation hook.
    @crawler.post_navigation_hook
    async def post_nav_hook_1(_context: PlaywrightPostNavCrawlingContext) -> None:
        execution_order.append('post-navigation-hook 1')

    # Register post navigation hook.
# (continues the test function that begins on the previous line)
    @crawler.post_navigation_hook
    async def post_nav_hook_2(_context: PlaywrightPostNavCrawlingContext) -> None:
        execution_order.append('post-navigation-hook 2')

    await crawler.run([str(server_url)])

    assert execution_order == [
        'pre-navigation-hook 1',
        'pre-navigation-hook 2',
        'post-navigation-hook 1',
        'post-navigation-hook 2',
        'final handler',
    ]


================================================
FILE: tests/unit/crawlers/_playwright/test_utils.py
================================================
from playwright.async_api import async_playwright
from yarl import URL

from crawlee.crawlers._playwright._utils import block_requests, infinite_scroll


async def test_infinite_scroll_on_dynamic_page(server_url: URL) -> None:
    """Checks that infinite_scroll loads all items on a page with infinite scrolling."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        target_url = str(server_url / 'infinite_scroll')

        # Get data with manual scrolling
        await page.goto(target_url)

        manual_items = []
        for _ in range(4):
            # Only the result of the last iteration is kept as the reference.
            items = await page.query_selector_all('.item')
            manual_items = items
            await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
            await page.wait_for_timeout(1000)

        # Reset page
        await page.close()
        page = await browser.new_page()
        await page.goto(target_url)

        # Get data with infinite_scroll utility
        before_scroll = await page.query_selector_all('.item')
        assert len(before_scroll) != len(manual_items)
        assert len(before_scroll) == 10

        await infinite_scroll(page)

        after_scroll = await page.query_selector_all('.item')

        assert len(before_scroll) < len(after_scroll)
        assert len(manual_items) == len(after_scroll)

        await browser.close()


async def test_infinite_scroll_no_page_without_scroll(server_url: URL) -> None:
    """Checks that infinite_scroll does not raise an error on a page without infinite scrolling."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        await page.goto(str(server_url))

        await infinite_scroll(page)

        title = await page.title()

        assert title == 'Hello, world!'

        await browser.close()


async def test_double_call_infinite_scroll(server_url: URL) -> None:
    """Checks that calling infinite_scroll twice does not load more items the second time."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        await page.goto(str(server_url / 'infinite_scroll'))

        await infinite_scroll(page)
        first_count = len(await page.query_selector_all('.item'))

        await infinite_scroll(page)
        second_count = len(await page.query_selector_all('.item'))

        assert first_count == second_count

        await browser.close()


async def test_block_requests_default(server_url: URL) -> None:
    """Checks that block_requests blocks the correct resources by default."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()

        target_url = str(server_url / 'resource_loading_page')

        # Default behavior, all resources load
        page = await browser.new_page()
        loaded_urls_no_block = []

        # Record only the last path segment of every finished request.
        page.on('requestfinished', lambda req: loaded_urls_no_block.append(req.url.rsplit('/', 1)[-1]))
        await page.goto(target_url)
        await page.wait_for_load_state('networkidle')
        await page.close()

        # With blocking — collect loaded resources
        page = await browser.new_page()
        loaded_urls_blocked = []

        page.on('requestfinished', lambda req: loaded_urls_blocked.append(req.url.rsplit('/', 1)[-1]))
        await block_requests(page)
        await page.goto(target_url)
        await page.wait_for_load_state('networkidle')
        await page.close()

        await browser.close()

    # Without blocking, the page itself and both resources (JS and PNG) should load
    assert set(loaded_urls_no_block) == {'resource_loading_page', 'test.js', 'test.png'}

    # With blocking, only JS should load
    assert set(loaded_urls_blocked) == {'resource_loading_page', 'test.js'}


async def test_block_requests_with_extra_patterns(server_url: URL) -> None:
    """Checks that block_requests blocks the correct resources with extra patterns."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()

        target_url = str(server_url / 'resource_loading_page')

        page = await browser.new_page()
        loaded_urls_blocked = []

        # Record only the last path segment of every finished request.
        page.on('requestfinished', lambda req: loaded_urls_blocked.append(req.url.rsplit('/', 1)[-1]))
        await block_requests(page, extra_url_patterns=['*.js'])
        await page.goto(target_url)
        await page.wait_for_load_state('networkidle')
        await page.close()

        await browser.close()

    # With blocking, only HTML should load
    assert set(loaded_urls_blocked) == {'resource_loading_page'}


async def test_block_requests_with_custom_patterns(server_url: URL) -> None:
    """Checks that block_requests blocks the correct resources with custom patterns."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()

        target_url = str(server_url / 'resource_loading_page')

        page = await browser.new_page()
        loaded_urls_blocked = []

        # Record only the last path segment of every finished request.
        page.on('requestfinished', lambda req: loaded_urls_blocked.append(req.url.rsplit('/', 1)[-1]))
        await block_requests(page, url_patterns=['*.js'])
        await page.goto(target_url)
        await page.wait_for_load_state('networkidle')
        await page.close()

        await browser.close()

    # With blocking, only PNG should load
    assert set(loaded_urls_blocked) == {'resource_loading_page', 'test.png'}


================================================
FILE: tests/unit/events/test_event_manager.py
================================================
from __future__ import annotations

import asyncio
import logging
from datetime import timedelta
from functools import update_wrapper
from typing import TYPE_CHECKING, Any
from unittest import mock
from unittest.mock import AsyncMock, MagicMock

import pytest

from crawlee.events import Event, EventManager, EventSystemInfoData

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator


@pytest.fixture
async def event_manager() -> AsyncGenerator[EventManager, None]:
    """Yield an `EventManager` that is entered for the duration of the test."""
    async with EventManager() as event_manager:
        yield event_manager


@pytest.fixture
def event_system_info_data() -> EventSystemInfoData:
# (body of the fixture whose signature is on the previous line)
    return MagicMock(spec=EventSystemInfoData)


@pytest.fixture
def async_listener() -> AsyncMock:
    """Return an `AsyncMock` that carries the metadata of a one-argument coroutine function."""

    async def async_listener(payload: Any) -> None:
        pass

    al = AsyncMock()
    # Copy the wrapped function's attributes (name, signature via `__wrapped__`, ...) onto the mock.
    update_wrapper(al, async_listener)
    return al


@pytest.fixture
def sync_listener() -> MagicMock:
    """Return a `MagicMock` that carries the metadata of a one-argument plain function."""

    def sync_listener(payload: Any) -> None:
        pass

    sl = MagicMock()
    # Copy the wrapped function's attributes (name, signature via `__wrapped__`, ...) onto the mock.
    update_wrapper(sl, sync_listener)
    return sl


async def test_emit_invokes_registered_sync_listener(
    sync_listener: MagicMock,
    event_manager: EventManager,
    event_system_info_data: EventSystemInfoData,
) -> None:
    """Check that a registered synchronous listener is called once with the emitted event data."""
    event_manager.on(event=Event.SYSTEM_INFO, listener=sync_listener)
    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    await asyncio.sleep(0.1)  # Allow some time for the event to be processed

    assert sync_listener.call_count == 1
    assert sync_listener.call_args[0] == (event_system_info_data,)


async def test_emit_invokes_both_sync_and_async_listeners(
    sync_listener: MagicMock,
    async_listener: AsyncMock,
    event_manager: EventManager,
    event_system_info_data: EventSystemInfoData,
) -> None:
    """Check that one emit calls both a synchronous and an asynchronous listener of the same event."""
    event_manager.on(event=Event.SYSTEM_INFO, listener=sync_listener)
    event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)
    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    await asyncio.sleep(0.1)  # Allow some time for the event to be processed

    assert async_listener.call_count == 1
    assert async_listener.call_args[0] == (event_system_info_data,)

    assert sync_listener.call_count == 1
    assert sync_listener.call_args[0] == (event_system_info_data,)


async def test_emit_event_with_no_listeners(
    event_manager: EventManager,
    event_system_info_data: EventSystemInfoData,
    async_listener: AsyncMock,
) -> None:
    """Check that emitting an event without listeners does not fail and does not call listeners of other events."""
    # Register a listener for a different event
    event_manager.on(event=Event.ABORTING, listener=async_listener)

    # Attempt to emit an event for which no listeners are registered, it should not fail
    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)
    await asyncio.sleep(0.1)  # Allow some time for the event to be processed

    # Ensure the listener for the other event was not called
    assert async_listener.call_count == 0


async def test_emit_invokes_parameterless_listener(
    event_manager: EventManager,
    event_system_info_data: EventSystemInfoData,
) -> None:
    """Check that listeners declared without any parameter are still called on emit."""
    sync_mock = MagicMock()

    def sync_listener() -> None:
        sync_mock()

    async_mock = MagicMock()

    async def async_listener() -> None:
        async_mock()

    event_manager.on(event=Event.SYSTEM_INFO, listener=sync_listener)
    event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)

    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)
    await asyncio.sleep(0.1)  # Allow some time for the event to be processed

    assert sync_mock.call_count == 1
    assert async_mock.call_count == 1


async def test_remove_nonexistent_listener_does_not_fail(
    async_listener: AsyncMock,
    event_manager: EventManager,
) -> None:
    """Check that `off` tolerates a listener that was never added and an event without listeners."""
    # Attempt to remove a specific listener that was never added.
    event_manager.off(event=Event.SYSTEM_INFO, listener=async_listener)
    # Attempt to remove all listeners.
# (last statement of the test function that begins on the previous line)
    event_manager.off(event=Event.ABORTING)


async def test_removed_listener_not_invoked_on_emit(
    async_listener: AsyncMock,
    event_manager: EventManager,
    event_system_info_data: EventSystemInfoData,
) -> None:
    """Check that a listener removed with `off` is not called by a later emit."""
    event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)
    event_manager.off(event=Event.SYSTEM_INFO, listener=async_listener)
    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    await asyncio.sleep(0.1)  # Allow some time for the event to be processed
    assert async_listener.call_count == 0


async def test_close_clears_listeners_and_tasks(async_listener: AsyncMock) -> None:
    """Check that leaving the `EventManager` context leaves no listener tasks and no registered wrappers."""
    async with EventManager() as event_manager:
        event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)

    assert async_listener.call_count == 0
    assert len(event_manager._listener_tasks) == 0
    assert len(event_manager._listeners_to_wrappers) == 0


async def test_close_after_emit_processes_event(
    async_listener: AsyncMock,
    event_system_info_data: EventSystemInfoData,
) -> None:
    """Check that an event emitted just before the context exits is still delivered to its listener."""
    async with EventManager() as event_manager:
        event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)
        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    # Event should be processed before the event manager is closed
    assert async_listener.call_count == 1
    assert async_listener.call_args[0] == (event_system_info_data,)

    assert len(event_manager._listener_tasks) == 0
    assert len(event_manager._listeners_to_wrappers) == 0


async def test_wait_for_all_listeners_cancelled_error(
    monkeypatch: pytest.MonkeyPatch,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Check that `asyncio.CancelledError` raised by the patched `asyncio.wait` propagates out of the context."""

    # Simulate long-running listener tasks
    async def long_running_listener() -> None:
        await asyncio.sleep(10)

    # Define a side effect function that raises CancelledError
    async def mock_async_wait(*_: Any, **__: Any) -> None:
        raise asyncio.CancelledError

    with pytest.raises(asyncio.CancelledError), caplog.at_level(logging.WARNING):  # noqa: PT012
        async with EventManager(close_timeout=timedelta(milliseconds=10)) as event_manager:
            event_manager.on(event=Event.SYSTEM_INFO, listener=long_running_listener)

            # Use monkeypatch to replace asyncio.wait with mock_async_wait
            monkeypatch.setattr('asyncio.wait', mock_async_wait)


async def test_methods_raise_error_when_not_active(event_system_info_data: EventSystemInfoData) -> None:
    """Check that `emit` and waiting for listeners require an active manager, and that it cannot be entered twice."""
    event_manager = EventManager()

    assert event_manager.active is False

    with pytest.raises(RuntimeError, match=r'EventManager is not active.'):
        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)

    with pytest.raises(RuntimeError, match=r'EventManager is not active.'):
        await event_manager.wait_for_all_listeners_to_complete()

    # Entering the same instance twice must be rejected.
    with pytest.raises(RuntimeError, match=r'EventManager is already active.'):
        async with event_manager, event_manager:
            pass

    async with event_manager:
        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)
        await event_manager.wait_for_all_listeners_to_complete()

        assert event_manager.active is True


async def test_event_manager_in_context_persistence() -> None:
    """Test that entering the `EventManager` context emits persist state event at least once."""
    event_manager = EventManager()

    with mock.patch.object(event_manager, '_emit_persist_state_event', AsyncMock()) as mocked_emit_persist_state_event:
        async with event_manager:
            pass

    assert mocked_emit_persist_state_event.call_count >= 1


================================================
FILE: tests/unit/events/test_local_event_manager.py
================================================
from __future__ import annotations

import asyncio
from datetime import timedelta
from typing import Any
from unittest.mock import AsyncMock

from crawlee.events import LocalEventManager
from crawlee.events._types import Event, EventSystemInfoData


async def test_emit_system_info_event() -> None:
    """Check that `LocalEventManager` emits at least one `SYSTEM_INFO` event carrying `EventSystemInfoData`."""
    mocked_listener = AsyncMock()

    async def async_listener(payload: Any) -> None:
        await mocked_listener(payload)

    system_info_interval = timedelta(milliseconds=50)
    # Wait for several intervals so that at least one event has time to arrive.
    test_tolerance_coefficient = 10
    async with LocalEventManager(system_info_interval=system_info_interval) as event_manager:
        event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)
        await asyncio.sleep(system_info_interval.total_seconds() * test_tolerance_coefficient)

    assert mocked_listener.call_count >= 1
    assert isinstance(mocked_listener.call_args[0][0], EventSystemInfoData)


================================================
FILE: tests/unit/fingerprint_suite/test_adapters.py
================================================
from collections.abc import Iterable

import pytest
from browserforge.headers import Browser

from crawlee.fingerprint_suite import (
    DefaultFingerprintGenerator,
    HeaderGeneratorOptions,
    ScreenOptions,
)
from crawlee.fingerprint_suite._browserforge_adapter import PatchedHeaderGenerator
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD


def test_fingerprint_generator_has_default() -> None:
    """Test that the fingerprint generator can work without any options."""
    assert DefaultFingerprintGenerator().generate()


def test_fingerprint_generator_some_options_stress_test() -> None:
    """Test that the fingerprint generator honours its options consistently over repeated generations."""
    fingerprint_generator = DefaultFingerprintGenerator(
        mock_web_rtc=True,
        screen_options=ScreenOptions(min_width=500),
        header_options=HeaderGeneratorOptions(strict=True),
    )

    for _ in range(20):
        fingerprint = fingerprint_generator.generate()

        assert fingerprint.mockWebRTC is True
        assert fingerprint.screen.availWidth > 500


def test_fingerprint_generator_all_options() -> None:
    """Test that the fingerprint generator can work with all the options.

    Only the most basic checks of the fingerprint are made. A fingerprint generation option might have no effect
    if there is no fingerprint sample present in the collected data.
    """
    min_width = 600
    max_width = 1800
    min_height = 400
    max_height = 1200

    fingerprint = DefaultFingerprintGenerator(
        mock_web_rtc=True,
        slim=True,
        screen_options=ScreenOptions(
            min_width=min_width,
            max_width=max_width,
            min_height=min_height,
            max_height=max_height,
        ),
        header_options=HeaderGeneratorOptions(
            strict=True,
            browsers=['firefox'],
            operating_systems=['windows'],
            devices=['mobile'],
            locales=['en'],  # Does not generate any other values than `en-US` regardless of the input in browserforge
            http_version='2',  # Http1 does not work in browserforge
        ),
    ).generate()

    assert fingerprint.screen.availWidth >= min_width
    assert fingerprint.screen.availWidth <= max_width
    assert fingerprint.screen.availHeight >= min_height
    assert fingerprint.screen.availHeight <= max_height

    assert fingerprint.mockWebRTC is True
    assert fingerprint.slim is True
    assert 'Firefox' in fingerprint.navigator.userAgent
    assert 'Win' in fingerprint.navigator.oscpu
    assert 'en-US' in fingerprint.navigator.languages


@pytest.mark.parametrize(
    'browser',
    [
        'firefox',
        ['firefox'],
        [Browser(name='firefox')],
    ],
)
def test_patched_header_generator_generate(browser: Iterable[str | Browser]) -> None:
    """Test that PatchedHeaderGenerator works with all the possible types correctly."""
    header = PatchedHeaderGenerator().generate(browser=browser)
    assert any(keyword in header['User-Agent'] for keyword in BROWSER_TYPE_HEADER_KEYWORD['firefox'])


================================================
FILE: tests/unit/fingerprint_suite/test_header_generator.py
================================================
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from crawlee.fingerprint_suite import HeaderGenerator
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
from crawlee.fingerprint_suite._consts import (
    BROWSER_TYPE_HEADER_KEYWORD,
)

if TYPE_CHECKING:
    from crawlee.fingerprint_suite._types import SupportedBrowserType


def test_get_common_headers(header_network: dict) -> None:
    """Test that common headers contain `Accept-Language` and an `Accept` value known to the header network."""
    header_generator = HeaderGenerator()
    headers = header_generator.get_common_headers()

    assert 'Accept' in headers
    assert headers['Accept'] in get_available_header_values(header_network, {'Accept', 'accept'})
    assert 'Accept-Language' in headers


def test_get_random_user_agent_header() -> None:
    """Test that a random User-Agent header is generated."""
    header_generator = HeaderGenerator()
    headers = header_generator.get_random_user_agent_header()

    assert 'User-Agent' in headers
    assert headers['User-Agent']


@pytest.mark.parametrize('browser_type', ['chrome', 'firefox', 'edge', 'safari'])
def test_get_user_agent_header_stress_test(browser_type: SupportedBrowserType, header_network: dict) -> None:
    """Test that the User-Agent header is consistently generated correctly.

    (Very fast even when stress tested.)
    """
    for _ in range(100):
        header_generator = HeaderGenerator()
        headers = header_generator.get_user_agent_header(browser_type=browser_type)

        assert 'User-Agent' in headers
        assert any(keyword in headers['User-Agent'] for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type])
        assert headers['User-Agent'] in get_available_header_values(header_network, {'user-agent', 'User-Agent'})


def test_get_user_agent_header_invalid_browser_type() -> None:
    """Test that an invalid browser type raises a ValueError."""
    header_generator = HeaderGenerator()

    with pytest.raises(ValueError, match=r'Unsupported browser type'):
        header_generator.get_user_agent_header(browser_type='invalid_browser')  # ty: ignore[invalid-argument-type]


def test_get_sec_ch_ua_headers_chromium(header_network: dict) -> None:
    """Test that Sec-Ch-Ua headers are generated correctly for Chrome."""
    header_generator = HeaderGenerator()
    headers = header_generator.get_sec_ch_ua_headers(browser_type='chrome')

    assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua')
    assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')
    assert headers.get('sec-ch-ua-platform') in get_available_header_values(header_network, 'sec-ch-ua-platform')


def test_get_sec_ch_ua_headers_firefox() -> None:
    """Test that sec-ch-ua headers are not generated for Firefox."""
    header_generator = HeaderGenerator()
    headers = header_generator.get_sec_ch_ua_headers(browser_type='firefox')

    assert not headers


def test_get_sec_ch_ua_headers_invalid_browser_type() -> None:
    """Test that an invalid browser type raises a ValueError for sec-ch-ua headers."""
    header_generator = HeaderGenerator()

    with pytest.raises(ValueError, match=r'Unsupported browser type'):
        header_generator.get_sec_ch_ua_headers(browser_type='invalid_browser')  # ty: ignore[invalid-argument-type]


================================================
FILE: tests/unit/http_clients/test_http_clients.py
================================================
from __future__ import annotations

import os
from typing import TYPE_CHECKING

import pytest
from curl_cffi import CurlHttpVersion

from crawlee import Request
from crawlee.errors import ProxyError
from crawlee.http_clients import CurlImpersonateHttpClient, HttpClient, HttpxHttpClient, ImpitHttpClient
from crawlee.statistics import Statistics
from tests.unit.server_endpoints import HELLO_WORLD

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from _pytest.fixtures import SubRequest
    from yarl import URL

    from crawlee.proxy_configuration import ProxyInfo


@pytest.fixture
async def custom_http_client(request: SubRequest) -> AsyncGenerator[HttpClient]:
    """Helper fixture to reduce code duplication.

    If clients are not initialized, create their default instances.
Return client in active context, leave the context after the test.""" client = request.param if isinstance(request.param, HttpClient) else request.param() async with client as _: yield _ async def test_http_1(http_client: HttpClient, server_url: URL) -> None: response = await http_client.send_request(str(server_url)) assert response.http_version == 'HTTP/1.1' @pytest.mark.parametrize( 'custom_http_client', [ pytest.param(CurlImpersonateHttpClient(http_version=CurlHttpVersion.V2_0), id='curl'), pytest.param(HttpxHttpClient(http1=False, http2=True), id='httpx'), pytest.param(ImpitHttpClient(), id='impit'), ], indirect=['custom_http_client'], ) async def test_http_2(custom_http_client: HttpClient) -> None: response = await custom_http_client.send_request('https://apify.com/') assert response.http_version == 'HTTP/2' @pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') async def test_crawl_with_proxy( http_client: HttpClient, proxy: ProxyInfo, server_url: URL, ) -> None: url = str(server_url / 'status/222') request = Request.from_url(url) async with Statistics.with_default_state() as statistics: result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics) assert result.http_response.status_code == 222 # 222 - authentication successful @pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') async def test_crawl_with_proxy_disabled( http_client: HttpClient, disabled_proxy: ProxyInfo, ) -> None: url = 'https://apify.com/' request = Request.from_url(url) with pytest.raises(ProxyError): async with Statistics.with_default_state() as statistics: await http_client.crawl(request, proxy_info=disabled_proxy, statistics=statistics) @pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') async def test_send_request_with_proxy( http_client: HttpClient, proxy: ProxyInfo, server_url: URL, ) -> None: url = str(server_url / 'status/222') response = await http_client.send_request(url, proxy_info=proxy) assert response.status_code == 
222 # 222 - authentication successful @pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') async def test_send_request_with_proxy_disabled( http_client: HttpClient, disabled_proxy: ProxyInfo, ) -> None: url = 'https://apify.com/' with pytest.raises(ProxyError): await http_client.send_request(url, proxy_info=disabled_proxy) async def test_crawl_allow_redirects_by_default(http_client: HttpClient, server_url: URL) -> None: target_url = str(server_url / 'status/200') redirect_url = str((server_url / 'redirect').update_query(url=target_url)) request = Request.from_url(redirect_url) crawling_result = await http_client.crawl(request) assert crawling_result.http_response.status_code == 200 assert request.loaded_url == target_url @pytest.mark.parametrize( 'custom_http_client', [ pytest.param(CurlImpersonateHttpClient(allow_redirects=False), id='curl'), pytest.param(HttpxHttpClient(follow_redirects=False), id='httpx'), pytest.param(ImpitHttpClient(follow_redirects=False), id='impit'), ], indirect=['custom_http_client'], ) async def test_crawl_allow_redirects_false(custom_http_client: HttpClient, server_url: URL) -> None: target_url = str(server_url / 'status/200') redirect_url = str((server_url / 'redirect').update_query(url=target_url)) request = Request.from_url(redirect_url) crawling_result = await custom_http_client.crawl(request) assert crawling_result.http_response.status_code == 302 assert crawling_result.http_response.headers['Location'] == target_url assert request.loaded_url == redirect_url async def test_send_request_allow_redirects_by_default(http_client: HttpClient, server_url: URL) -> None: target_url = str(server_url / 'status/200') redirect_url = str((server_url / 'redirect').update_query(url=target_url)) response = await http_client.send_request(redirect_url) assert response.status_code == 200 @pytest.mark.parametrize( 'custom_http_client', [ pytest.param(CurlImpersonateHttpClient(allow_redirects=False), id='curl'), 
pytest.param(HttpxHttpClient(follow_redirects=False), id='httpx'), pytest.param(ImpitHttpClient(follow_redirects=False), id='impit'), ], indirect=['custom_http_client'], ) async def test_send_request_allow_redirects_false(custom_http_client: HttpClient, server_url: URL) -> None: target_url = str(server_url / 'status/200') redirect_url = str((server_url / 'redirect').update_query(url=target_url)) response = await custom_http_client.send_request(redirect_url) assert response.status_code == 302 assert response.headers['Location'] == target_url async def test_stream(http_client: HttpClient, server_url: URL) -> None: content_body: bytes = b'' async with http_client.stream(str(server_url)) as response: assert response.status_code == 200 async for chunk in response.read_stream(): content_body += chunk assert content_body == HELLO_WORLD async def test_stream_error_double_read_stream(http_client: HttpClient, server_url: URL) -> None: async with http_client.stream(str(server_url)) as response: assert response.status_code == 200 content_body_first: bytes = b'' async for chunk in response.read_stream(): content_body_first += chunk with pytest.raises(RuntimeError): [chunk async for chunk in response.read_stream()] assert content_body_first == HELLO_WORLD async def test_stream_error_for_read(http_client: HttpClient, server_url: URL) -> None: async with http_client.stream(str(server_url)) as response: assert response.status_code == 200 with pytest.raises(RuntimeError): await response.read() async def test_send_request_error_for_read_stream(http_client: HttpClient, server_url: URL) -> None: response = await http_client.send_request(str(server_url)) assert response.status_code == 200 with pytest.raises(RuntimeError): [item async for item in response.read_stream()] async def test_send_crawl_error_for_read_stream(http_client: HttpClient, server_url: URL) -> None: response = await http_client.crawl(Request.from_url(str(server_url))) http_response = response.http_response assert 
http_response.status_code == 200 with pytest.raises(RuntimeError): [item async for item in http_response.read_stream()] @pytest.mark.parametrize( 'custom_http_client', [ pytest.param(CurlImpersonateHttpClient(), id='curl'), pytest.param(HttpxHttpClient(), id='httpx'), pytest.param(ImpitHttpClient(), id='impit'), ], ) async def test_reuse_context_manager(custom_http_client: HttpClient, server_url: URL) -> None: async with custom_http_client: response = await custom_http_client.send_request(str(server_url)) assert response.status_code == 200 # Reusing the context manager should not raise an error async with custom_http_client: response = await custom_http_client.send_request(str(server_url)) assert response.status_code == 200 async def test_work_after_cleanup(http_client: HttpClient, server_url: URL) -> None: response = await http_client.send_request(str(server_url)) assert response.status_code == 200 # Cleanup the client await http_client.cleanup() # After cleanup, the client should still work response = await http_client.send_request(str(server_url)) assert response.status_code == 200 async def test_compressed_chunked_stream(http_client: HttpClient, server_url: URL) -> None: content_body: bytes = b'' async with http_client.stream(str(server_url / 'get_compressed')) as response: assert response.status_code == 200 async for chunk in response.read_stream(): content_body += chunk assert content_body == HELLO_WORLD * 1000 ================================================ FILE: tests/unit/http_clients/test_httpx.py ================================================ from __future__ import annotations import json from typing import TYPE_CHECKING import pytest from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values from crawlee.fingerprint_suite._consts import COMMON_ACCEPT_LANGUAGE from crawlee.http_clients import HttpxHttpClient if TYPE_CHECKING: from collections.abc import AsyncGenerator from yarl import URL from crawlee.http_clients import 
HttpClient @pytest.fixture async def http_client() -> AsyncGenerator[HttpClient]: async with HttpxHttpClient(http2=False) as client: yield client async def test_common_headers_and_user_agent(server_url: URL, header_network: dict) -> None: """Test that the relevant headers use header values from header generator instead of default Httpx headers. Httpx uses own headers by default which is not desired as it could increase blocking chances. """ client = HttpxHttpClient() response = await client.send_request(str(server_url / 'headers')) response_headers = json.loads((await response.read()).decode()) assert 'accept' in response_headers assert response_headers['accept'] in get_available_header_values(header_network, {'Accept', 'accept'}) assert 'accept-language' in response_headers assert response_headers['accept-language'] == COMMON_ACCEPT_LANGUAGE # By default, HTTPX uses its own User-Agent, which should be replaced by the one from the header generator. assert 'user-agent' in response_headers assert 'python-httpx' not in response_headers['user-agent'] assert response_headers['user-agent'] in get_available_header_values(header_network, {'User-Agent', 'user-agent'}) ================================================ FILE: tests/unit/otel/test_crawler_instrumentor.py ================================================ import io import json import re from unittest import mock from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor from opentelemetry.trace import set_tracer_provider from yarl import URL from crawlee import ConcurrencySettings from crawlee.crawlers import ParselCrawler from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor from crawlee.storages import Dataset async def test_crawler_instrumentor_capability(server_url: URL) -> None: """Test OpenTelemetry instrumentation capability of the crawler. 
Instrument the crawler and one additional class and check that telemetry data is generated correctly. Telemetry data is redirected to an in-memory file for testing purposes.""" resource = Resource.create( { 'service.name': 'ExampleCrawler', 'service.version': '1.0.0', 'environment': 'development', } ) # Set up the OpenTelemetry tracer provider and exporter provider = TracerProvider(resource=resource) in_memory_sink_for_telemetry = io.StringIO(newline='\n') exporter = ConsoleSpanExporter(out=in_memory_sink_for_telemetry) provider.add_span_processor(SimpleSpanProcessor(exporter)) set_tracer_provider(provider) # Instrument the crawler with OpenTelemetry instrumentor = CrawlerInstrumentor(instrument_classes=[Dataset]) instrumentor.instrument() # Generate first telemetry data from `Dataset` public methods. # `Dataset` is in `instrument_classes` argument, and thus its public methods are instrumented. dataset = await Dataset.open(name='test-dataset') await dataset.drop() # Other traces will be from crawler run. crawler = ParselCrawler( max_requests_per_crawl=1, request_handler=mock.AsyncMock(), concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1), ) # Run crawler and generate more telemetry data. await crawler.run([str(server_url)]) # Telemetry jsons are packed together in one string. Unpack them and load as json objects. telemetry_strings = in_memory_sink_for_telemetry.getvalue() telemetry_data = [ json.loads(telemetry_string) for telemetry_string in re.split(r'(?<=\})\s*(?=\{)', telemetry_strings) ] # Do some basic checks on the telemetry data. # The point of this test is not to check completeness of the data, but telemetry capability. # Extra `instrument_classes` telemetry - Dataset.open() is parent to Dataset.__init__() span.
assert telemetry_data[0]['name'] == '__init__' assert telemetry_data[0]['attributes']['code.function.name'] == 'Dataset.__init__' assert telemetry_data[0]['resource']['attributes'] == dict(resource.attributes) assert telemetry_data[1]['name'] == 'open' assert telemetry_data[1]['attributes']['code.function.name'] == 'Dataset.open' assert telemetry_data[1]['resource']['attributes'] == dict(resource.attributes) # Opening Dataset creates a new trace; the `__init__` and `open` spans share it. assert telemetry_data[0]['context']['trace_id'] == telemetry_data[1]['context']['trace_id'] assert telemetry_data[2]['name'] == 'drop' assert telemetry_data[2]['attributes']['code.function.name'] == 'Dataset.drop' assert telemetry_data[2]['resource']['attributes'] == dict(resource.attributes) # Dropping Dataset creates a new trace. assert telemetry_data[2]['context']['trace_id'] != telemetry_data[1]['context']['trace_id'] # Crawler telemetry - all crawler spans will be in one trace as there is only one request in this test. assert telemetry_data[3]['name'] == '_execute_pre_navigation_hooks, action' assert telemetry_data[3]['attributes']['code.function.name'] == 'AbstractHttpCrawler._execute_pre_navigation_hooks' assert telemetry_data[3]['attributes']['url.full'] == str(server_url) assert telemetry_data[3]['resource']['attributes'] == dict(resource.attributes) assert telemetry_data[-1]['name'] == '__run_task_function' assert telemetry_data[-1]['attributes']['code.function.name'] == 'BasicCrawler.__run_task_function' assert telemetry_data[-1]['resource']['attributes'] == dict(resource.attributes) # Processing of the request is in the same trace. assert telemetry_data[3]['context']['trace_id'] == telemetry_data[-1]['context']['trace_id'] # Check that trace_ids of unrelated traces are not the same.
assert telemetry_data[0]['context']['trace_id'] != telemetry_data[-1]['context']['trace_id'] ================================================ FILE: tests/unit/proxy_configuration/test_new_proxy_info.py ================================================ from __future__ import annotations from itertools import cycle import pytest from crawlee import Request from crawlee.proxy_configuration import ProxyConfiguration async def test_returns_proxy_info() -> None: """Test that proxy_urls can contain both string and None.""" config = ProxyConfiguration(proxy_urls=[None, 'http://proxy.com:1111']) proxy_info = await config.new_proxy_info(None, None, None) assert proxy_info is None proxy_info = await config.new_proxy_info(None, None, None) assert proxy_info is not None assert proxy_info.url == 'http://proxy.com:1111' assert proxy_info.hostname == 'proxy.com' assert proxy_info.username == '' assert proxy_info.password == '' assert proxy_info.port == 1111 async def test_throws_on_invalid_new_url_function() -> None: config = ProxyConfiguration( new_url_function=lambda session_id=None, request=None: 'http://proxy.com:1111*invalid_url' # noqa: ARG005 ) with pytest.raises(ValueError): # noqa: PT011 await config.new_proxy_info(None, None, None) async def test_returns_proxy_info_with_new_url_function() -> None: """Test that new_url_function can return string and None.""" proxy_iterator = cycle([None, 'http://proxy.com:1111']) config = ProxyConfiguration(new_url_function=lambda session_id=None, request=None: next(proxy_iterator)) # noqa: ARG005 proxy_info = await config.new_proxy_info(None, None, None) assert proxy_info is None proxy_info = await config.new_proxy_info(None, None, None) assert proxy_info is not None assert proxy_info.url == 'http://proxy.com:1111' assert proxy_info.hostname == 'proxy.com' assert proxy_info.username == '' assert proxy_info.password == '' assert proxy_info.port == 1111 async def test_returns_proxy_info_with_new_url_function_async() -> None: async
def new_url(session_id: str | None = None, request: Request | None = None) -> str: # noqa: ARG001 return 'http://proxy.com:1111' config = ProxyConfiguration(new_url_function=new_url) proxy_info = await config.new_proxy_info(None, None, None) assert proxy_info is not None assert proxy_info.url == 'http://proxy.com:1111' assert proxy_info.hostname == 'proxy.com' assert proxy_info.username == '' assert proxy_info.password == '' assert proxy_info.port == 1111 async def test_rotates_proxies() -> None: proxy_urls: list[str | None] = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333'] config = ProxyConfiguration(proxy_urls=proxy_urls) info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == proxy_urls[0] info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == proxy_urls[1] info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == proxy_urls[2] async def test_rotates_proxies_with_sessions() -> None: proxy_urls: list[str | None] = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333'] request = Request(url='http://some.domain/abc', unique_key='1') sessions = [f'session_{i}' for i in range(6)] config = ProxyConfiguration(proxy_urls=proxy_urls) # A single session should always receive the same proxy info = await config.new_proxy_info(sessions[0], None, None) assert info is not None assert info.url == proxy_urls[0] info = await config.new_proxy_info(sessions[0], None, None) assert info is not None assert info.url == proxy_urls[0] info = await config.new_proxy_info(sessions[0], None, None) assert info is not None assert info.url == proxy_urls[0] info = await config.new_proxy_info(sessions[0], request, None) assert info is not None assert info.url == proxy_urls[0] info = await config.new_proxy_info(sessions[0], request, None) assert info is not None assert info.url == proxy_urls[0] # Different sessions should get rotated proxies info = await 
config.new_proxy_info(sessions[1], None, None) assert info is not None assert info.url == proxy_urls[1] info = await config.new_proxy_info(sessions[2], request, None) assert info is not None assert info.url == proxy_urls[2] info = await config.new_proxy_info(sessions[3], None, None) assert info is not None assert info.url == proxy_urls[0] info = await config.new_proxy_info(sessions[4], None, None) assert info is not None assert info.url == proxy_urls[1] info = await config.new_proxy_info(sessions[5], request, None) assert info is not None assert info.url == proxy_urls[2] # Without sessions should get rotated proxies info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == proxy_urls[0] info = await config.new_proxy_info(None, request, None) assert info is not None assert info.url == proxy_urls[1] info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == proxy_urls[2] info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == proxy_urls[0] info = await config.new_proxy_info(None, request, None) assert info is not None assert info.url == proxy_urls[1] @pytest.mark.parametrize( ('url', 'expected_port'), [ # Default ports based on the URL scheme ('http://proxy.com', 80), ('https://proxy.com', 443), # Explicit ports specified in the URL ('http://proxy.com:80', 80), ('http://proxy.com:1234', 1234), ], ) async def test_sets_port(url: str, expected_port: int) -> None: """Test that the port property is set correctly. The port is inferred from the URL scheme if it is not specified in the URL. 
""" config = ProxyConfiguration(proxy_urls=[url]) info = await config.new_proxy_info(None, None, None) assert info is not None assert info.port == expected_port ================================================ FILE: tests/unit/proxy_configuration/test_tiers.py ================================================ from __future__ import annotations from crawlee import Request from crawlee.proxy_configuration import ProxyConfiguration async def test_rotates_proxies_uniformly_with_no_request() -> None: tiered_proxy_urls: list[list[str | None]] = [ ['http://proxy:1111', 'http://proxy:2222'], ['http://proxy:3333', 'http://proxy:4444'], ] config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls) info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == tiered_proxy_urls[0][0] info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == tiered_proxy_urls[0][1] info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == tiered_proxy_urls[1][0] info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == tiered_proxy_urls[1][1] info = await config.new_proxy_info(None, None, None) assert info is not None assert info.url == tiered_proxy_urls[0][0] async def test_retrying_request_makes_tier_go_up() -> None: tiered_proxy_urls: list[list[str | None]] = [ ['http://proxy:1111'], ['http://proxy:2222'], ['http://proxy:3333'], ['http://proxy:4444'], ] config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls) # Calling `new_proxy_info` with the same request most probably means it's being retried request_1 = Request(url='http://some.domain/abc', unique_key='1') info = await config.new_proxy_info(None, request_1, None) assert info is not None assert info.url == tiered_proxy_urls[0][0] info = await config.new_proxy_info(None, request_1, None) assert info is not None assert info.url == tiered_proxy_urls[1][0] info = await 
config.new_proxy_info(None, request_1, None) assert info is not None assert info.url == tiered_proxy_urls[2][0] # Subsequent requests with the same domain should use the same tier request_2 = Request(url='http://some.domain/xyz', unique_key='2') info = await config.new_proxy_info(None, request_2, None) assert info is not None assert info.url == tiered_proxy_urls[2][0] async def test_retrying_request_makes_tier_go_up_with_sessions() -> None: tiered_proxy_urls: list[list[str | None]] = [ ['http://proxy:1111'], ['http://proxy:2222'], ['http://proxy:3333'], ['http://proxy:4444'], ] config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls) request = Request(url='http://some.domain/abc', unique_key='1') # Calling `new_proxy_info` with the same request likely means that it is being retried. # However, a single session should always receive the same proxy info = await config.new_proxy_info('session_id', request, None) assert info is not None assert info.url == tiered_proxy_urls[0][0] info = await config.new_proxy_info('session_id', request, None) assert info is not None assert info.url == tiered_proxy_urls[0][0] info = await config.new_proxy_info('session_id', request, None) assert info is not None assert info.url == tiered_proxy_urls[0][0] # For a new session, we will get a proxy from the corresponding tier info = await config.new_proxy_info('session_id2', request, None) assert info is not None assert info.url == tiered_proxy_urls[3][0] info = await config.new_proxy_info('session_id2', request, None) assert info is not None assert info.url == tiered_proxy_urls[3][0] async def test_successful_request_makes_tier_go_down() -> None: """Repeatedly requesting a proxy for a single request will cause the proxy tier to go up - ProxyConfiguration assumes those are retries. 
Then, requesting a proxy for different requests to the same domain will cause the tier to drop back down.""" tiered_proxy_urls: list[list[str | None]] = [ ['http://proxy:1111'], ['http://proxy:2222'], ['http://proxy:3333'], ['http://proxy:4444'], ] config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls) request_1 = Request(url='http://some.domain/abc', unique_key='1') info = None for tier in tiered_proxy_urls: info = await config.new_proxy_info(None, request_1, None) assert info is not None assert info.url == tier[0] for i in range(100): new_request = Request(url=f'http://some.domain/{i}', unique_key=str(i)) info = await config.new_proxy_info(None, new_request, None) assert info is not None assert info.url == tiered_proxy_urls[0][0] async def test_none_proxy_retrying_request_makes_tier_go_up() -> None: tiered_proxy_urls: list[list[str | None]] = [ [None], ['http://proxy:1111'], ] config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls) # Calling `new_proxy_info` with the same request most probably means it's being retried request_1 = Request(url='http://some.domain/abc', unique_key='1') # No proxy used. info = await config.new_proxy_info(None, request_1, None) assert info is None, 'First entry in tired_proxy_urls is None. config.new_proxy_info is expected to generate None.' # Proxy should go up one tier for same request that was already sent before. info = await config.new_proxy_info(None, request_1, None) assert info is not None, ( 'config.new_proxy_info is expected to generate non-none proxy info from non-none tiered_proxy_urls.' ) assert info.url == tiered_proxy_urls[1][0] async def test_none_proxy_rotates_proxies_uniformly_with_no_request() -> None: tiered_proxy_urls = [ [None, 'http://proxy:1111'], ] config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls) # No proxy used. info = await config.new_proxy_info(None, None, None) assert info is None, 'First entry in tired_proxy_urls is None. 
config.new_proxy_info is expected to generate None.' # Proxy should be rotated on the same proxy tier for a new request. info = await config.new_proxy_info(None, None, None) assert info is not None, ( 'config.new_proxy_info is expected to generate non-none proxy info from non-none tiered_proxy_urls.' ) assert info.url == tiered_proxy_urls[0][1] # Proxy rotation starts from the beginning of the proxy list after last proxy in tier was used. No proxy used again. info = await config.new_proxy_info(None, None, None) assert info is None, 'First entry in tired_proxy_urls is None. config.new_proxy_info is expected to generate None.' ================================================ FILE: tests/unit/request_loaders/test_request_list.py ================================================ from collections.abc import AsyncGenerator from crawlee.request_loaders._request_list import RequestList from crawlee.storages import KeyValueStore async def test_sync_traversal() -> None: request_list = RequestList(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) while not await request_list.is_finished(): item = await request_list.fetch_next_request() assert item is not None await request_list.mark_request_as_handled(item) assert await request_list.is_empty() async def test_async_traversal() -> None: async def generator() -> AsyncGenerator[str]: yield 'https://a.placeholder.com' yield 'https://b.placeholder.com' yield 'https://c.placeholder.com' request_list = RequestList(generator()) while not await request_list.is_finished(): item = await request_list.fetch_next_request() assert item is not None await request_list.mark_request_as_handled(item) assert await request_list.is_empty() async def test_is_empty_does_not_depend_on_fetch_next_request() -> None: request_list = RequestList(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) item_1 = await request_list.fetch_next_request() assert item_1 is not None assert not 
await request_list.is_finished() item_2 = await request_list.fetch_next_request() assert item_2 is not None assert not await request_list.is_finished() item_3 = await request_list.fetch_next_request() assert item_3 is not None assert not await request_list.is_finished() assert await request_list.is_empty() assert not await request_list.is_finished() await request_list.mark_request_as_handled(item_1) await request_list.mark_request_as_handled(item_2) await request_list.mark_request_as_handled(item_3) assert await request_list.is_empty() assert await request_list.is_finished() async def test_persist_requests_key_with_sync_iterable() -> None: """Test that persist_requests_key persists request data from a sync iterable.""" persist_key = 'test_requests_persist_sync' urls = ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] # Create a request list with persistence enabled request_list = RequestList(urls, persist_requests_key=persist_key) # Fetch one request to trigger initialization first_request = await request_list.fetch_next_request() assert first_request is not None assert first_request.url == 'https://a.placeholder.com' # Check that the requests were persisted kvs = await KeyValueStore.open() persisted_data = await kvs.get_value(persist_key) assert persisted_data is not None async def test_persist_requests_key_with_empty_iterator() -> None: """Test behavior when persist_requests_key is provided but the iterator is empty.""" persist_key = 'test_empty_iterator' # Create request list with empty iterator request_list = RequestList([], persist_requests_key=persist_key) # Should be empty immediately assert await request_list.is_empty() assert await request_list.is_finished() # Check that empty requests were persisted kvs = await KeyValueStore.open() persisted_data = await kvs.get_value(persist_key) assert persisted_data is not None async def test_requests_restoration_without_state() -> None: """Test that persisted request data is 
properly restored on subsequent RequestList creation.""" persist_requests_key = 'test_requests_restoration' urls = ['https://restore1.placeholder.com', 'https://restore2.placeholder.com'] # Create first request list and process one request request_list_1 = RequestList(urls, persist_requests_key=persist_requests_key) first_request = await request_list_1.fetch_next_request() assert first_request is not None assert first_request.url == 'https://restore1.placeholder.com' await request_list_1.mark_request_as_handled(first_request) # Create second request list with same persist key (simulating restart) # Since we don't have state persistence, it will start from the beginning of the persisted data spy = iter(['1', '2', '3']) request_list_2 = RequestList(spy, persist_requests_key=persist_requests_key) # Should be able to fetch requests from persisted data, but starts from beginning first_request_again = await request_list_2.fetch_next_request() assert first_request_again is not None assert first_request_again.url == 'https://restore1.placeholder.com' await request_list_2.mark_request_as_handled(first_request_again) # Make sure that the second instance did not consume the input iterator assert len(list(spy)) == 3 async def test_state_restoration() -> None: """Test that persisted processing state is properly restored on subsequent RequestList creation.""" persist_state_key = 'test_state_restoration' urls = [ 'https://restore1.placeholder.com', 'https://restore2.placeholder.com', 'https://restore3.placeholder.com', 'https://restore4.placeholder.com', ] # Create first request list and process one request request_list_1 = RequestList( urls, persist_state_key=persist_state_key, ) first_request = await request_list_1.fetch_next_request() assert first_request is not None assert first_request.url == 'https://restore1.placeholder.com' await request_list_1.mark_request_as_handled(first_request) await request_list_1._state.persist_state() # Create second request list with same persist 
key (simulating restart) request_list_2 = RequestList( urls, persist_state_key=persist_state_key, ) # Should be able to continue where the previous instance left off next_request = await request_list_2.fetch_next_request() assert next_request is not None assert next_request.url == 'https://restore2.placeholder.com' await request_list_2.mark_request_as_handled(next_request) next_request = await request_list_2.fetch_next_request() assert next_request is not None assert next_request.url == 'https://restore3.placeholder.com' await request_list_2.mark_request_as_handled(next_request) next_request = await request_list_2.fetch_next_request() assert next_request is not None assert next_request.url == 'https://restore4.placeholder.com' await request_list_2.mark_request_as_handled(next_request) async def test_requests_and_state_restoration() -> None: """Test that persisted request data and processing state is properly restored on subsequent RequestList creation.""" persist_requests_key = 'test_requests_restoration' persist_state_key = 'test_state_restoration' urls = [ 'https://restore1.placeholder.com', 'https://restore2.placeholder.com', 'https://restore3.placeholder.com', ] # Create first request list and process one request request_list_1 = RequestList( urls, persist_requests_key=persist_requests_key, persist_state_key=persist_state_key, ) first_request = await request_list_1.fetch_next_request() assert first_request is not None assert first_request.url == 'https://restore1.placeholder.com' await request_list_1.mark_request_as_handled(first_request) await request_list_1._state.persist_state() # Create second request list with same persist key (simulating restart) spy = iter(['1', '2', '3']) request_list_2 = RequestList( spy, persist_requests_key=persist_requests_key, persist_state_key=persist_state_key, ) # Should be able to fetch requests from persisted data and continue where the previous instance left off next_request = await request_list_2.fetch_next_request() assert 
next_request is not None assert next_request.url == 'https://restore2.placeholder.com' await request_list_2.mark_request_as_handled(next_request) next_request = await request_list_2.fetch_next_request() assert next_request is not None assert next_request.url == 'https://restore3.placeholder.com' await request_list_2.mark_request_as_handled(next_request) # Make sure that the second instance did not consume the input iterator assert len(list(spy)) == 3 async def test_persist_requests_key_only_persists_once() -> None: """Test that requests are only persisted once, even with multiple RequestList instances.""" persist_key = 'test_requests_once' urls = ['https://once1.placeholder.com', 'https://once2.placeholder.com'] # Create first request list request_list_1 = RequestList(urls, persist_requests_key=persist_key) await request_list_1.fetch_next_request() # Trigger persistence # Get initial persisted data kvs = await KeyValueStore.open() initial_data = await kvs.get_value(persist_key) assert initial_data is not None # Create second request list with different data different_urls = ['https://different.placeholder.com'] request_list_2 = RequestList(different_urls, persist_requests_key=persist_key) await request_list_2.fetch_next_request() # Should use persisted data, not new data # Verify the persisted data hasn't changed current_data = await kvs.get_value(persist_key) assert current_data == initial_data # The request should come from the original persisted data, not the new iterator fetched_request = await request_list_2.fetch_next_request() assert fetched_request is not None assert fetched_request.url == 'https://once2.placeholder.com' # From original data ================================================ FILE: tests/unit/request_loaders/test_sitemap_request_loader.py ================================================ import asyncio import base64 import gzip from typing import TYPE_CHECKING from yarl import URL from crawlee import RequestOptions, RequestTransformAction from 
def encode_base64(data: bytes) -> str:
    """Return the standard base64 encoding of `data` as a text string."""
    encoded = base64.b64encode(data)
    return str(encoded, 'utf-8')
sitemap_loader.is_finished() for item in items: await sitemap_loader.mark_request_as_handled(item) assert await sitemap_loader.is_empty() await asyncio.sleep(0.1) assert await sitemap_loader.is_finished() async def test_abort_sitemap_loading(server_url: URL, http_client: HttpClient) -> None: sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) sitemap_loader = SitemapRequestLoader([str(sitemap_url)], max_buffer_size=2, http_client=http_client) item = await sitemap_loader.fetch_next_request() assert item is not None await sitemap_loader.mark_request_as_handled(item) assert not await sitemap_loader.is_empty() assert not await sitemap_loader.is_finished() await sitemap_loader.abort_loading() item = await sitemap_loader.fetch_next_request() assert item is not None await sitemap_loader.mark_request_as_handled(item) assert await sitemap_loader.is_finished() async def test_create_persist_state_for_sitemap_loading( server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore ) -> None: sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) persist_key = 'create_persist_state' sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=persist_key) assert await sitemap_loader.is_finished() is False await sitemap_loader.close() state_data = await key_value_store.get_value(persist_key) assert state_data is not None assert state_data['handledCount'] == 0 async def test_data_persistence_for_sitemap_loading( server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore ) -> None: async def wait_for_sitemap_loader_not_empty(sitemap_loader: SitemapRequestLoader) -> None: while await sitemap_loader.is_empty() and not await sitemap_loader.is_finished(): # noqa: ASYNC110 await asyncio.sleep(0.1) sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) persist_key = 'data_persist_state' 
sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=persist_key) # Give time to load await asyncio.wait_for(wait_for_sitemap_loader_not_empty(sitemap_loader), timeout=2) await sitemap_loader.close() state_data = await key_value_store.get_value(persist_key) assert state_data is not None assert state_data['handledCount'] == 0 assert state_data['totalCount'] == 5 assert len(state_data['urlQueue']) == 5 async def test_recovery_data_persistence_for_sitemap_loading( server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore ) -> None: sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) persist_key = 'recovery_persist_state' sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=persist_key) item = await sitemap_loader.fetch_next_request() assert item is not None await sitemap_loader.mark_request_as_handled(item) await sitemap_loader.close() state_data = await key_value_store.get_value(persist_key) assert state_data is not None next_item_in_kvs = state_data['urlQueue'][0] sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=persist_key) item = await sitemap_loader.fetch_next_request() assert item is not None assert item.url == next_item_in_kvs async def test_transform_request_function(server_url: URL, http_client: HttpClient) -> None: sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction: user_data: dict[str, JsonSerializable] = {'transformed': True} request_options['user_data'] = user_data return request_options sitemap_loader = SitemapRequestLoader( [str(sitemap_url)], http_client=http_client, transform_request_function=transform_request, ) extracted_urls = set() while not await sitemap_loader.is_finished(): request = await 
def get_cookies_from_headers(headers: dict[str, Any]) -> dict[str, str]:
    """Extract cookies from request headers.

    Args:
        headers: Request headers keyed by lower-case header name; only the `cookie` entry is read.

    Returns:
        A mapping of cookie name to cookie value. Empty when there is no `cookie` header.
    """
    cookies: dict[str, str] = {}
    cookie_header: str = headers.get('cookie', '')

    for segment in cookie_header.split(';'):
        # Split on the first '=' only, so values that themselves contain '=' (e.g. base64 padding) survive.
        name, separator, value = segment.strip().partition('=')
        if not separator:
            # Empty segment (such as after a trailing ';') or a token without a value - nothing to record.
            continue
        cookies[name] = value

    return cookies
async def set_cookies(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Handle requests to set cookies from query parameters and redirect.

    Every non-empty query parameter becomes a `Set-Cookie: <key>=<value>; Path=/` header, and the response
    is a 302 redirect to `/cookies`.

    Args:
        scope: The ASGI connection scope; its `query_string` supplies the cookies to set.
        _receive: The ASGI receive function (unused).
        send: The ASGI send function.
    """
    query_params = get_query_params(scope.get('query_string', b''))

    headers = [
        [b'content-type', b'text/plain; charset=utf-8'],
        [b'location', b'/cookies'],  # Redirect header
    ]

    # `get_query_params` already reduces each parameter to a single string, so the value is used whole.
    # Indexing it with `[0]` would keep only its first character.
    for key, value in query_params.items():
        if value:  # Parameters with a blank value do not produce a cookie
            cookie_value = f'{key}={value}; Path=/'
            headers.append([b'set-cookie', cookie_value.encode()])

    await send(
        {
            'type': 'http.response.start',
            'status': 302,  # 302 Found for redirect
            'headers': headers,
        }
    )
    await send({'type': 'http.response.body', 'body': b'Redirecting to get_cookies...'})
async def echo_status(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the HTTP status code embedded in the request path (`/status/<code>`)."""
    requested_path = scope['path']
    status_code = int(requested_path.replace('/status/', ''))

    start_message = {
        'type': 'http.response.start',
        'status': status_code,
        'headers': [[b'content-type', b'text/plain']],
    }
    body_message = {'type': 'http.response.body', 'body': b''}

    # Response start first, then the (empty) body.
    for message in (start_message, body_message):
        await send(message)
async def get_echo(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Echo back GET request details similar to httpbin.org/get.

    The JSON response carries the parsed query arguments (`args`), the request headers (`headers`),
    the client address (`origin`) and the reconstructed request URL (`url`).

    Args:
        scope: The ASGI connection scope.
        _receive: The ASGI receive function (unused).
        send: The ASGI send function.
    """
    path = scope.get('path', '')
    query_string = scope.get('query_string', b'')
    args = get_query_params(query_string)
    headers = get_headers_dict(scope)

    # The `client` entry may be missing or explicitly None; fall back to a placeholder in both cases.
    client = scope.get('client') or ('unknown', 0)
    origin = client[0]

    host = headers.get('host', 'localhost')
    scheme = headers.get('x-forwarded-proto', 'http')
    url = f'{scheme}://{host}{path}'
    if query_string:
        # `query_string` is bytes; decode it so the URL does not end up containing a "b'...'" literal.
        url += f'?{query_string.decode()}'

    response = {
        'args': args,
        'headers': headers,
        'origin': origin,
        'url': url,
    }
    await send_json_response(send, response)
async def echo_content(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Reply with the body given in the query string, using the requested content type.

    The body comes from the `base64` parameter (decoded) when it is non-empty, otherwise from the plain
    `content` parameter. `c_type` selects the content-type header and defaults to HTML.
    """
    params = get_query_params(scope.get('query_string', b''))
    plain_text = params.get('content', '')
    encoded_payload = params.get('base64', '')
    content_type = params.get('c_type', 'text/html; charset=utf-8')

    if encoded_payload:
        body = base64.b64decode(encoded_payload)
    else:
        body = plain_text.encode()

    response_start = {
        'type': 'http.response.start',
        'status': 200,
        'headers': [[b'content-type', content_type.encode()]],
    }
    await send(response_start)
    await send({'type': 'http.response.body', 'body': body})
async def start_enqueue_non_href_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Handle requests for the `START_ENQUEUE_NON_HREF` page.

    The page template has a `{host}` placeholder, which is filled with `http://<Host header>` of the
    incoming request (falling back to `localhost` when the header is absent).

    Args:
        _scope: The ASGI connection scope; read only for the request headers.
        _receive: The ASGI receive function (unused).
        send: The ASGI send function.
    """
    host = f'http://{get_headers_dict(_scope).get("host", "localhost")}'
    content = START_ENQUEUE_NON_HREF.format(host=host).encode()
    await send_html_response(
        send,
        content,
    )
def serve_in_thread(server: TestServer) -> Iterator[TestServer]:
    """Run a server in a background thread and yield it.

    The server is yielded only once it reports `started`. When the generator is closed or exhausted,
    the server is asked to exit and the thread is joined.

    Args:
        server: The server to run; its `run` method is the thread target.

    Yields:
        The same `server` instance, once it has started.

    Raises:
        RuntimeError: If the server thread terminates before the server has started.
    """
    thread = threading.Thread(target=server.run)
    thread.start()
    try:
        while not server.started:
            # Without this check a server that fails during startup would leave this loop spinning forever.
            if not thread.is_alive():
                raise RuntimeError('Test server thread exited before the server finished starting.')
            time.sleep(1e-3)
        yield server
    finally:
        server.should_exit = True
        thread.join()
  • """ GENERIC_RESPONSE = b"""\ Hello Insightful content """ ROBOTS_TXT = b"""\ User-agent: * Disallow: *deny_all/ Disallow: /page_ crawl-delay: 10 User-agent: Googlebot Disallow: *deny_googlebot/ crawl-delay: 1 user-agent: Mozilla crawl-delay: 2 sitemap: http://not-exists.com/sitemap_1.xml sitemap: http://not-exists.com/sitemap_2.xml""" INFINITE_SCROLL = b"""\
    """ RESOURCE_LOADING_PAGE = b"""\ """ ================================================ FILE: tests/unit/server_static/test.js ================================================ ================================================ FILE: tests/unit/sessions/test_cookies.py ================================================ from __future__ import annotations import pytest from crawlee.sessions._cookies import CookieParam, PlaywrightCookieParam, SessionCookies @pytest.fixture def cookie_dict() -> CookieParam: return CookieParam( { 'name': 'test_cookie', 'value': 'test_value', 'domain': 'example.com', 'path': '/test', 'expires': 1735689600, 'http_only': True, 'secure': True, 'same_site': 'Strict', } ) @pytest.fixture def session_cookies(cookie_dict: CookieParam) -> SessionCookies: session_cookies = SessionCookies() session_cookies.set(**cookie_dict) return session_cookies def test_set_basic_cookie() -> None: """Test setting a basic cookie with minimal attributes.""" session_cookies = SessionCookies() session_cookies.set('test', 'value') cookies = list(session_cookies.jar) assert len(cookies) == 1 cookie = cookies[0] assert cookie.name == 'test' assert cookie.value == 'value' assert cookie.path == '/' assert not cookie.secure assert not cookie.has_nonstandard_attr('httpOnpy') def test_set_cookie_with_all_attributes(session_cookies: SessionCookies, cookie_dict: CookieParam) -> None: """Test setting a cookie with all available attributes.""" cookies = list(session_cookies.jar) assert len(cookies) == 1 cookie = cookies[0] assert cookie.name == cookie_dict.get('name') assert cookie.value == cookie_dict.get('value') assert cookie.path == cookie_dict.get('path') assert cookie.domain == cookie_dict.get('domain') assert cookie.expires == cookie_dict.get('expires') assert cookie.has_nonstandard_attr('HttpOnly') assert cookie.secure assert cookie.get_nonstandard_attr('SameSite') == 'Strict' def test_convert_cookie_to_dict(session_cookies: SessionCookies, cookie_dict: CookieParam) -> 
None: """Test converting Cookie object to dictionary representation.""" cookies = list(session_cookies.jar) assert len(cookies) == 1 cookie = cookies[0] converted_cookie_dict = session_cookies._convert_cookie_to_dict(cookie) assert converted_cookie_dict == cookie_dict def test_convert_dict_format(session_cookies: SessionCookies) -> None: """Test normalizing cookie attributes between internal and browser formats.""" internal_format = CookieParam({'name': 'test', 'value': 'value', 'http_only': True, 'same_site': 'Lax'}) # Test internal to browser format browser_format = session_cookies._to_playwright(internal_format) assert 'httpOnly' in browser_format assert 'sameSite' in browser_format assert 'http_only' not in browser_format assert 'same_site' not in browser_format # Test browser to internal format browser_format = PlaywrightCookieParam({'name': 'test', 'value': 'value', 'httpOnly': True, 'sameSite': 'Lax'}) internal_format = session_cookies._from_playwright(browser_format) assert 'http_only' in internal_format assert 'same_site' in internal_format assert 'httpOnly' not in internal_format assert 'sameSite' not in internal_format def test_get_cookies_as_browser_format(session_cookies: SessionCookies, cookie_dict: CookieParam) -> None: """Test getting cookies in browser-compatible format.""" browser_cookies = session_cookies.get_cookies_as_playwright_format() assert len(browser_cookies) == 1 cookie = browser_cookies[0] assert 'httpOnly' in cookie assert 'sameSite' in cookie assert cookie['httpOnly'] == cookie_dict.get('http_only') assert cookie['sameSite'] == cookie_dict.get('same_site') def test_get_cookies_as_dicts(session_cookies: SessionCookies, cookie_dict: CookieParam) -> None: """Test get list of dictionary from a SessionCookies.""" test_session_cookies = session_cookies.get_cookies_as_dicts() assert [cookie_dict] == test_session_cookies def test_store_cookie(session_cookies: SessionCookies) -> None: """Test storing a Cookie object directly.""" 
def test_store_multidomain_cookies() -> None:
    """Cookies that share a name but belong to different domains must be stored side by side."""
    jar = SessionCookies()
    jar.set(name='a', value='1', domain='test.io')
    jar.set(name='a', value='2', domain='notest.io')

    # Index the stored cookies by domain so that each one can be checked individually.
    by_domain = {}
    for stored in jar.get_cookies_as_dicts():
        by_domain[stored.get('domain')] = (stored['name'], stored['value'])

    assert len(by_domain) == 2
    assert by_domain['test.io'] == ('a', '1')
    assert by_domain['notest.io'] == ('a', '2')
def test_session_model(
    session_direct: SessionModel,
    session_args_camel: dict,
    session_args_snake: dict,
) -> None:
    """SessionModel built directly, from camelCase kwargs and from snake_case kwargs must be equivalent."""
    from_camel = SessionModel(**session_args_camel)
    from_snake = SessionModel(**session_args_snake)

    # Whole-model equality across the three construction paths
    assert session_direct == from_camel
    assert from_camel == from_snake

    # Identifier is carried over unchanged
    assert session_direct.id == from_camel.id
    assert from_camel.id == from_snake.id
    assert from_snake.id == 'test_session'

    # The dict fixtures pass max_age as the string '00:30:00'; it must end up as a timedelta
    assert session_direct.max_age == from_camel.max_age
    assert from_camel.max_age == from_snake.max_age
    assert from_snake.max_age == timedelta(minutes=30)
def test_multiple_marks(session: Session) -> None:
    """Test the mark_good and mark_bad methods in sequence.

    The error score is compared against its own starting value (not the usage count),
    so the assertions stay meaningful even if the fixture's initial counters differ.
    """
    initial_error_score = session.error_score

    # Two bad marks are expected to raise the error score by 2.
    session.mark_bad()
    session.mark_bad()
    assert session.error_score == initial_error_score + 2

    # Two good marks are expected to lower it by 1 in total
    # (the `session` fixture is created with error_score_decrement=0.5).
    session.mark_good()
    session.mark_good()
    assert session.error_score == initial_error_score + 1

    # Two more bad marks reach the fixture's max_error_score of 3.0; the session is
    # expected to stay blocked even after a following good mark.
    session.mark_bad()
    session.mark_bad()
    session.mark_good()
    assert session.is_blocked
    assert not session.is_usable
def test_not_retire_on_not_block_status_code(session: Session) -> None:
    """A status code that is not in the session's blocked list must not be reported as blocked."""
    ok_status = 200
    assert session.is_blocked_status_code(status_code=ok_status) is False
async def test_add_session(session_pool: SessionPool) -> None:
    """Adding new sessions to a pool must raise the total and usable counts accordingly."""
    extra_sessions = [Session(id='test_session_01'), Session(id='test_session_02')]
    for extra in extra_sessions:
        session_pool.add_session(session=extra)

    # Both added sessions are expected on top of the MAX_POOL_SIZE sessions already counted.
    expected_total = MAX_POOL_SIZE + 2
    assert session_pool.session_count == expected_total
    assert session_pool.usable_session_count == expected_total
    assert session_pool.retired_session_count == 0
async def test_create_session_function() -> None:
    """Sessions handed out by the pool must come from the supplied `create_session_function`."""
    marker = {'created_by': 'test_create_session_function'}

    def make_session() -> Session:
        # Factory passed to the pool; tags every session with the marker user data.
        return Session(user_data=marker)

    async with SessionPool(
        max_pool_size=MAX_POOL_SIZE,
        persistence_enabled=False,
        create_session_function=make_session,
    ) as pool:
        obtained = await pool.get_session()
        assert obtained is not None
        assert obtained.user_data == marker
async def test_methods_raise_error_when_not_active() -> None:
    """Methods of an inactive SessionPool must raise, and entering the pool twice must be rejected."""
    not_active = r'SessionPool is not active.'
    already_active = r'SessionPool is already active.'

    probe_session = Session()
    pool = SessionPool()

    assert pool.active is False

    with pytest.raises(RuntimeError, match=not_active):
        pool.get_state(as_dict=True)

    with pytest.raises(RuntimeError, match=not_active):
        pool.add_session(probe_session)

    with pytest.raises(RuntimeError, match=not_active):
        await pool.get_session()

    with pytest.raises(RuntimeError, match=not_active):
        await pool.get_session_by_id(probe_session.id)

    # Called outside of any `pytest.raises` block, i.e. it is expected not to raise on an inactive pool.
    await pool.reset_store()

    # Entering the same pool twice in one `async with` must fail on the second entry.
    with pytest.raises(RuntimeError, match=already_active):
        async with pool, pool:
            pass

    async with pool:
        assert pool.active is True
async def test_file_and_directory_creation(configuration: Configuration) -> None:
    """Creating a file system dataset client must produce the dataset directory and its metadata file."""
    storage = FileSystemStorageClient()
    client = await storage.create_dataset_client(name='new-dataset', configuration=configuration)

    # Both the dataset path and the metadata path must exist on disk
    assert client.path_to_dataset.exists()
    assert client.path_to_metadata.exists()

    # The metadata written to disk must agree with what the client reports
    with client.path_to_metadata.open() as metadata_file:
        on_disk = json.load(metadata_file)
        reported = await client.get_metadata()
        assert on_disk['id'] == reported.id
        assert on_disk['name'] == 'new-dataset'
        assert on_disk['item_count'] == 0

    await client.drop()
async def test_drop_removes_files_from_disk(dataset_client: FileSystemDatasetClient) -> None:
    """Dropping a dataset must remove its directory from disk."""
    await dataset_client.push_data({'test': 'data'})

    existed_before_drop = dataset_client.path_to_dataset.exists()
    assert existed_before_drop

    # Drop the dataset and re-check the same path
    await dataset_client.drop()

    exists_after_drop = dataset_client.path_to_dataset.exists()
    assert not exists_after_drop
async def test_data_persistence_across_reopens() -> None:
    """Data pushed to a dataset must still be readable after reopening the dataset by its ID."""
    storage = FileSystemStorageClient()
    pushed_item = {'test_item': 'test_value', 'id': 123}

    # First handle: create the dataset by name, store one item and remember the dataset ID
    first_handle = await storage.create_dataset_client(name='persistence-test')
    await first_handle.push_data(pushed_item)
    first_metadata = await first_handle.get_metadata()
    dataset_id = first_metadata.id

    # Second handle: open by ID only and read the data back
    second_handle = await storage.create_dataset_client(id=dataset_id)
    page = await second_handle.get_data()
    assert len(page.items) == 1
    assert page.items[0] == pushed_item

    await second_handle.drop()
async def test_value_file_creation_and_content(kvs_client: FileSystemKeyValueStoreClient) -> None:
    """Setting a value must create a value file and a per-record metadata file with matching content."""
    record_key = 'test-key'
    record_value = 'Hello, world!'
    await kvs_client.set_value(key=record_key, value=record_value)

    # Both the value file and its metadata sidecar must exist
    value_file = kvs_client.path_to_kvs / record_key
    sidecar_file = kvs_client.path_to_kvs / f'{record_key}.{METADATA_FILENAME}'
    assert value_file.exists()
    assert sidecar_file.exists()

    # The value file holds the raw text
    assert value_file.read_text(encoding='utf-8') == record_value

    # The sidecar describes the record: key, content type and size in bytes
    with sidecar_file.open() as sidecar:
        record_metadata = json.load(sidecar)
        expected_size = len(record_value.encode('utf-8'))
        assert record_metadata['key'] == record_key
        assert record_metadata['content_type'] == 'text/plain; charset=utf-8'
        assert record_metadata['size'] == expected_size
async def test_file_deletion_on_value_delete(kvs_client: FileSystemKeyValueStoreClient) -> None:
    """Deleting a value must remove both its value file and its metadata file from disk."""
    record_key = 'test-delete'
    await kvs_client.set_value(key=record_key, value='Delete me')

    # Files that are expected to back the record
    value_file = kvs_client.path_to_kvs / record_key
    sidecar_file = kvs_client.path_to_kvs / f'{record_key}.{METADATA_FILENAME}'

    # Present after the write ...
    assert value_file.exists()
    assert sidecar_file.exists()

    await kvs_client.delete_value(key=record_key)

    # ... and gone after the delete
    assert not value_file.exists()
    assert not sidecar_file.exists()
async def test_data_persistence_across_reopens(configuration: Configuration) -> None:
    """A value stored in a KVS must still be readable after reopening the store by its ID."""
    storage = FileSystemStorageClient()
    stored_key = 'persistent-key'
    stored_value = 'persistent-value'

    # First handle: create the store by name, write one record and remember the store ID
    first_handle = await storage.create_kvs_client(name='persistence-test', configuration=configuration)
    await first_handle.set_value(key=stored_key, value=stored_value)
    first_metadata = await first_handle.get_metadata()
    kvs_id = first_metadata.id

    # Second handle: open by ID only (no explicit configuration) and read the record back
    second_handle = await storage.create_kvs_client(id=kvs_id)
    fetched = await second_handle.get_value(key=stored_key)
    assert fetched is not None
    assert fetched.value == stored_value

    await second_handle.drop()
async def test_opening_rq_does_not_have_side_effect_on_service_locator(configuration: Configuration) -> None:
    """Opening request queue client should not cause setting storage client in the global service locator."""
    await FileSystemStorageClient().create_rq_client(name='test_request_queue', configuration=configuration)

    # Set some specific storage client in the service locator. There should be no `ServiceConflictError`.
    service_locator.set_storage_client(MemoryStorageClient())
async def test_get_request_does_not_mark_in_progress(rq_client: FileSystemRequestQueueClient) -> None:
    """Looking a request up with get_request must not prevent fetch_next_request from returning it."""
    queued = Request.from_url('https://example.com/blocked')
    await rq_client.add_batch_of_requests([queued])
    expected_key = queued.unique_key

    # Plain lookup by unique key
    looked_up = await rq_client.get_request(queued.unique_key)
    assert looked_up is not None
    assert looked_up.unique_key == expected_key

    # The very same request must still be handed out by the queue afterwards
    handed_out = await rq_client.fetch_next_request()
    assert handed_out is not None
    assert handed_out.unique_key == expected_key
async def test_memory_specific_purge_behavior() -> None:
    """A memory dataset opened again under the same name must not expose previously pushed data."""
    dataset_name = 'test-purge-dataset'

    # First client: push one item and confirm it is visible
    first_client = await MemoryStorageClient().create_dataset_client(name=dataset_name)
    await first_client.push_data({'item': 'initial data'})
    first_page = await first_client.get_data()
    assert len(first_page.items) == 1

    # Second client: created through a new MemoryStorageClient() under the same dataset name
    second_client = await MemoryStorageClient().create_dataset_client(name=dataset_name)

    # The item pushed through the first client must not be there (memory storage specific behavior)
    second_page = await second_client.get_data()
    assert len(second_page.items) == 0
metadata.modified_at > initial_modified assert metadata.accessed_at > accessed_after_read ================================================ FILE: tests/unit/storage_clients/_memory/test_memory_kvs_client.py ================================================ from __future__ import annotations import asyncio from typing import TYPE_CHECKING import pytest from crawlee.storage_clients import MemoryStorageClient if TYPE_CHECKING: from collections.abc import AsyncGenerator from crawlee.storage_clients._memory import MemoryKeyValueStoreClient @pytest.fixture async def kvs_client() -> AsyncGenerator[MemoryKeyValueStoreClient, None]: """Fixture that provides a fresh memory key-value store client for each test.""" client = await MemoryStorageClient().create_kvs_client(name='test-kvs') yield client await client.drop() async def test_memory_specific_purge_behavior() -> None: """Test memory-specific purge behavior and in-memory storage characteristics.""" # Create KVS and add data kvs_client1 = await MemoryStorageClient().create_kvs_client( name='test-purge-kvs', ) await kvs_client1.set_value(key='test-key', value='initial value') # Verify value was set record = await kvs_client1.get_value(key='test-key') assert record is not None assert record.value == 'initial value' # Reopen with same storage client instance kvs_client2 = await MemoryStorageClient().create_kvs_client( name='test-purge-kvs', ) # Verify value was purged (memory storage specific behavior) record = await kvs_client2.get_value(key='test-key') assert record is None async def test_memory_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> None: """Test that metadata timestamps are updated correctly in memory storage.""" # Record initial timestamps metadata = await kvs_client.get_metadata() initial_created = metadata.created_at initial_accessed = metadata.accessed_at initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a read operation await 
kvs_client.get_value(key='nonexistent') # Verify timestamps (memory-specific behavior) metadata = await kvs_client.get_metadata() assert metadata.created_at == initial_created assert metadata.accessed_at > initial_accessed assert metadata.modified_at == initial_modified accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a write operation await kvs_client.set_value(key='test', value='test-value') # Verify timestamps were updated metadata = await kvs_client.get_metadata() assert metadata.created_at == initial_created assert metadata.modified_at > initial_modified assert metadata.accessed_at > accessed_after_read ================================================ FILE: tests/unit/storage_clients/_memory/test_memory_rq_client.py ================================================ from __future__ import annotations import asyncio from typing import TYPE_CHECKING import pytest from crawlee import Request from crawlee.storage_clients import MemoryStorageClient if TYPE_CHECKING: from collections.abc import AsyncGenerator from crawlee.storage_clients._memory import MemoryRequestQueueClient @pytest.fixture async def rq_client() -> AsyncGenerator[MemoryRequestQueueClient, None]: """Fixture that provides a fresh memory request queue client for each test.""" client = await MemoryStorageClient().create_rq_client(name='test-rq') yield client await client.drop() async def test_memory_specific_purge_behavior() -> None: """Test memory-specific purge behavior and in-memory storage characteristics.""" # Create RQ and add data rq_client1 = await MemoryStorageClient().create_rq_client( name='test-purge-rq', ) request = Request.from_url(url='https://example.com/initial') await rq_client1.add_batch_of_requests([request]) # Verify request was added assert await rq_client1.is_empty() is False # Reopen with same storage client instance rq_client2 = await MemoryStorageClient().create_rq_client( name='test-purge-rq', ) # 
Verify queue was purged (memory storage specific behavior) assert await rq_client2.is_empty() is True async def test_memory_metadata_updates(rq_client: MemoryRequestQueueClient) -> None: """Test that metadata timestamps are updated correctly in memory storage.""" # Record initial timestamps metadata = await rq_client.get_metadata() initial_created = metadata.created_at initial_accessed = metadata.accessed_at initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a read operation await rq_client.is_empty() # Verify timestamps (memory-specific behavior) metadata = await rq_client.get_metadata() assert metadata.created_at == initial_created assert metadata.accessed_at > initial_accessed assert metadata.modified_at == initial_modified accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a write operation await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) # Verify timestamps were updated metadata = await rq_client.get_metadata() assert metadata.created_at == initial_created assert metadata.modified_at > initial_modified assert metadata.accessed_at > accessed_after_read ================================================ FILE: tests/unit/storage_clients/_redis/test_redis_dataset_client.py ================================================ from __future__ import annotations import asyncio from typing import TYPE_CHECKING import pytest from crawlee.storage_clients import RedisStorageClient from crawlee.storage_clients._redis._utils import await_redis_response if TYPE_CHECKING: from collections.abc import AsyncGenerator from fakeredis import FakeAsyncRedis from crawlee.storage_clients._redis import RedisDatasetClient @pytest.fixture async def dataset_client( redis_client: FakeAsyncRedis, suppress_user_warning: None, # noqa: ARG001 ) -> AsyncGenerator[RedisDatasetClient, None]: """A fixture for a Redis dataset 
client.""" client = await RedisStorageClient(redis=redis_client).create_dataset_client( name='test_dataset', ) yield client await client.drop() async def test_base_keys_creation(dataset_client: RedisDatasetClient) -> None: """Test that Redis dataset client creates proper keys.""" metadata = await dataset_client.get_metadata() name = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id)) assert name is not None assert (name.decode() if isinstance(name, bytes) else name) == 'test_dataset' dataset_id = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset')) assert dataset_id is not None assert (dataset_id.decode() if isinstance(dataset_id, bytes) else dataset_id) == metadata.id items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$')) assert items is not None assert len(items) == 0 metadata_data = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:metadata')) assert isinstance(metadata_data, dict) assert metadata_data['id'] == metadata.id async def test_record_and_content_verification(dataset_client: RedisDatasetClient) -> None: """Test that data is properly persisted to Redis with correct content.""" item = {'key': 'value', 'number': 42} await dataset_client.push_data(item) # Verify metadata record metadata = await dataset_client.get_metadata() assert metadata.item_count == 1 assert metadata.created_at is not None assert metadata.modified_at is not None assert metadata.accessed_at is not None # Verify records in Redis all_items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$')) assert all_items is not None assert len(all_items) == 1 # Verify actual file content assert all_items[0] == item # Test multiple records items = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}] await dataset_client.push_data(items) all_items = await 
await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$')) assert all_items is not None assert len(all_items) == 4 async def test_drop_removes_records(dataset_client: RedisDatasetClient) -> None: """Test that dropping a dataset removes all records from Redis.""" await dataset_client.push_data({'test': 'data'}) metadata = await dataset_client.get_metadata() name = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id)) dataset_id = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset')) items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$')) assert name is not None assert (name.decode() if isinstance(name, bytes) else name) == 'test_dataset' assert dataset_id is not None assert (dataset_id.decode() if isinstance(dataset_id, bytes) else dataset_id) == metadata.id assert items is not None assert len(items) == 1 # Drop the dataset await dataset_client.drop() # Verify removal of all records name_after_drop = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id)) dataset_id_after_drop = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset')) items_after_drop = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$')) assert name_after_drop is None assert dataset_id_after_drop is None assert items_after_drop is None async def test_metadata_record_updates(dataset_client: RedisDatasetClient) -> None: """Test that metadata record is updated correctly after operations.""" # Record initial timestamps metadata = await dataset_client.get_metadata() initial_created = metadata.created_at initial_accessed = metadata.accessed_at initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform an operation that updates accessed_at await dataset_client.get_data() # Verify 
timestamps metadata = await dataset_client.get_metadata() assert metadata.created_at == initial_created assert metadata.accessed_at > initial_accessed assert metadata.modified_at == initial_modified accessed_after_get = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform an operation that updates modified_at await dataset_client.push_data({'new': 'item'}) # Verify timestamps again metadata = await dataset_client.get_metadata() assert metadata.created_at == initial_created assert metadata.modified_at > initial_modified assert metadata.accessed_at > accessed_after_get ================================================ FILE: tests/unit/storage_clients/_redis/test_redis_kvs_client.py ================================================ from __future__ import annotations import asyncio import json from typing import TYPE_CHECKING import pytest from crawlee.storage_clients import RedisStorageClient from crawlee.storage_clients._redis._utils import await_redis_response if TYPE_CHECKING: from collections.abc import AsyncGenerator from fakeredis import FakeAsyncRedis from crawlee.storage_clients._redis import RedisKeyValueStoreClient @pytest.fixture async def kvs_client( redis_client: FakeAsyncRedis, suppress_user_warning: None, # noqa: ARG001 ) -> AsyncGenerator[RedisKeyValueStoreClient, None]: """A fixture for a Redis KVS client.""" client = await RedisStorageClient(redis=redis_client).create_kvs_client( name='test_kvs', ) yield client await client.drop() async def test_base_keys_creation(kvs_client: RedisKeyValueStoreClient) -> None: """Test that Redis KVS client creates proper keys.""" metadata = await kvs_client.get_metadata() name = await await_redis_response(kvs_client.redis.hget('key_value_stores:id_to_name', metadata.id)) assert name is not None assert (name.decode() if isinstance(name, bytes) else name) == 'test_kvs' kvs_id = await await_redis_response(kvs_client.redis.hget('key_value_stores:name_to_id', 'test_kvs')) 
assert kvs_id is not None assert (kvs_id.decode() if isinstance(kvs_id, bytes) else kvs_id) == metadata.id metadata_data = await await_redis_response(kvs_client.redis.json().get('key_value_stores:test_kvs:metadata')) assert isinstance(metadata_data, dict) assert metadata_data['id'] == metadata.id async def test_value_record_creation_and_content(kvs_client: RedisKeyValueStoreClient) -> None: """Test that values are properly persisted to records with correct content and metadata.""" test_key = 'test-key' test_value = 'Hello, world!' await kvs_client.set_value(key=test_key, value=test_value) # Check if the records were created records_key = 'key_value_stores:test_kvs:items' records_items_metadata = 'key_value_stores:test_kvs:metadata_items' record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key)) metadata_exists = await await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key)) assert record_exists is True assert metadata_exists is True # Check record content content = await await_redis_response(kvs_client.redis.hget(records_key, test_key)) content = content.decode() if isinstance(content, bytes) else content assert content == test_value # Check record metadata record_metadata = await await_redis_response(kvs_client.redis.hget(records_items_metadata, test_key)) assert record_metadata is not None assert isinstance(record_metadata, (str, bytes)) metadata = json.loads(record_metadata) # Check record metadata assert metadata['key'] == test_key assert metadata['content_type'] == 'text/plain; charset=utf-8' assert metadata['size'] == len(test_value.encode('utf-8')) # Verify retrieval works correctly check_value = await kvs_client.get_value(key=test_key) assert check_value is not None assert check_value.value == test_value async def test_binary_data_persistence(kvs_client: RedisKeyValueStoreClient) -> None: """Test that binary data is stored correctly without corruption.""" test_key = 'test-binary' test_value = 
b'\x00\x01\x02\x03\x04' records_key = 'key_value_stores:test_kvs:items' records_items_metadata = 'key_value_stores:test_kvs:metadata_items' await kvs_client.set_value(key=test_key, value=test_value) # Verify binary file exists record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key)) metadata_exists = await await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key)) assert record_exists is True assert metadata_exists is True # Verify binary content is preserved content = await await_redis_response(kvs_client.redis.hget(records_key, test_key)) assert content == test_value # Verify retrieval works correctly record = await kvs_client.get_value(key=test_key) assert record is not None assert record.value == test_value assert record.content_type == 'application/octet-stream' async def test_json_serialization_to_record(kvs_client: RedisKeyValueStoreClient) -> None: """Test that JSON objects are properly serialized to records.""" test_key = 'test-json' test_value = {'name': 'John', 'age': 30, 'items': [1, 2, 3]} await kvs_client.set_value(key=test_key, value=test_value) # Check if record content is valid JSON records_key = 'key_value_stores:test_kvs:items' record = await await_redis_response(kvs_client.redis.hget(records_key, test_key)) assert record is not None assert isinstance(record, (str, bytes)) assert json.loads(record) == test_value async def test_records_deletion_on_value_delete(kvs_client: RedisKeyValueStoreClient) -> None: """Test that deleting a value removes its records from Redis.""" test_key = 'test-delete' test_value = 'Delete me' records_key = 'key_value_stores:test_kvs:items' records_items_metadata = 'key_value_stores:test_kvs:metadata_items' # Set a value await kvs_client.set_value(key=test_key, value=test_value) # Verify records exist record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key)) metadata_exists = await 
await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key)) assert record_exists is True assert metadata_exists is True # Delete the value await kvs_client.delete_value(key=test_key) # Verify records were deleted record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key)) metadata_exists = await await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key)) assert record_exists is False assert metadata_exists is False async def test_drop_removes_keys(kvs_client: RedisKeyValueStoreClient) -> None: """Test that drop removes the store's index entries and its item and item-metadata hashes from Redis.""" await kvs_client.set_value(key='test', value='test-value') metadata = await kvs_client.get_metadata() name = await await_redis_response(kvs_client.redis.hget('key_value_stores:id_to_name', metadata.id)) kvs_id = await await_redis_response(kvs_client.redis.hget('key_value_stores:name_to_id', 'test_kvs')) items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:items')) metadata_items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:metadata_items')) assert name is not None assert (name.decode() if isinstance(name, bytes) else name) == 'test_kvs' assert kvs_id is not None assert (kvs_id.decode() if isinstance(kvs_id, bytes) else kvs_id) == metadata.id assert items is not None assert items != {} assert metadata_items is not None assert metadata_items != {} # Drop the store await kvs_client.drop() name = await await_redis_response(kvs_client.redis.hget('key_value_stores:id_to_name', metadata.id)) kvs_id = await await_redis_response(kvs_client.redis.hget('key_value_stores:name_to_id', 'test_kvs')) items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:items')) metadata_items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:metadata_items')) assert name is None assert kvs_id is None assert items == {} assert
metadata_items == {} async def test_metadata_record_updates(kvs_client: RedisKeyValueStoreClient) -> None: """Test that read/write operations properly update metadata file timestamps.""" # Record initial timestamps metadata = await kvs_client.get_metadata() initial_created = metadata.created_at initial_accessed = metadata.accessed_at initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a read operation await kvs_client.get_value(key='nonexistent') # Verify accessed timestamp was updated metadata = await kvs_client.get_metadata() assert metadata.created_at == initial_created assert metadata.accessed_at > initial_accessed assert metadata.modified_at == initial_modified accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a write operation await kvs_client.set_value(key='test', value='test-value') # Verify modified timestamp was updated metadata = await kvs_client.get_metadata() assert metadata.created_at == initial_created assert metadata.modified_at > initial_modified assert metadata.accessed_at > accessed_after_read ================================================ FILE: tests/unit/storage_clients/_redis/test_redis_rq_client.py ================================================ from __future__ import annotations import asyncio import json from typing import TYPE_CHECKING import pytest from crawlee import Request from crawlee.storage_clients import RedisStorageClient from crawlee.storage_clients._redis._utils import await_redis_response if TYPE_CHECKING: from collections.abc import AsyncGenerator from fakeredis import FakeAsyncRedis from crawlee.storage_clients._redis import RedisRequestQueueClient @pytest.fixture(params=['default', 'bloom']) async def rq_client( redis_client: FakeAsyncRedis, request: pytest.FixtureRequest, suppress_user_warning: None, # noqa: ARG001 ) -> AsyncGenerator[RedisRequestQueueClient, None]: """A 
fixture for a Redis RQ client.""" client = await RedisStorageClient(redis=redis_client, queue_dedup_strategy=request.param).create_rq_client( name='test_request_queue' ) yield client await client.drop() async def test_base_keys_creation(rq_client: RedisRequestQueueClient) -> None: """Test that Redis RQ client creates proper keys.""" metadata = await rq_client.get_metadata() name = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id)) assert name is not None assert (name.decode() if isinstance(name, bytes) else name) == 'test_request_queue' kvs_id = await await_redis_response(rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue')) assert kvs_id is not None assert (kvs_id.decode() if isinstance(kvs_id, bytes) else kvs_id) == metadata.id if rq_client._dedup_strategy == 'bloom': added_bf = await await_redis_response( rq_client.redis.exists('request_queues:test_request_queue:added_bloom_filter') ) assert added_bf == 1 handled_bf = await await_redis_response( rq_client.redis.exists('request_queues:test_request_queue:handled_bloom_filter') ) assert handled_bf == 1 metadata_data = await await_redis_response(rq_client.redis.json().get('request_queues:test_request_queue:metadata')) assert isinstance(metadata_data, dict) assert metadata_data['id'] == metadata.id async def test_request_records_persistence(rq_client: RedisRequestQueueClient) -> None: """Test that requests are properly persisted to Redis.""" requests = [ Request.from_url('https://example.com/1'), Request.from_url('https://example.com/2'), Request.from_url('https://example.com/3'), ] await rq_client.add_batch_of_requests(requests) # Verify request records are created request_queue_response = await await_redis_response( rq_client.redis.lmpop(1, 'request_queues:test_request_queue:queue', direction='left', count=10) ) assert request_queue_response is not None assert isinstance(request_queue_response, list) request_keys = request_queue_response[1] assert 
isinstance(request_keys, list) assert len(request_keys) == 3 # Verify actual request file content requests_records_data = await await_redis_response( rq_client.redis.hgetall('request_queues:test_request_queue:data') ) assert isinstance(requests_records_data, dict) for key in request_keys: request_data = json.loads(requests_records_data[key]) # ty: ignore[invalid-argument-type] assert 'url' in request_data assert request_data['url'].startswith('https://example.com/') async def test_drop_removes_records(rq_client: RedisRequestQueueClient) -> None: """Test that drop removes all request records from Redis.""" await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) rq_queue = 'request_queues:test_request_queue:queue' rq_data = 'request_queues:test_request_queue:data' added_bf = 'request_queues:test_request_queue:added_bloom_filter' handled_bf = 'request_queues:test_request_queue:handled_bloom_filter' pending_set = 'request_queues:test_request_queue:pending_set' handled_set = 'request_queues:test_request_queue:handled_set' metadata_key = 'request_queues:test_request_queue:metadata' metadata = await rq_client.get_metadata() name = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id)) assert name is not None assert (name.decode() if isinstance(name, bytes) else name) == 'test_request_queue' rq_id = await await_redis_response(rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue')) assert rq_id is not None assert rq_id.decode() if isinstance(rq_id, bytes) else rq_id rq_queue_exists = await await_redis_response(rq_client.redis.exists(rq_queue)) rq_data_exists = await await_redis_response(rq_client.redis.exists(rq_data)) metadata_exists = await await_redis_response(rq_client.redis.exists(metadata_key)) assert rq_queue_exists == 1 assert rq_data_exists == 1 assert metadata_exists == 1 if rq_client._dedup_strategy == 'bloom': added_bf_exists = await 
await_redis_response(rq_client.redis.exists(added_bf)) handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf)) assert added_bf_exists == 1 assert handled_bf_exists == 1 elif rq_client._dedup_strategy == 'default': pending_set_exists = await await_redis_response(rq_client.redis.exists(pending_set)) handled_set_exists = await await_redis_response(rq_client.redis.exists(handled_set)) assert pending_set_exists == 1 # No requests marked as handled assert handled_set_exists == 0 # Drop the request queue await rq_client.drop() # Verify removal of all records name_after_drop = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id)) rq_id_after_drop = await await_redis_response( rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue') ) rq_queue_exists = await await_redis_response(rq_client.redis.exists(rq_queue)) rq_data_exists = await await_redis_response(rq_client.redis.exists(rq_data)) metadata_exists = await await_redis_response(rq_client.redis.exists(metadata_key)) assert name_after_drop is None assert rq_id_after_drop is None assert rq_queue_exists == 0 assert rq_data_exists == 0 assert metadata_exists == 0 if rq_client._dedup_strategy == 'bloom': added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf)) handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf)) assert added_bf_exists == 0 assert handled_bf_exists == 0 elif rq_client._dedup_strategy == 'default': pending_set_exists = await await_redis_response(rq_client.redis.exists(pending_set)) handled_set_exists = await await_redis_response(rq_client.redis.exists(handled_set)) assert pending_set_exists == 0 assert handled_set_exists == 0 async def test_metadata_file_updates(rq_client: RedisRequestQueueClient) -> None: """Test that metadata file is updated correctly after operations.""" # Record initial timestamps metadata = await rq_client.get_metadata() initial_created = metadata.created_at 
initial_accessed = metadata.accessed_at initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a read operation await rq_client.is_empty() # Verify accessed timestamp was updated metadata = await rq_client.get_metadata() assert metadata.created_at == initial_created assert metadata.accessed_at > initial_accessed assert metadata.modified_at == initial_modified accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a write operation await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) # Verify modified timestamp was updated metadata = await rq_client.get_metadata() assert metadata.created_at == initial_created assert metadata.modified_at > initial_modified assert metadata.accessed_at > accessed_after_read async def test_get_request(rq_client: RedisRequestQueueClient) -> None: """Test that get_request works correctly.""" requests = [ Request.from_url('https://example.com/1'), Request.from_url('https://example.com/2'), Request.from_url('https://example.com/3'), ] added_requests = await rq_client.add_batch_of_requests(requests) assert len(added_requests.processed_requests) == 3 for req in requests: fetched_request = await rq_client.get_request(req.unique_key) assert fetched_request is not None assert fetched_request.unique_key == req.unique_key assert fetched_request.url == req.url # Test fetching a non-existent request non_existent = await rq_client.get_request('non-existent-id') assert non_existent is None async def test_deduplication(rq_client: RedisRequestQueueClient) -> None: """Test that request deduplication works correctly.""" requests = [ Request.from_url('https://example.com/1'), Request.from_url('https://example.com/1'), Request.from_url('https://example.com/3'), ] await rq_client.add_batch_of_requests(requests) # Verify only unique requests are added metadata = await rq_client.get_metadata() 
assert metadata.pending_request_count == 2 assert metadata.total_request_count == 2 # Fetch requests and verify order request1 = await rq_client.fetch_next_request() assert request1 is not None assert request1 == requests[0] # Fetch the next request, which should skip the duplicate request2 = await rq_client.fetch_next_request() assert request2 is not None assert request2 == requests[2] # Verify no more requests are available request3 = await rq_client.fetch_next_request() assert request3 is None ================================================ FILE: tests/unit/storage_clients/_sql/test_sql_dataset_client.py ================================================ from __future__ import annotations import asyncio from typing import TYPE_CHECKING import pytest from sqlalchemy import inspect, select from sqlalchemy.ext.asyncio import create_async_engine from crawlee.configuration import Configuration from crawlee.storage_clients import SqlStorageClient from crawlee.storage_clients._sql._db_models import DatasetItemDb, DatasetMetadataDb if TYPE_CHECKING: from collections.abc import AsyncGenerator from pathlib import Path from sqlalchemy import Connection from crawlee.storage_clients._sql import SqlDatasetClient @pytest.fixture def configuration(tmp_path: Path) -> Configuration: """Temporary configuration for tests.""" return Configuration( storage_dir=str(tmp_path), ) # Helper function that allows you to use inspect with an asynchronous engine def get_tables(sync_conn: Connection) -> list[str]: inspector = inspect(sync_conn) return inspector.get_table_names() @pytest.fixture async def dataset_client( configuration: Configuration, ) -> AsyncGenerator[SqlDatasetClient, None]: """A fixture for a SQL dataset client.""" async with SqlStorageClient() as storage_client: client = await storage_client.create_dataset_client( name='test-dataset', configuration=configuration, ) yield client await client.drop() async def test_create_tables_with_connection_string(configuration: 
Configuration, tmp_path: Path) -> None: """Test that SQL dataset client creates tables with a connection string.""" storage_dir = tmp_path / 'test_table.db' async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client: await storage_client.create_dataset_client( name='new-dataset', configuration=configuration, ) async with storage_client.engine.begin() as conn: tables = await conn.run_sync(get_tables) assert 'dataset_records' in tables assert 'datasets' in tables async def test_create_tables_with_engine(configuration: Configuration, tmp_path: Path) -> None: """Test that SQL dataset client creates tables with a pre-configured engine.""" storage_dir = tmp_path / 'test_table.db' engine = create_async_engine(f'sqlite+aiosqlite:///{storage_dir}', future=True, echo=False) async with SqlStorageClient(engine=engine) as storage_client: await storage_client.create_dataset_client( name='new-dataset', configuration=configuration, ) async with engine.begin() as conn: tables = await conn.run_sync(get_tables) assert 'dataset_records' in tables assert 'datasets' in tables async def test_tables_and_metadata_record(configuration: Configuration) -> None: """Test that SQL dataset creates proper tables and metadata records.""" async with SqlStorageClient() as storage_client: client = await storage_client.create_dataset_client( name='new-dataset', configuration=configuration, ) client_metadata = await client.get_metadata() async with storage_client.engine.begin() as conn: tables = await conn.run_sync(get_tables) assert 'dataset_records' in tables assert 'datasets' in tables async with client.get_session() as session: stmt = select(DatasetMetadataDb).where(DatasetMetadataDb.name == 'new-dataset') result = await session.execute(stmt) orm_metadata = result.scalar_one_or_none() assert orm_metadata is not None assert orm_metadata.id == client_metadata.id assert orm_metadata.name == 'new-dataset' assert orm_metadata.item_count == 0 await client.drop() 
async def test_record_and_content_verification(dataset_client: SqlDatasetClient) -> None:
    """Check that pushed items are stored as database rows and counted in the dataset metadata."""
    single_item = {'key': 'value', 'number': 42}
    await dataset_client.push_data(single_item)

    # One pushed item must be reflected in the count, and every timestamp must be populated.
    info = await dataset_client.get_metadata()
    assert info.item_count == 1
    assert info.created_at is not None
    assert info.modified_at is not None
    assert info.accessed_at is not None

    # The same query is executed before and after the batch push.
    rows_query = select(DatasetItemDb).where(DatasetItemDb.dataset_id == info.id)

    async with dataset_client.get_session() as session:
        rows = (await session.execute(rows_query)).scalars().all()
        assert len(rows) == 1
        # The stored payload must round-trip unchanged.
        assert rows[0].data == single_item

    # Pushing a list of three more items must bring the row count to four.
    batch = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}]
    await dataset_client.push_data(batch)

    async with dataset_client.get_session() as session:
        rows = (await session.execute(rows_query)).scalars().all()
        assert len(rows) == 4


async def test_drop_removes_records(dataset_client: SqlDatasetClient) -> None:
    """Check that `drop()` deletes both the item rows and the metadata row of a dataset."""
    await dataset_client.push_data({'test': 'data'})
    info = await dataset_client.get_metadata()
    rows_query = select(DatasetItemDb).where(DatasetItemDb.dataset_id == info.id)

    # Before the drop there is exactly one item row for this dataset.
    async with dataset_client.get_session() as session:
        rows = (await session.execute(rows_query)).scalars().all()
        assert len(rows) == 1

    await dataset_client.drop()

    # After the drop neither item rows nor the metadata row may remain.
    async with dataset_client.get_session() as session:
        rows = (await session.execute(rows_query)).scalars().all()
        assert len(rows) == 0
        leftover_metadata = await session.get(DatasetMetadataDb, info.id)
        assert leftover_metadata is None
async def test_metadata_record_updates(dataset_client: SqlDatasetClient) -> None: """Test that metadata record is updated correctly after operations.""" # Record initial timestamps metadata = await dataset_client.get_metadata() initial_created = metadata.created_at initial_accessed = metadata.accessed_at initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform an operation that updates accessed_at await dataset_client.get_data() # Verify timestamps metadata = await dataset_client.get_metadata() assert metadata.created_at == initial_created assert metadata.accessed_at > initial_accessed assert metadata.modified_at == initial_modified accessed_after_get = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform an operation that updates modified_at await dataset_client.push_data({'new': 'item'}) # Verify timestamps again metadata = await dataset_client.get_metadata() assert metadata.created_at == initial_created assert metadata.modified_at > initial_modified assert metadata.accessed_at > accessed_after_get # Verify metadata record is updated in db async with dataset_client.get_session() as session: orm_metadata = await session.get(DatasetMetadataDb, metadata.id) assert orm_metadata is not None orm_metadata.item_count = 1 assert orm_metadata.created_at == initial_created assert orm_metadata.accessed_at == metadata.accessed_at assert orm_metadata.modified_at == metadata.modified_at async def test_data_persistence_across_reopens(configuration: Configuration) -> None: """Test that data persists correctly when reopening the same dataset.""" async with SqlStorageClient() as storage_client: original_client = await storage_client.create_dataset_client( name='persistence-test', configuration=configuration, ) test_data = {'test_item': 'test_value', 'id': 123} await original_client.push_data(test_data) dataset_id = (await original_client.get_metadata()).id 
reopened_client = await storage_client.create_dataset_client( id=dataset_id, configuration=configuration, ) data = await reopened_client.get_data() assert len(data.items) == 1 assert data.items[0] == test_data await reopened_client.drop() ================================================ FILE: tests/unit/storage_clients/_sql/test_sql_kvs_client.py ================================================ from __future__ import annotations import asyncio import json from typing import TYPE_CHECKING import pytest from sqlalchemy import inspect, select from sqlalchemy.ext.asyncio import create_async_engine from crawlee.configuration import Configuration from crawlee.storage_clients import SqlStorageClient from crawlee.storage_clients._sql._db_models import KeyValueStoreMetadataDb, KeyValueStoreRecordDb from crawlee.storage_clients.models import KeyValueStoreMetadata if TYPE_CHECKING: from collections.abc import AsyncGenerator from pathlib import Path from sqlalchemy import Connection from crawlee.storage_clients._sql import SqlKeyValueStoreClient @pytest.fixture def configuration(tmp_path: Path) -> Configuration: """Temporary configuration for tests.""" return Configuration( storage_dir=str(tmp_path), ) @pytest.fixture async def kvs_client( configuration: Configuration, ) -> AsyncGenerator[SqlKeyValueStoreClient, None]: """A fixture for a SQL key-value store client.""" async with SqlStorageClient() as storage_client: client = await storage_client.create_kvs_client( name='test-kvs', configuration=configuration, ) yield client await client.drop() # Helper function that allows you to use inspect with an asynchronous engine def get_tables(sync_conn: Connection) -> list[str]: inspector = inspect(sync_conn) return inspector.get_table_names() async def test_create_tables_with_connection_string(configuration: Configuration, tmp_path: Path) -> None: """Test that SQL key-value store client creates tables with a connection string.""" storage_dir = tmp_path / 'test_table.db' async with 
SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client: await storage_client.create_kvs_client( name='new-kvs', configuration=configuration, ) async with storage_client.engine.begin() as conn: tables = await conn.run_sync(get_tables) assert 'key_value_stores' in tables assert 'key_value_store_records' in tables async def test_create_tables_with_engine(configuration: Configuration, tmp_path: Path) -> None: """Test that SQL key-value store client creates tables with a pre-configured engine.""" storage_dir = tmp_path / 'test_table.db' engine = create_async_engine(f'sqlite+aiosqlite:///{storage_dir}', future=True, echo=False) async with SqlStorageClient(engine=engine) as storage_client: await storage_client.create_kvs_client( name='new-kvs', configuration=configuration, ) async with engine.begin() as conn: tables = await conn.run_sync(get_tables) assert 'key_value_stores' in tables assert 'key_value_store_records' in tables async def test_tables_and_metadata_record(configuration: Configuration) -> None: """Test that SQL key-value store creates proper tables and metadata records.""" async with SqlStorageClient() as storage_client: client = await storage_client.create_kvs_client( name='new-kvs', configuration=configuration, ) client_metadata = await client.get_metadata() async with storage_client.engine.begin() as conn: tables = await conn.run_sync(get_tables) assert 'key_value_stores' in tables assert 'key_value_store_records' in tables async with client.get_session() as session: stmt = select(KeyValueStoreMetadataDb).where(KeyValueStoreMetadataDb.name == 'new-kvs') result = await session.execute(stmt) orm_metadata = result.scalar_one_or_none() metadata = KeyValueStoreMetadata.model_validate(orm_metadata) assert metadata.id == client_metadata.id assert metadata.name == 'new-kvs' await client.drop() async def test_value_record_creation(kvs_client: SqlKeyValueStoreClient) -> None: """Test that SQL key-value store client can create a 
record.""" test_key = 'test-key' test_value = 'Hello, world!' await kvs_client.set_value(key=test_key, value=test_value) async with kvs_client.get_session() as session: stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == test_key) result = await session.execute(stmt) record = result.scalar_one_or_none() assert record is not None assert record.key == test_key assert record.content_type == 'text/plain; charset=utf-8' assert record.size == len(test_value.encode('utf-8')) assert record.value == test_value.encode('utf-8') async def test_binary_data_persistence(kvs_client: SqlKeyValueStoreClient) -> None: """Test that binary data is stored correctly without corruption.""" test_key = 'test-binary' test_value = b'\x00\x01\x02\x03\x04' await kvs_client.set_value(key=test_key, value=test_value) async with kvs_client.get_session() as session: stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == test_key) result = await session.execute(stmt) record = result.scalar_one_or_none() assert record is not None assert record.key == test_key assert record.content_type == 'application/octet-stream' assert record.size == len(test_value) assert record.value == test_value verify_record = await kvs_client.get_value(key=test_key) assert verify_record is not None assert verify_record.value == test_value assert verify_record.content_type == 'application/octet-stream' async def test_json_serialization_to_record(kvs_client: SqlKeyValueStoreClient) -> None: """Test that JSON objects are properly serialized to records.""" test_key = 'test-json' test_value = {'name': 'John', 'age': 30, 'items': [1, 2, 3]} await kvs_client.set_value(key=test_key, value=test_value) async with kvs_client.get_session() as session: stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == test_key) result = await session.execute(stmt) record = result.scalar_one_or_none() assert record is not None assert record.key == test_key assert 
json.loads(record.value.decode('utf-8')) == test_value async def test_record_deletion_on_value_delete(kvs_client: SqlKeyValueStoreClient) -> None: """Test that deleting a value removes its record from the database.""" test_key = 'test-delete' test_value = 'Delete me' # Set a value await kvs_client.set_value(key=test_key, value=test_value) async with kvs_client.get_session() as session: stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == test_key) result = await session.execute(stmt) record = result.scalar_one_or_none() assert record is not None assert record.key == test_key assert record.value == test_value.encode('utf-8') # Delete the value await kvs_client.delete_value(key=test_key) # Verify record was deleted async with kvs_client.get_session() as session: stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == test_key) result = await session.execute(stmt) record = result.scalar_one_or_none() assert record is None async def test_drop_removes_records(kvs_client: SqlKeyValueStoreClient) -> None: """Test that drop removes all records from the database.""" await kvs_client.set_value(key='test', value='test-value') client_metadata = await kvs_client.get_metadata() async with kvs_client.get_session() as session: stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == 'test') result = await session.execute(stmt) record = result.scalar_one_or_none() assert record is not None # Drop the store await kvs_client.drop() async with kvs_client.get_session() as session: stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == 'test') result = await session.execute(stmt) record = result.scalar_one_or_none() assert record is None metadata = await session.get(KeyValueStoreMetadataDb, client_metadata.id) assert metadata is None async def test_metadata_record_updates(kvs_client: SqlKeyValueStoreClient) -> None: """Test that read/write operations properly update metadata record timestamps.""" # Record initial 
timestamps metadata = await kvs_client.get_metadata() initial_created = metadata.created_at initial_accessed = metadata.accessed_at initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a read operation await kvs_client.get_value(key='nonexistent') # Verify accessed timestamp was updated metadata = await kvs_client.get_metadata() assert metadata.created_at == initial_created assert metadata.accessed_at > initial_accessed assert metadata.modified_at == initial_modified accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a write operation await kvs_client.set_value(key='test', value='test-value') # Verify modified timestamp was updated metadata = await kvs_client.get_metadata() assert metadata.created_at == initial_created assert metadata.modified_at > initial_modified assert metadata.accessed_at > accessed_after_read async with kvs_client.get_session() as session: orm_metadata = await session.get(KeyValueStoreMetadataDb, metadata.id) assert orm_metadata is not None assert orm_metadata.created_at == metadata.created_at assert orm_metadata.accessed_at == metadata.accessed_at assert orm_metadata.modified_at == metadata.modified_at async def test_data_persistence_across_reopens(configuration: Configuration) -> None: """Test that data persists correctly when reopening the same key-value store.""" async with SqlStorageClient() as storage_client: original_client = await storage_client.create_kvs_client( name='persistence-test', configuration=configuration, ) test_key = 'persistent-key' test_value = 'persistent-value' await original_client.set_value(key=test_key, value=test_value) kvs_id = (await original_client.get_metadata()).id # Reopen by ID and verify data persists reopened_client = await storage_client.create_kvs_client( id=kvs_id, configuration=configuration, ) record = await reopened_client.get_value(key=test_key) assert 
record is not None assert record.value == test_value await reopened_client.drop() ================================================ FILE: tests/unit/storage_clients/_sql/test_sql_rq_client.py ================================================ from __future__ import annotations import asyncio import json from typing import TYPE_CHECKING import pytest from sqlalchemy import inspect, select from sqlalchemy.ext.asyncio import create_async_engine from crawlee import Request from crawlee.configuration import Configuration from crawlee.storage_clients import SqlStorageClient from crawlee.storage_clients._sql._db_models import RequestDb, RequestQueueMetadataDb from crawlee.storage_clients.models import RequestQueueMetadata if TYPE_CHECKING: from collections.abc import AsyncGenerator from pathlib import Path from sqlalchemy import Connection from crawlee.storage_clients._sql import SqlRequestQueueClient @pytest.fixture def configuration(tmp_path: Path) -> Configuration: """Temporary configuration for tests.""" return Configuration( storage_dir=str(tmp_path), ) @pytest.fixture async def rq_client( configuration: Configuration, ) -> AsyncGenerator[SqlRequestQueueClient, None]: """A fixture for a SQL request queue client.""" async with SqlStorageClient() as storage_client: client = await storage_client.create_rq_client( name='test-request-queue', configuration=configuration, ) yield client await client.drop() # Helper function that allows you to use inspect with an asynchronous engine def get_tables(sync_conn: Connection) -> list[str]: inspector = inspect(sync_conn) return inspector.get_table_names() async def test_create_tables_with_connection_string(configuration: Configuration, tmp_path: Path) -> None: """Test that SQL request queue client creates tables with a connection string.""" storage_dir = tmp_path / 'test_table.db' async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client: await storage_client.create_rq_client( 
name='test-request-queue', configuration=configuration, ) async with storage_client.engine.begin() as conn: tables = await conn.run_sync(get_tables) assert 'request_queues' in tables assert 'request_queue_records' in tables assert 'request_queue_state' in tables async def test_create_tables_with_engine(configuration: Configuration, tmp_path: Path) -> None: """Test that SQL request queue client creates tables with a pre-configured engine.""" storage_dir = tmp_path / 'test_table.db' engine = create_async_engine(f'sqlite+aiosqlite:///{storage_dir}', future=True, echo=False) async with SqlStorageClient(engine=engine) as storage_client: await storage_client.create_rq_client( name='test-request-queue', configuration=configuration, ) async with engine.begin() as conn: tables = await conn.run_sync(get_tables) assert 'request_queues' in tables assert 'request_queue_records' in tables assert 'request_queue_state' in tables async def test_tables_and_metadata_record(configuration: Configuration) -> None: """Test that SQL request queue creates proper tables and metadata records.""" async with SqlStorageClient() as storage_client: client = await storage_client.create_rq_client( name='test-request-queue', configuration=configuration, ) client_metadata = await client.get_metadata() async with storage_client.engine.begin() as conn: tables = await conn.run_sync(get_tables) assert 'request_queues' in tables assert 'request_queue_records' in tables assert 'request_queue_state' in tables async with client.get_session() as session: stmt = select(RequestQueueMetadataDb).where(RequestQueueMetadataDb.name == 'test-request-queue') result = await session.execute(stmt) orm_metadata = result.scalar_one_or_none() metadata = RequestQueueMetadata.model_validate(orm_metadata) assert metadata.id == client_metadata.id assert metadata.name == 'test-request-queue' await client.drop() async def test_request_records_persistence(rq_client: SqlRequestQueueClient) -> None: """Test that all added requests 
are persisted and can be retrieved from the database.""" requests = [ Request.from_url('https://example.com/1'), Request.from_url('https://example.com/2'), Request.from_url('https://example.com/3'), ] await rq_client.add_batch_of_requests(requests) metadata_client = await rq_client.get_metadata() async with rq_client.get_session() as session: stmt = select(RequestDb).where(RequestDb.request_queue_id == metadata_client.id) result = await session.execute(stmt) db_requests = result.scalars().all() assert len(db_requests) == 3 for db_request in db_requests: request = json.loads(db_request.data) assert request['url'] in ['https://example.com/1', 'https://example.com/2', 'https://example.com/3'] async def test_drop_removes_records(rq_client: SqlRequestQueueClient) -> None: """Test that drop removes all records from the database.""" await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) metadata = await rq_client.get_metadata() async with rq_client.get_session() as session: stmt = select(RequestDb).where(RequestDb.request_queue_id == metadata.id) result = await session.execute(stmt) records = result.scalars().all() assert len(records) == 1 await rq_client.drop() async with rq_client.get_session() as session: stmt = select(RequestDb).where(RequestDb.request_queue_id == metadata.id) result = await session.execute(stmt) records = result.scalars().all() assert len(records) == 0 db_metadata = await session.get(RequestQueueMetadataDb, metadata.id) assert db_metadata is None async def test_metadata_record_updates(rq_client: SqlRequestQueueClient) -> None: """Test that metadata record updates correctly after operations.""" # Record initial timestamps metadata = await rq_client.get_metadata() initial_created = metadata.created_at initial_accessed = metadata.accessed_at initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a read operation await rq_client.is_empty() # Verify accessed 
timestamp was updated metadata = await rq_client.get_metadata() assert metadata.created_at == initial_created assert metadata.accessed_at > initial_accessed assert metadata.modified_at == initial_modified accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) # Perform a write operation await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) # Verify modified timestamp was updated metadata = await rq_client.get_metadata() assert metadata.created_at == initial_created assert metadata.modified_at > initial_modified assert metadata.accessed_at > accessed_after_read async with rq_client.get_session() as session: orm_metadata = await session.get(RequestQueueMetadataDb, metadata.id) assert orm_metadata is not None assert orm_metadata.created_at == metadata.created_at assert orm_metadata.accessed_at == metadata.accessed_at assert orm_metadata.modified_at == metadata.modified_at async def test_data_persistence_across_reopens(configuration: Configuration) -> None: """Test that data persists correctly when reopening the same request queue.""" async with SqlStorageClient() as storage_client: original_client = await storage_client.create_rq_client( name='persistence-test', configuration=configuration, ) test_requests = [ Request.from_url('https://example.com/1'), Request.from_url('https://example.com/2'), ] await original_client.add_batch_of_requests(test_requests) rq_id = (await original_client.get_metadata()).id # Reopen by ID and verify data persists reopened_client = await storage_client.create_rq_client( id=rq_id, configuration=configuration, ) metadata = await reopened_client.get_metadata() assert metadata.total_request_count == 2 # Fetch requests to verify they're still there request1 = await reopened_client.fetch_next_request() request2 = await reopened_client.fetch_next_request() assert request1 is not None assert request2 is not None assert {request1.url, request2.url} == 
{'https://example.com/1', 'https://example.com/2'} await reopened_client.drop() ================================================ FILE: tests/unit/storages/conftest.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING import pytest from crawlee import service_locator from crawlee.storage_clients import ( FileSystemStorageClient, MemoryStorageClient, RedisStorageClient, SqlStorageClient, StorageClient, ) if TYPE_CHECKING: from fakeredis import FakeAsyncRedis @pytest.fixture(params=['memory', 'file_system', 'sql', 'redis']) def storage_client( request: pytest.FixtureRequest, redis_client: FakeAsyncRedis, ) -> StorageClient: """Parameterized fixture to test with different storage clients.""" storage_client: StorageClient storage_type = request.param if storage_type == 'memory': storage_client = MemoryStorageClient() elif storage_type == 'sql': storage_client = SqlStorageClient() elif storage_type == 'redis': storage_client = RedisStorageClient(redis=redis_client) else: storage_client = FileSystemStorageClient() service_locator.set_storage_client(storage_client) return storage_client ================================================ FILE: tests/unit/storages/test_dataset.py ================================================ from __future__ import annotations import json from typing import TYPE_CHECKING import pytest from crawlee import service_locator from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient from crawlee.storages import Dataset, KeyValueStore from crawlee.storages._storage_instance_manager import StorageInstanceManager if TYPE_CHECKING: from collections.abc import AsyncGenerator from pathlib import Path from typing import Any from crawlee.storage_clients import StorageClient @pytest.fixture async def dataset( storage_client: StorageClient, ) -> AsyncGenerator[Dataset, None]: """Fixture that provides a dataset instance 
for each test.""" dataset = await Dataset.open( storage_client=storage_client, ) yield dataset await dataset.drop() async def test_open_creates_new_dataset( storage_client: StorageClient, ) -> None: """Test that open() creates a new dataset with proper metadata.""" dataset = await Dataset.open( name='new-dataset', storage_client=storage_client, ) # Verify dataset properties assert dataset.id is not None assert dataset.name == 'new-dataset' metadata = await dataset.get_metadata() assert metadata.item_count == 0 await dataset.drop() async def test_reopen_default( storage_client: StorageClient, ) -> None: """Test reopening a dataset with default parameters.""" # Create a first dataset instance with default parameters dataset_1 = await Dataset.open( storage_client=storage_client, ) # Verify default properties assert dataset_1.id is not None metadata_1 = await dataset_1.get_metadata() assert metadata_1.item_count == 0 # Add an item await dataset_1.push_data({'key': 'value'}) metadata_1 = await dataset_1.get_metadata() assert metadata_1.item_count == 1 # Reopen the same dataset dataset_2 = await Dataset.open( storage_client=storage_client, ) # Verify both instances reference the same dataset assert dataset_2.id == dataset_1.id assert dataset_2.name == dataset_1.name metadata_1 = await dataset_1.get_metadata() metadata_2 = await dataset_2.get_metadata() assert metadata_2.item_count == metadata_1.item_count == 1 # Verify they are the same object (cached) assert id(dataset_1) == id(dataset_2) # Clean up await dataset_1.drop() async def test_open_by_id( storage_client: StorageClient, ) -> None: """Test opening a dataset by its ID.""" # First create a dataset by name dataset1 = await Dataset.open( name='dataset-by-id-test', storage_client=storage_client, ) # Add some data to identify it test_item = {'test': 'opening_by_id', 'timestamp': 12345} await dataset1.push_data(test_item) # Open the dataset by ID dataset2 = await Dataset.open( id=dataset1.id, 
storage_client=storage_client, ) # Verify it's the same dataset assert dataset2.id == dataset1.id assert dataset2.name == 'dataset-by-id-test' # Verify the data is still there data = await dataset2.get_data() assert data.count == 1 assert data.items[0]['test'] == 'opening_by_id' assert data.items[0]['timestamp'] == 12345 # Clean up await dataset2.drop() async def test_open_existing_dataset( dataset: Dataset, ) -> None: """Test that open() loads an existing dataset correctly.""" # Open the same dataset again reopened_dataset = await Dataset.open( name=dataset.name, ) # Verify dataset properties assert dataset.id == reopened_dataset.id assert dataset.name == reopened_dataset.name metadata = await dataset.get_metadata() reopened_metadata = await reopened_dataset.get_metadata() assert metadata.item_count == reopened_metadata.item_count # Verify they are the same object (from cache) assert id(dataset) == id(reopened_dataset) async def test_open_with_id_and_name( storage_client: StorageClient, ) -> None: """Test that open() raises an error when both id and name are provided.""" with pytest.raises( ValueError, match=r'Only one of "id", "name", "alias" can be specified, but following arguments ' r'were specified: "id", "name".', ): await Dataset.open( id='some-id', name='some-name', storage_client=storage_client, ) async def test_push_data_single_item(dataset: Dataset) -> None: """Test pushing a single item to the dataset.""" item = {'key': 'value', 'number': 42} await dataset.push_data(item) # Verify item was stored result = await dataset.get_data() assert result.count == 1 assert result.items[0] == item async def test_push_data_multiple_items(dataset: Dataset) -> None: """Test pushing multiple items to the dataset.""" items = [ {'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}, ] await dataset.push_data(items) # Verify items were stored result = await dataset.get_data() assert result.count == 3 assert result.items == items async def 
test_get_data_empty_dataset(dataset: Dataset) -> None: """Test getting data from an empty dataset returns empty results.""" result = await dataset.get_data() assert result.count == 0 assert result.total == 0 assert result.items == [] async def test_get_data_with_pagination(dataset: Dataset) -> None: """Test getting data with offset and limit parameters for pagination.""" # Add some items items = [{'id': i} for i in range(1, 11)] # 10 items await dataset.push_data(items) # Test offset result = await dataset.get_data(offset=3) assert result.count == 7 assert result.offset == 3 assert result.items[0]['id'] == 4 # Test limit result = await dataset.get_data(limit=5) assert result.count == 5 assert result.limit == 5 assert result.items[-1]['id'] == 5 # Test both offset and limit result = await dataset.get_data(offset=2, limit=3) assert result.count == 3 assert result.offset == 2 assert result.limit == 3 assert result.items[0]['id'] == 3 assert result.items[-1]['id'] == 5 async def test_get_data_descending_order(dataset: Dataset) -> None: """Test getting data in descending order reverses the item order.""" # Add some items items = [{'id': i} for i in range(1, 6)] # 5 items await dataset.push_data(items) # Get items in descending order result = await dataset.get_data(desc=True) assert result.desc is True assert result.items[0]['id'] == 5 assert result.items[-1]['id'] == 1 async def test_get_data_skip_empty(dataset: Dataset) -> None: """Test getting data with skip_empty option filters out empty items.""" # Add some items including an empty one items = [ {'id': 1, 'name': 'Item 1'}, {}, # Empty item {'id': 3, 'name': 'Item 3'}, ] await dataset.push_data(items) # Get all items result = await dataset.get_data() assert result.count == 3 # Get non-empty items result = await dataset.get_data(skip_empty=True) assert result.count == 2 assert all(item != {} for item in result.items) async def test_iterate_items(dataset: Dataset) -> None: """Test iterating over dataset items yields 
each item in the correct order.""" # Add some items items = [{'id': i} for i in range(1, 6)] # 5 items await dataset.push_data(items) # Iterate over all items collected_items = [item async for item in dataset.iterate_items()] assert len(collected_items) == 5 assert collected_items[0]['id'] == 1 assert collected_items[-1]['id'] == 5 async def test_iterate_items_with_options(dataset: Dataset) -> None: """Test iterating with offset, limit and desc parameters.""" # Add some items items = [{'id': i} for i in range(1, 11)] # 10 items await dataset.push_data(items) # Test with offset and limit collected_items = [item async for item in dataset.iterate_items(offset=3, limit=3)] assert len(collected_items) == 3 assert collected_items[0]['id'] == 4 assert collected_items[-1]['id'] == 6 # Test with descending order collected_items = [] async for item in dataset.iterate_items(desc=True, limit=3): collected_items.append(item) assert len(collected_items) == 3 assert collected_items[0]['id'] == 10 assert collected_items[-1]['id'] == 8 async def test_list_items(dataset: Dataset) -> None: """Test that list_items returns all dataset items as a list.""" # Add some items items = [{'id': i} for i in range(1, 6)] # 5 items await dataset.push_data(items) # Get all items as a list collected_items = await dataset.list_items() assert len(collected_items) == 5 assert collected_items[0]['id'] == 1 assert collected_items[-1]['id'] == 5 async def test_list_items_with_options(dataset: Dataset) -> None: """Test that list_items respects filtering options.""" # Add some items items: list[dict[str, Any]] = [ {'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3}, # Item with missing 'name' field {}, # Empty item {'id': 5, 'name': 'Item 5'}, ] await dataset.push_data(items) # Test with offset and limit collected_items = await dataset.list_items(offset=1, limit=2) assert len(collected_items) == 2 assert collected_items[0]['id'] == 2 assert collected_items[1]['id'] == 3 # Test with 
descending order - skip empty items to avoid KeyError collected_items = await dataset.list_items(desc=True, skip_empty=True) # Filter items that have an 'id' field items_with_ids = [item for item in collected_items if 'id' in item] id_values = [item['id'] for item in items_with_ids] # Verify the list is sorted in descending order assert sorted(id_values, reverse=True) == id_values, f'IDs should be in descending order. Got {id_values}' # Verify key IDs are present and in the right order if 5 in id_values and 3 in id_values: assert id_values.index(5) < id_values.index(3), 'ID 5 should come before ID 3 in descending order' # Test with skip_empty collected_items = await dataset.list_items(skip_empty=True) assert len(collected_items) == 4 # Should skip the empty item assert all(item != {} for item in collected_items) # Test with fields - manually filter since 'fields' parameter is not supported # Get all items first collected_items = await dataset.list_items() assert len(collected_items) == 5 # Manually extract only the 'id' field from each item filtered_items = [{key: item[key] for key in ['id'] if key in item} for item in collected_items] # Verify 'name' field is not present in any item assert all('name' not in item for item in filtered_items) # Test clean functionality manually instead of using the clean parameter # Get all items collected_items = await dataset.list_items() # Manually filter out empty items as 'clean' would do clean_items = [item for item in collected_items if item != {}] assert len(clean_items) == 4 # Should have 4 non-empty items assert all(item != {} for item in clean_items) async def test_drop( storage_client: StorageClient, ) -> None: """Test dropping a dataset removes it from cache and clears its data.""" dataset = await Dataset.open( name='drop-test', storage_client=storage_client, ) # Add some data await dataset.push_data({'test': 'data'}) # Drop the dataset await dataset.drop() # Verify dataset is empty (by creating a new one with the same 
name) new_dataset = await Dataset.open( name='drop-test', storage_client=storage_client, ) result = await new_dataset.get_data() assert result.count == 0 await new_dataset.drop() async def test_export_to_json( dataset: Dataset, storage_client: StorageClient, ) -> None: """Test exporting dataset to JSON format.""" # Create a key-value store for export kvs = await KeyValueStore.open( name='export-kvs', ) # Add some items to the dataset items = [ {'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}, ] await dataset.push_data(items) # Export to JSON await dataset.export_to( key='dataset_export.json', content_type='json', to_kvs_name='export-kvs', to_kvs_storage_client=storage_client, ) # Retrieve the exported file record = await kvs.get_value(key='dataset_export.json') assert record is not None # Verify content has all the items assert '"id": 1' in record assert '"id": 2' in record assert '"id": 3' in record await kvs.drop() async def test_export_to_csv( dataset: Dataset, storage_client: StorageClient, ) -> None: """Test exporting dataset to CSV format.""" # Create a key-value store for export kvs = await KeyValueStore.open( name='export-kvs', storage_client=storage_client, ) # Add some items to the dataset items = [ {'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}, ] await dataset.push_data(items) # Export to CSV await dataset.export_to( key='dataset_export.csv', content_type='csv', to_kvs_name='export-kvs', to_kvs_storage_client=storage_client, ) # Retrieve the exported file record = await kvs.get_value(key='dataset_export.csv') assert record is not None # Verify content has all the items assert 'id,name' in record assert '1,Item 1' in record assert '2,Item 2' in record assert '3,Item 3' in record await kvs.drop() async def test_export_to_invalid_content_type(dataset: Dataset) -> None: """Test exporting dataset with invalid content type raises error.""" with pytest.raises(ValueError, 
match=r'Unsupported content type'): await dataset.export_to(key='invalid_export', content_type='invalid') # ty: ignore[no-matching-overload] async def test_export_with_multiple_kwargs(dataset: Dataset, tmp_path: Path) -> None: """Test exporting dataset using many optional arguments together.""" target_kvs_name = 'some-kvs' target_storage_client = FileSystemStorageClient() export_key = 'exported_dataset' data = {'some key': 'some data'} # Prepare custom directory and configuration custom_dir_name = 'some_dir' custom_dir = tmp_path / custom_dir_name custom_dir.mkdir() target_configuration = Configuration(storage_dir=str(custom_dir)) # Set expected values expected_exported_data = f'{json.dumps([{"some key": "some data"}])}' expected_kvs_dir = custom_dir / 'key_value_stores' / target_kvs_name # Populate dataset and export await dataset.push_data(data) await dataset.export_to( key=export_key, content_type='json', to_kvs_name=target_kvs_name, to_kvs_storage_client=target_storage_client, to_kvs_configuration=target_configuration, ) # Verify the directory was created assert expected_kvs_dir.is_dir() # Verify that kvs contains the exported data kvs = await KeyValueStore.open( name=target_kvs_name, storage_client=target_storage_client, configuration=target_configuration ) assert await kvs.get_value(key=export_key) == expected_exported_data async def test_large_dataset(dataset: Dataset) -> None: """Test handling a large dataset with many items.""" items = [{'id': i, 'value': f'value-{i}'} for i in range(100)] await dataset.push_data(items) # Test that all items are retrieved result = await dataset.get_data(limit=None) assert result.count == 100 assert result.total == 100 # Test pagination with large datasets result = await dataset.get_data(offset=50, limit=25) assert result.count == 25 assert result.offset == 50 assert result.items[0]['id'] == 50 assert result.items[-1]['id'] == 74 async def test_purge( storage_client: StorageClient, ) -> None: """Test purging a dataset 
removes all data but keeps the dataset itself.""" # First create a dataset dataset = await Dataset.open( name='purge-test-dataset', storage_client=storage_client, ) # Add some data initial_items = [ {'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}, ] await dataset.push_data(initial_items) # Verify data was added data = await dataset.get_data() assert data.count == 3 assert data.total == 3 metadata = await dataset.get_metadata() assert metadata.item_count == 3 # Record the dataset ID dataset_id = dataset.id # Purge the dataset await dataset.purge() # Verify the dataset still exists but is empty assert dataset.id == dataset_id # Same ID preserved assert dataset.name == 'purge-test-dataset' # Same name preserved # Dataset should be empty now data = await dataset.get_data() assert data.count == 0 assert data.total == 0 metadata = await dataset.get_metadata() assert metadata.item_count == 0 # Verify we can add new data after purging new_item = {'id': 4, 'name': 'New Item After Purge'} await dataset.push_data(new_item) data = await dataset.get_data() assert data.count == 1 assert data.items[0]['name'] == 'New Item After Purge' # Clean up await dataset.drop() async def test_open_with_alias( storage_client: StorageClient, ) -> None: """Test opening datasets with alias parameter for NDU functionality.""" # Create datasets with different aliases dataset_1 = await Dataset.open( alias='test_alias_1', storage_client=storage_client, ) dataset_2 = await Dataset.open( alias='test_alias_2', storage_client=storage_client, ) # Verify they have different IDs but no names (unnamed) assert dataset_1.id != dataset_2.id assert dataset_1.name is None assert dataset_2.name is None # Add different data to each await dataset_1.push_data({'source': 'alias_1', 'value': 1}) await dataset_2.push_data({'source': 'alias_2', 'value': 2}) # Verify data isolation data_1 = await dataset_1.get_data() data_2 = await dataset_2.get_data() assert data_1.count == 1 assert 
data_2.count == 1 assert data_1.items[0]['source'] == 'alias_1' assert data_2.items[0]['source'] == 'alias_2' # Clean up await dataset_1.drop() await dataset_2.drop()


async def test_alias_caching(
    storage_client: StorageClient,
) -> None:
    """Test that datasets with same alias return same instance (cached).

    Opens the alias 'cache_test' twice with the same storage client and requires both calls to return the very
    same object.
    """
    # Open dataset with alias
    dataset_1 = await Dataset.open(
        alias='cache_test',
        storage_client=storage_client,
    )

    # Open again with same alias
    dataset_2 = await Dataset.open(
        alias='cache_test',
        storage_client=storage_client,
    )

    # Should be same instance (identity, not just equal ids)
    assert dataset_1 is dataset_2
    assert dataset_1.id == dataset_2.id

    # Clean up - a single drop is enough because both names refer to one object
    await dataset_1.drop()


async def test_alias_with_id_error(
    storage_client: StorageClient,
) -> None:
    """Test that providing both alias and id raises error.

    Expects `Dataset.open` to raise `ValueError` with a message naming the conflicting arguments.
    """
    # `match` is applied as a regular expression search, so the unescaped '.' characters match any character.
    with pytest.raises(
        ValueError,
        match=r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "id", "alias".',
    ):
        await Dataset.open(
            id='some-id',
            alias='some-alias',
            storage_client=storage_client,
        )


async def test_alias_with_name_error(
    storage_client: StorageClient,
) -> None:
    """Test that providing both alias and name raises error.

    Expects `Dataset.open` to raise `ValueError` with a message naming the conflicting arguments.
    """
    with pytest.raises(
        ValueError,
        match=r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "name", "alias".',
    ):
        await Dataset.open(
            name='some-name',
            alias='some-alias',
            storage_client=storage_client,
        )


async def test_alias_with_all_parameters_error(
    storage_client: StorageClient,
) -> None:
    """Test that providing id, name, and alias raises error.

    Expects `Dataset.open` to raise `ValueError` listing all three arguments in the message.
    """
    with pytest.raises(
        ValueError,
        match=r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "id", "name", "alias".',
    ):
        await Dataset.open(
            id='some-id',
            name='some-name',
            alias='some-alias',
            storage_client=storage_client,
        )


async def test_alias_with_special_characters( storage_client: StorageClient, ) -> None: """Test alias functionality with special characters."""
special_aliases = [ 'alias-with-dashes', 'alias_with_underscores', 'alias.with.dots', 'alias123with456numbers', 'CamelCaseAlias', ] datasets = [] for alias in special_aliases: dataset = await Dataset.open( alias=alias, storage_client=storage_client, ) datasets.append(dataset) # Add data with the alias as identifier await dataset.push_data({'alias_used': alias, 'test': 'special_chars'}) # Verify all work correctly for i, dataset in enumerate(datasets): data = await dataset.get_data() assert data.count == 1 assert data.items[0]['alias_used'] == special_aliases[i] # Clean up for dataset in datasets: await dataset.drop()


async def test_named_vs_alias_conflict_detection(
    storage_client: StorageClient,
) -> None:
    """Test that conflicts between named and alias storages are detected.

    Two directions are covered: opening an alias that equals the name of an open named dataset, and opening a
    named dataset that equals an open alias. Both must raise `ValueError`.
    """
    # Test 1: Create named storage first, then try alias with same name
    named_dataset = await Dataset.open(name='conflict-test', storage_client=storage_client)
    assert named_dataset.name == 'conflict-test'

    # Try to create alias with same name - should raise error
    # (the pattern is a regex: '.*' bridges the message text between the identifier and 'already exists')
    with pytest.raises(ValueError, match=r'Cannot create alias storage "conflict-test".*already exists'):
        await Dataset.open(alias='conflict-test', storage_client=storage_client)

    # Clean up
    await named_dataset.drop()

    # Test 2: Create alias first, then try named with same name
    alias_dataset = await Dataset.open(alias='conflict-test2', storage_client=storage_client)
    assert alias_dataset.name is None  # Alias storages have no name

    # Try to create named with same name - should raise error
    with pytest.raises(ValueError, match=r'Cannot create named storage "conflict-test2".*already exists'):
        await Dataset.open(name='conflict-test2', storage_client=storage_client)

    # Clean up
    await alias_dataset.drop()


async def test_alias_parameter( storage_client: StorageClient, ) -> None: """Test dataset creation and operations with alias parameter.""" # Create dataset with alias alias_dataset = await Dataset.open( alias='test_alias',
storage_client=storage_client, ) # Verify alias dataset properties assert alias_dataset.id is not None assert alias_dataset.name is None # Alias storages should be unnamed # Test data operations await alias_dataset.push_data({'type': 'alias', 'value': 1}) data = await alias_dataset.get_data() assert data.count == 1 assert data.items[0]['type'] == 'alias' await alias_dataset.drop()


async def test_alias_vs_named_isolation(
    storage_client: StorageClient,
) -> None:
    """Test that an alias dataset opened after a named dataset was dropped holds only its own data.

    NOTE(review): the named dataset uses 'test-identifier' (dash) while the alias uses 'test_identifier'
    (underscore), so the two identifiers are similar but not identical - confirm whether that is intended.
    """
    # Create named dataset
    named_dataset = await Dataset.open(
        name='test-identifier',
        storage_client=storage_client,
    )

    # Verify named dataset
    assert named_dataset.name == 'test-identifier'
    await named_dataset.push_data({'type': 'named'})

    # Clean up named dataset first
    await named_dataset.drop()

    # Now create the alias dataset (opened only after the named one has been dropped)
    alias_dataset = await Dataset.open(
        alias='test_identifier',
        storage_client=storage_client,
    )

    # Alias storages are unnamed
    assert alias_dataset.name is None

    await alias_dataset.push_data({'type': 'alias'})

    # Verify alias data - the first item must be the one pushed through the alias
    alias_data = await alias_dataset.get_data()
    assert alias_data.items[0]['type'] == 'alias'

    await alias_dataset.drop()


async def test_default_vs_alias_default_equivalence(
    storage_client: StorageClient,
) -> None:
    """Test that default dataset and alias='default' are equivalent.

    Opens the dataset once without any identifier and once with the manager's default alias constant, then
    checks that both handles share an id and see the same data.
    """
    # Open default dataset
    default_dataset = await Dataset.open(
        storage_client=storage_client,
    )

    # Open through the explicit default alias
    alias_default_dataset = await Dataset.open(
        alias=StorageInstanceManager._DEFAULT_STORAGE_ALIAS,
        storage_client=storage_client,
    )

    # Should be the same
    assert default_dataset.id == alias_default_dataset.id
    assert default_dataset.name is None
    assert alias_default_dataset.name is None

    # Data should be shared: written through one handle, read through the other
    await default_dataset.push_data({'source': 'default'})
    data = await alias_default_dataset.get_data()
    assert data.items[0]['source'] == 'default'

    # Only one drop is issued; the id assertion above shows both handles address the same dataset
    await default_dataset.drop()
async def test_multiple_alias_isolation(
    storage_client: StorageClient,
) -> None:
    """Three different aliases must yield three datasets with distinct ids and separate data."""
    opened = []
    for index in range(3):
        store = await Dataset.open(
            alias=f'alias_{index}',
            storage_client=storage_client,
        )
        await store.push_data({'alias': f'alias_{index}', 'index': index})
        opened.append(store)

    # Every pair of datasets must have different ids.
    for position, first in enumerate(opened):
        for second in opened[position + 1 :]:
            assert first.id != second.id

    # Each dataset holds the item pushed through its own alias; drop it once verified.
    for index, store in enumerate(opened):
        fetched = await store.get_data()
        assert fetched.items[0]['alias'] == f'alias_{index}'
        await store.drop()


async def test_purge_on_start_enabled(storage_client: StorageClient) -> None: """Test purge behavior when purge_on_start=True: named storages retain data, unnamed storages are purged.""" # Skip this test for memory storage since it doesn't persist data between client instances. if isinstance(storage_client, MemoryStorageClient): pytest.skip('Memory storage does not persist data between client instances.') configuration = Configuration(purge_on_start=True) # First, create all storage types with purge enabled and add data.
default_dataset = await Dataset.open( storage_client=storage_client, configuration=configuration, ) alias_dataset = await Dataset.open( alias='purge-test-alias', storage_client=storage_client, configuration=configuration, ) named_dataset = await Dataset.open( name='purge-test-named', storage_client=storage_client, configuration=configuration, ) await default_dataset.push_data({'type': 'default', 'data': 'should_be_purged'}) await alias_dataset.push_data({'type': 'alias', 'data': 'should_be_purged'}) await named_dataset.push_data({'type': 'named', 'data': 'should_persist'}) # Verify data was added default_data = await default_dataset.get_data() alias_data = await alias_dataset.get_data() named_data = await named_dataset.get_data() assert len(default_data.items) == 1 assert len(alias_data.items) == 1 assert len(named_data.items) == 1 # Verify that default and alias storages are unnamed default_metadata = await default_dataset.get_metadata() alias_metadata = await alias_dataset.get_metadata() named_metadata = await named_dataset.get_metadata() assert default_metadata.name is None assert alias_metadata.name is None assert named_metadata.name == 'purge-test-named' # Clear storage cache to simulate "reopening" storages service_locator.storage_instance_manager.clear_cache() # Now "reopen" all storages default_dataset_2 = await Dataset.open( storage_client=storage_client, configuration=configuration, ) alias_dataset_2 = await Dataset.open( alias='purge-test-alias', storage_client=storage_client, configuration=configuration, ) named_dataset_2 = await Dataset.open( name='purge-test-named', storage_client=storage_client, configuration=configuration, ) # Check the data after purge default_data_after = await default_dataset_2.get_data() alias_data_after = await alias_dataset_2.get_data() named_data_after = await named_dataset_2.get_data() # Unnamed storages (alias and default) should be purged (data removed) assert len(default_data_after.items) == 0 assert 
len(alias_data_after.items) == 0 # Named storage should retain data (not purged) assert len(named_data_after.items) == 1 # Clean up await named_dataset_2.drop() await alias_dataset_2.drop() await default_dataset_2.drop() async def test_purge_on_start_disabled(storage_client: StorageClient) -> None: """Test purge behavior when purge_on_start=False: all storages retain data regardless of type.""" # Skip this test for memory storage since it doesn't persist data between client instances. if isinstance(storage_client, MemoryStorageClient): pytest.skip('Memory storage does not persist data between client instances.') configuration = Configuration(purge_on_start=False) # First, create all storage types with purge disabled and add data. default_dataset = await Dataset.open( storage_client=storage_client, configuration=configuration, ) alias_dataset = await Dataset.open( alias='purge-test-alias', storage_client=storage_client, configuration=configuration, ) named_dataset = await Dataset.open( name='purge-test-named', storage_client=storage_client, configuration=configuration, ) await default_dataset.push_data({'type': 'default', 'data': 'should_persist'}) await alias_dataset.push_data({'type': 'alias', 'data': 'should_persist'}) await named_dataset.push_data({'type': 'named', 'data': 'should_persist'}) # Verify data was added default_data = await default_dataset.get_data() alias_data = await alias_dataset.get_data() named_data = await named_dataset.get_data() assert len(default_data.items) == 1 assert len(alias_data.items) == 1 assert len(named_data.items) == 1 # Verify that default and alias storages are unnamed default_metadata = await default_dataset.get_metadata() alias_metadata = await alias_dataset.get_metadata() named_metadata = await named_dataset.get_metadata() assert default_metadata.name is None assert alias_metadata.name is None assert named_metadata.name == 'purge-test-named' # Clear storage cache to simulate "reopening" storages 
service_locator.storage_instance_manager.clear_cache() # Now "reopen" all storages default_dataset_2 = await Dataset.open( storage_client=storage_client, configuration=configuration, ) alias_dataset_2 = await Dataset.open( alias='purge-test-alias', storage_client=storage_client, configuration=configuration, ) named_dataset_2 = await Dataset.open( name='purge-test-named', storage_client=storage_client, configuration=configuration, ) # Check the data after purge default_data_after = await default_dataset_2.get_data() alias_data_after = await alias_dataset_2.get_data() named_data_after = await named_dataset_2.get_data() # All storages should retain data (not purged) assert len(default_data_after.items) == 1 assert len(alias_data_after.items) == 1 assert len(named_data_after.items) == 1 assert default_data_after.items[0]['data'] == 'should_persist' assert alias_data_after.items[0]['data'] == 'should_persist' assert named_data_after.items[0]['data'] == 'should_persist' # Clean up await default_dataset_2.drop() await alias_dataset_2.drop() await named_dataset_2.drop() async def test_name_default_not_allowed(storage_client: StorageClient) -> None: """Test that storage can't have default alias as name, to prevent collisions with unnamed storage alias.""" with pytest.raises( ValueError, match=f'Storage name cannot be "{StorageInstanceManager._DEFAULT_STORAGE_ALIAS}" as ' f'it is reserved for default alias.', ): await Dataset.open(name=StorageInstanceManager._DEFAULT_STORAGE_ALIAS, storage_client=storage_client) @pytest.mark.parametrize( ('name', 'is_valid'), [ pytest.param('F', True, id='single-char'), pytest.param('7', True, id='single-digit'), pytest.param('FtghdfseySds', True, id='mixed-case'), pytest.param('125673450', True, id='all-digits'), pytest.param('Ft2134Sfe0O1hf', True, id='mixed-alphanumeric'), pytest.param('name-with-dashes', True, id='dashes'), pytest.param('1-value', True, id='number start'), pytest.param('value-1', True, id='number end'), 
pytest.param('test-1-value', True, id='number middle'), pytest.param('test-------value', True, id='multiple-dashes'), pytest.param('test-VALUES-test', True, id='multiple-cases'), pytest.param('name_with_underscores', False, id='underscores'), pytest.param('name with spaces', False, id='spaces'), pytest.param('-test', False, id='dashes start'), pytest.param('test-', False, id='dashes end'), ], ) async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None: """Test name validation logic.""" if is_valid: # Should not raise dataset = await Dataset.open(name=name, storage_client=storage_client) assert dataset.name == name await dataset.drop() else: with pytest.raises(ValueError, match=rf'Invalid storage name "{name}".*'): await Dataset.open(name=name, storage_client=storage_client)


async def test_record_with_noascii_chars(dataset: Dataset) -> None:
    """Test handling record with non-ASCII characters.

    Pushes one item whose values contain accented Latin and CJK characters and requires the item read back to
    be equal to the one pushed.
    """
    init_value = {
        'record_1': 'Supermaxi El Jardín',
        'record_2': 'záznam dva',
        'record_3': '記録三',
    }

    # Save the record to the dataset
    await dataset.push_data(init_value)

    # Get the record and verify it round-trips unchanged
    value = await dataset.get_data()
    assert value is not None
    assert value.items[0] == init_value


================================================ FILE: tests/unit/storages/test_key_value_store.py ================================================
from __future__ import annotations

import json
from typing import TYPE_CHECKING

import pytest

from crawlee import service_locator
from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, SqlStorageClient, StorageClient
from crawlee.storages import KeyValueStore
from crawlee.storages._storage_instance_manager import StorageInstanceManager

# Names below are imported only for type checking, not at runtime.
if TYPE_CHECKING:
    from collections.abc import AsyncGenerator
    from pathlib import Path


@pytest.fixture async def kvs( storage_client: StorageClient, ) -> AsyncGenerator[KeyValueStore, None]: """Fixture
that provides a key-value store instance for each test.""" kvs = await KeyValueStore.open( storage_client=storage_client, ) yield kvs await kvs.drop()


async def test_open_creates_new_kvs(
    storage_client: StorageClient,
) -> None:
    """Test that open() creates a new key-value store with proper metadata.

    Opens a store named 'new-kvs' and checks that it has an id and reports the requested name.
    """
    kvs = await KeyValueStore.open(
        name='new-kvs',
        storage_client=storage_client,
    )

    # Verify key-value store properties
    assert kvs.id is not None
    assert kvs.name == 'new-kvs'

    await kvs.drop()


async def test_open_existing_kvs(
    kvs: KeyValueStore,
    storage_client: StorageClient,
) -> None:
    """Test that open() loads an existing key-value store correctly.

    Re-opens the fixture's store using the name it reports and requires the cached object to be returned.
    """
    # Open the same key-value store again
    # NOTE(review): the `kvs` fixture opens its store without a name, so `kvs.name` is presumably None here
    # and this call re-opens the default store - confirm.
    reopened_kvs = await KeyValueStore.open(
        name=kvs.name,
        storage_client=storage_client,
    )

    # Verify key-value store properties
    assert kvs.id == reopened_kvs.id
    assert kvs.name == reopened_kvs.name

    # Verify they are the same object (from cache)
    assert id(kvs) == id(reopened_kvs)


async def test_open_with_id_and_name(
    storage_client: StorageClient,
) -> None:
    """Test that open() raises an error when both id and name are provided.

    Expects `ValueError` with a message naming the conflicting arguments.
    """
    # `match` is applied as a regular expression search, so the unescaped '.' characters match any character.
    with pytest.raises(
        ValueError,
        match=r'Only one of "id", "name", "alias" can be specified, but following arguments '
        r'were specified: "id", "name".',
    ):
        await KeyValueStore.open(
            id='some-id',
            name='some-name',
            storage_client=storage_client,
        )


async def test_open_by_id( storage_client: StorageClient, ) -> None: """Test opening a key-value store by its ID.""" # First create a key-value store by name kvs1 = await KeyValueStore.open( name='kvs-by-id-test', storage_client=storage_client, ) # Add some data to identify it await kvs1.set_value('test_key', {'test': 'opening_by_id', 'timestamp': 12345}) # Open the key-value store by ID kvs2 = await KeyValueStore.open( id=kvs1.id, storage_client=storage_client, ) # Verify it's the same key-value store assert kvs2.id == kvs1.id assert kvs2.name == 'kvs-by-id-test' # Verify the data is still there value = await
kvs2.get_value('test_key') assert value is not None assert value['test'] == 'opening_by_id' assert value['timestamp'] == 12345 # Clean up await kvs2.drop() async def test_set_get_value(kvs: KeyValueStore) -> None: """Test setting and getting a value from the key-value store.""" # Set a value test_key = 'test-key' test_value = {'data': 'value', 'number': 42} await kvs.set_value(test_key, test_value) # Get the value result = await kvs.get_value(test_key) assert result == test_value async def test_set_get_none(kvs: KeyValueStore) -> None: """Test setting and getting None as a value.""" test_key = 'none-key' await kvs.set_value(test_key, None) result = await kvs.get_value(test_key) assert result is None async def test_get_value_nonexistent(kvs: KeyValueStore) -> None: """Test getting a nonexistent value returns None.""" result = await kvs.get_value('nonexistent-key') assert result is None async def test_get_value_with_default(kvs: KeyValueStore) -> None: """Test getting a nonexistent value with a default value.""" default_value = {'default': True} result = await kvs.get_value('nonexistent-key', default_value=default_value) assert result == default_value async def test_set_value_with_content_type(kvs: KeyValueStore) -> None: """Test setting a value with a specific content type.""" test_key = 'test-json' test_value = {'data': 'value', 'items': [1, 2, 3]} await kvs.set_value(test_key, test_value, content_type='application/json') # Verify the value is retrievable result = await kvs.get_value(test_key) assert result == test_value async def test_delete_value(kvs: KeyValueStore) -> None: """Test deleting a value from the key-value store.""" # Set a value first test_key = 'delete-me' test_value = 'value to delete' await kvs.set_value(test_key, test_value) # Verify value exists assert await kvs.get_value(test_key) == test_value # Delete the value await kvs.delete_value(test_key) # Verify value is gone assert await kvs.get_value(test_key) is None async def 
test_list_keys_empty_kvs(kvs: KeyValueStore) -> None: """Test listing keys from an empty key-value store.""" keys = await kvs.list_keys() assert len(keys) == 0


async def test_list_keys(kvs: KeyValueStore) -> None:
    """Test listing keys from a key-value store with items.

    Writes three keys and checks that `list_keys` returns exactly three entries containing all of them.
    """
    # Add some items
    await kvs.set_value('key1', 'value1')
    await kvs.set_value('key2', 'value2')
    await kvs.set_value('key3', 'value3')

    # List keys
    keys = await kvs.list_keys()

    # Verify keys - membership only, the order of the listing is not asserted
    assert len(keys) == 3
    key_names = [k.key for k in keys]
    assert 'key1' in key_names
    assert 'key2' in key_names
    assert 'key3' in key_names


async def test_list_keys_with_limit(kvs: KeyValueStore) -> None:
    """Test listing keys with a limit parameter.

    Writes ten keys and checks that `list_keys(limit=5)` returns five entries.
    """
    # Add some items
    for i in range(10):
        await kvs.set_value(f'key{i}', f'value{i}')

    # List with limit
    keys = await kvs.list_keys(limit=5)
    assert len(keys) == 5


async def test_list_keys_with_exclusive_start_key(kvs: KeyValueStore) -> None:
    """Test listing keys with an exclusive start key.

    The expected result is derived from an unfiltered listing, so the test does not assume any particular key
    ordering.
    """
    # Add some items in a known order
    await kvs.set_value('key1', 'value1')
    await kvs.set_value('key2', 'value2')
    await kvs.set_value('key3', 'value3')
    await kvs.set_value('key4', 'value4')
    await kvs.set_value('key5', 'value5')

    # Get all keys first to determine their order
    all_keys = await kvs.list_keys()
    all_key_names = [k.key for k in all_keys]

    # NOTE: if fewer than three keys are listed, nothing below runs and the test passes without checking anything
    if len(all_key_names) >= 3:
        # Start from the second key
        start_key = all_key_names[1]
        keys = await kvs.list_keys(exclusive_start_key=start_key)

        # We should get all keys after the start key
        expected_count = len(all_key_names) - all_key_names.index(start_key) - 1
        assert len(keys) == expected_count

        # First key should be the one after start_key
        first_returned_key = keys[0].key
        assert first_returned_key != start_key
        assert all_key_names.index(first_returned_key) > all_key_names.index(start_key)


async def test_iterate_keys(kvs: KeyValueStore) -> None: """Test iterating over keys in the key-value store.""" # Add some items await kvs.set_value('key1',
'value1') await kvs.set_value('key2', 'value2') await kvs.set_value('key3', 'value3') collected_keys = [key async for key in kvs.iterate_keys()] # Verify iteration result assert len(collected_keys) == 3 key_names = [k.key for k in collected_keys] assert 'key1' in key_names assert 'key2' in key_names assert 'key3' in key_names


async def test_iterate_keys_with_limit(kvs: KeyValueStore) -> None:
    """Test iterating over keys with a limit parameter.

    Writes ten keys and checks that `iterate_keys(limit=5)` yields five entries.
    """
    # Add some items
    for i in range(10):
        await kvs.set_value(f'key{i}', f'value{i}')

    # Drain the async iterator into a list
    collected_keys = [key async for key in kvs.iterate_keys(limit=5)]

    # Verify iteration result
    assert len(collected_keys) == 5


async def test_drop(
    storage_client: StorageClient,
) -> None:
    """Test dropping a key-value store removes it from cache and clears its data.

    Verified indirectly: a store opened under the same name after the drop must not contain the old value.
    """
    kvs = await KeyValueStore.open(
        name='drop-test',
        storage_client=storage_client,
    )

    # Add some data
    await kvs.set_value('test', 'data')

    # Drop the key-value store
    await kvs.drop()

    # Verify key-value store is empty (by creating a new one with the same name)
    new_kvs = await KeyValueStore.open(
        name='drop-test',
        storage_client=storage_client,
    )

    # Attempt to get a previously stored value
    result = await new_kvs.get_value('test')
    assert result is None
    await new_kvs.drop()


async def test_reopen_default(
    storage_client: StorageClient,
) -> None:
    """Test reopening the default key-value store.

    Two `open` calls without id, name or alias must return the same cached object, and a value written through
    the first handle must be readable through both.
    """
    # Open the default key-value store
    kvs1 = await KeyValueStore.open(
        storage_client=storage_client,
    )

    # Set a value
    await kvs1.set_value('test_key', 'test_value')

    # Open the default key-value store again
    kvs2 = await KeyValueStore.open(
        storage_client=storage_client,
    )

    # Verify they are the same store
    assert kvs1.id == kvs2.id
    assert kvs1.name == kvs2.name

    # Verify the value is accessible
    value1 = await kvs1.get_value('test_key')
    value2 = await kvs2.get_value('test_key')
    assert value1 == value2 == 'test_value'

    # Verify they are the same object
    # NOTE: this test does not drop the store itself
    assert id(kvs1) == id(kvs2)


async def
test_complex_data_types(kvs: KeyValueStore) -> None: """Test storing and retrieving complex data types.""" # Test nested dictionaries nested_dict = { 'level1': { 'level2': { 'level3': 'deep value', 'numbers': [1, 2, 3], }, }, 'array': [{'a': 1}, {'b': 2}], } await kvs.set_value('nested', nested_dict) result = await kvs.get_value('nested') assert result == nested_dict # Test lists test_list = [1, 'string', True, None, {'key': 'value'}] await kvs.set_value('list', test_list) result = await kvs.get_value('list') assert result == test_list


async def test_string_data(kvs: KeyValueStore) -> None:
    """Test storing and retrieving string data.

    Covers a plain string and a string that happens to contain JSON; the latter must come back as the same
    string rather than as a parsed object.
    """
    # Plain string
    await kvs.set_value('string', 'simple string')
    result = await kvs.get_value('string')
    assert result == 'simple string'

    # JSON string - compared against the string itself, not against the decoded dict
    json_string = json.dumps({'key': 'value'})
    await kvs.set_value('json_string', json_string)
    result = await kvs.get_value('json_string')
    assert result == json_string


async def test_key_with_special_characters(kvs: KeyValueStore) -> None:
    """Test storing and retrieving values with keys containing special characters.

    Exercises set, get, list and delete with a single key made of spaces, slashes and punctuation.
    """
    # Key with spaces, slashes, and special characters
    special_key = 'key with spaces/and/slashes!@#$%^&*()'
    test_value = 'Special key value'

    # Store the value with the special key
    await kvs.set_value(key=special_key, value=test_value)

    # Retrieve the value and verify it matches
    result = await kvs.get_value(key=special_key)
    assert result is not None
    assert result == test_value

    # Make sure the key is properly listed (reported unchanged, exactly as it was written)
    keys = await kvs.list_keys()
    key_names = [k.key for k in keys]
    assert special_key in key_names

    # Test key deletion
    await kvs.delete_value(key=special_key)
    assert await kvs.get_value(key=special_key) is None


async def test_data_persistence_on_reopen() -> None: """Test that data persists when reopening a KeyValueStore.""" kvs1 = await KeyValueStore.open() await kvs1.set_value('key_123', 'value_123') result1 = await kvs1.get_value('key_123') assert result1 == 'value_123'
kvs2 = await KeyValueStore.open() result2 = await kvs2.get_value('key_123') assert result2 == 'value_123' assert await kvs1.list_keys() == await kvs2.list_keys() await kvs2.set_value('key_456', 'value_456') result1 = await kvs1.get_value('key_456') assert result1 == 'value_456'


async def test_purge(
    storage_client: StorageClient,
) -> None:
    """Test purging a key-value store removes all values but keeps the store itself.

    After the purge the store must keep its id and name, list no keys, return None for the old keys and still
    accept new values.
    """
    # First create a key-value store
    kvs = await KeyValueStore.open(
        name='purge-test-kvs',
        storage_client=storage_client,
    )

    # Add some values
    await kvs.set_value('key1', 'value1')
    await kvs.set_value('key2', 'value2')
    await kvs.set_value('key3', {'complex': 'value', 'number': 42})

    # Verify values were added
    keys = await kvs.list_keys()
    assert len(keys) == 3

    # Record the store ID
    kvs_id = kvs.id

    # Purge the key-value store
    await kvs.purge()

    # Verify the store still exists but is empty
    assert kvs.id == kvs_id  # Same ID preserved
    assert kvs.name == 'purge-test-kvs'  # Same name preserved

    # Store should be empty now
    keys = await kvs.list_keys()
    assert len(keys) == 0

    # Values should no longer be accessible
    assert await kvs.get_value('key1') is None
    assert await kvs.get_value('key2') is None
    assert await kvs.get_value('key3') is None

    # Verify we can add new values after purging
    await kvs.set_value('new_key', 'new value after purge')
    value = await kvs.get_value('new_key')
    assert value == 'new value after purge'

    # Clean up
    await kvs.drop()


async def test_record_exists_nonexistent(kvs: KeyValueStore) -> None:
    """Test that record_exists returns False for a nonexistent key."""
    result = await kvs.record_exists('nonexistent-key')
    # Identity check: the result must be the bool False, not merely a falsy value
    assert result is False


async def test_record_exists_after_set(kvs: KeyValueStore) -> None: """Test that record_exists returns True after setting a value.""" test_key = 'exists-key' test_value = {'data': 'test'} # Initially should not exist assert await kvs.record_exists(test_key) is False # Set the value await kvs.set_value(test_key,
test_value) # Now should exist assert await kvs.record_exists(test_key) is True async def test_record_exists_after_delete(kvs: KeyValueStore) -> None: """Test that record_exists returns False after deleting a value.""" test_key = 'exists-then-delete-key' test_value = 'will be deleted' # Set a value await kvs.set_value(test_key, test_value) assert await kvs.record_exists(test_key) is True # Delete the value await kvs.delete_value(test_key) # Should no longer exist assert await kvs.record_exists(test_key) is False async def test_record_exists_with_none_value(kvs: KeyValueStore) -> None: """Test that record_exists returns True even when value is None.""" test_key = 'none-value-key' # Set None as value await kvs.set_value(test_key, None) # Should still exist even though value is None assert await kvs.record_exists(test_key) is True # Verify we can distinguish between None value and nonexistent key assert await kvs.get_value(test_key) is None assert await kvs.record_exists(test_key) is True assert await kvs.record_exists('truly-nonexistent') is False async def test_record_exists_different_content_types(kvs: KeyValueStore) -> None: """Test record_exists with different content types.""" test_cases = [ ('json-key', {'data': 'json'}, 'application/json'), ('text-key', 'plain text', 'text/plain'), ('binary-key', b'binary data', 'application/octet-stream'), ] for key, value, content_type in test_cases: # Set value with specific content type await kvs.set_value(key, value, content_type=content_type) # Should exist regardless of content type assert await kvs.record_exists(key) is True async def test_record_exists_multiple_keys(kvs: KeyValueStore) -> None: """Test record_exists with multiple keys and batch operations.""" keys_and_values = [ ('key1', 'value1'), ('key2', {'nested': 'object'}), ('key3', [1, 2, 3]), ('key4', None), ] # Initially, none should exist for key, _ in keys_and_values: assert await kvs.record_exists(key) is False # Set all values for key, value in 
keys_and_values: await kvs.set_value(key, value) # All should exist now for key, _ in keys_and_values: assert await kvs.record_exists(key) is True # Test some non-existent keys assert await kvs.record_exists('nonexistent1') is False assert await kvs.record_exists('nonexistent2') is False async def test_record_exists_after_purge(kvs: KeyValueStore) -> None: """Test that record_exists returns False after purging the store.""" # Set some values await kvs.set_value('key1', 'value1') await kvs.set_value('key2', 'value2') # Verify they exist assert await kvs.record_exists('key1') is True assert await kvs.record_exists('key2') is True # Purge the store await kvs.purge() # Should no longer exist assert await kvs.record_exists('key1') is False assert await kvs.record_exists('key2') is False async def test_open_with_alias( storage_client: StorageClient, ) -> None: """Test opening key-value stores with alias parameter for NDU functionality.""" # Create key-value stores with different aliases kvs_1 = await KeyValueStore.open( alias='test_alias_1', storage_client=storage_client, ) kvs_2 = await KeyValueStore.open( alias='test_alias_2', storage_client=storage_client, ) # Verify they have different IDs but no names (unnamed) assert kvs_1.id != kvs_2.id assert kvs_1.name is None assert kvs_2.name is None # Add different data to each await kvs_1.set_value('source', 'alias_1') await kvs_2.set_value('source', 'alias_2') # Verify data isolation value_1 = await kvs_1.get_value('source') value_2 = await kvs_2.get_value('source') assert value_1 == 'alias_1' assert value_2 == 'alias_2' # Clean up await kvs_1.drop() await kvs_2.drop() async def test_alias_caching( storage_client: StorageClient, ) -> None: """Test that key-value stores with same alias return same instance (cached).""" # Open kvs with alias kvs_1 = await KeyValueStore.open( alias='cache_test', storage_client=storage_client, ) # Open again with same alias kvs_2 = await KeyValueStore.open( alias='cache_test', 
storage_client=storage_client, ) # Should be same instance assert kvs_1 is kvs_2 assert kvs_1.id == kvs_2.id # Clean up await kvs_1.drop() async def test_alias_with_id_error( storage_client: StorageClient, ) -> None: """Test that providing both alias and id raises error.""" with pytest.raises( ValueError, match=r'Only one of "id", "name", "alias" can be specified, but following arguments ' r'were specified: "id", "alias".', ): await KeyValueStore.open( id='some-id', alias='some-alias', storage_client=storage_client, ) async def test_alias_with_name_error( storage_client: StorageClient, ) -> None: """Test that providing both alias and name raises error.""" with pytest.raises( ValueError, match=r'Only one of "id", "name", "alias" can be specified, but following arguments ' r'were specified: "name", "alias".', ): await KeyValueStore.open( name='some-name', alias='some-alias', storage_client=storage_client, ) async def test_alias_with_special_characters( storage_client: StorageClient, ) -> None: """Test alias functionality with special characters.""" special_aliases = [ 'alias-with-dashes', 'alias_with_underscores', 'alias.with.dots', 'alias123with456numbers', 'CamelCaseAlias', ] stores = [] for alias in special_aliases: kvs = await KeyValueStore.open( alias=alias, storage_client=storage_client, ) stores.append(kvs) # Add data with the alias as identifier await kvs.set_value('alias_used', alias) await kvs.set_value('test', 'special_chars') # Verify all work correctly for i, kvs in enumerate(stores): assert await kvs.get_value('alias_used') == special_aliases[i] assert await kvs.get_value('test') == 'special_chars' # Clean up for kvs in stores: await kvs.drop() async def test_alias_key_operations( storage_client: StorageClient, ) -> None: """Test that key operations work correctly with alias stores.""" kvs = await KeyValueStore.open( alias='key_ops_test', storage_client=storage_client, ) # Test setting multiple keys test_data = { 'key1': {'data': 'value1', 'number': 1}, 
'key2': 'simple string value', 'key3': [1, 2, 3, 4, 5], 'key4': None, } for key, value in test_data.items(): await kvs.set_value(key, value) # Test getting all keys keys = await kvs.list_keys() key_names = [k.key for k in keys] assert len(keys) == 4 for key in test_data: assert key in key_names # Test record_exists for key in test_data: assert await kvs.record_exists(key) is True assert await kvs.record_exists('nonexistent') is False # Test iteration collected_keys = [key async for key in kvs.iterate_keys()] assert len(collected_keys) == 4 # Test deletion await kvs.delete_value('key2') assert await kvs.record_exists('key2') is False assert await kvs.get_value('key2') is None # Verify other keys still exist remaining_keys = await kvs.list_keys() assert len(remaining_keys) == 3 # Clean up await kvs.drop() async def test_named_vs_alias_conflict_detection( storage_client: StorageClient, ) -> None: """Test that conflicts between named and alias storages are detected.""" # Test 1: Create named storage first, then try alias with same name named_kvs = await KeyValueStore.open(name='conflict-test', storage_client=storage_client) assert named_kvs.name == 'conflict-test' # Try to create alias with same name - should raise error with pytest.raises(ValueError, match=r'Cannot create alias storage "conflict-test".*already exists'): await KeyValueStore.open(alias='conflict-test', storage_client=storage_client) # Clean up await named_kvs.drop() # Test 2: Create alias first, then try named with same name alias_kvs = await KeyValueStore.open(alias='conflict-test2', storage_client=storage_client) assert alias_kvs.name is None # Alias storages have no name # Try to create named with same name - should raise error with pytest.raises(ValueError, match=r'Cannot create named storage "conflict-test2".*already exists'): await KeyValueStore.open(name='conflict-test2', storage_client=storage_client) # Clean up await alias_kvs.drop() # Test 3: Different names should work fine named_kvs_ok = 
await KeyValueStore.open(name='different-name', storage_client=storage_client) alias_kvs_ok = await KeyValueStore.open(alias='different-alias', storage_client=storage_client) assert named_kvs_ok.name == 'different-name' assert alias_kvs_ok.name is None # Clean up await named_kvs_ok.drop() await alias_kvs_ok.drop() async def test_alias_parameter( storage_client: StorageClient, ) -> None: """Test key-value store creation and operations with alias parameter.""" # Create kvs with alias alias_kvs = await KeyValueStore.open( alias='test_alias', storage_client=storage_client, ) # Verify alias kvs properties assert alias_kvs.id is not None assert alias_kvs.name is None # Alias storages should be unnamed # Test data operations await alias_kvs.set_value('test_key', {'type': 'alias', 'value': 1}) value = await alias_kvs.get_value('test_key') assert value['type'] == 'alias' await alias_kvs.drop() async def test_alias_vs_named_isolation( storage_client: StorageClient, ) -> None: """Test that alias and named key-value stores with same identifier are isolated.""" # Create named kvs named_kvs = await KeyValueStore.open( name='test-identifier', storage_client=storage_client, ) # Verify named kvs assert named_kvs.name == 'test-identifier' await named_kvs.set_value('type', 'named') # Clean up named kvs first await named_kvs.drop() # Now create alias kvs with same identifier (should work after cleanup) alias_kvs = await KeyValueStore.open( alias='test_identifier', storage_client=storage_client, ) # Should be different instance assert alias_kvs.name is None await alias_kvs.set_value('type', 'alias') # Verify alias data alias_value = await alias_kvs.get_value('type') assert alias_value == 'alias' await alias_kvs.drop() async def test_default_vs_alias_default_equivalence( storage_client: StorageClient, ) -> None: """Test that default key-value store and alias='default' are equivalent.""" # Open default kvs default_kvs = await KeyValueStore.open( storage_client=storage_client, ) 
alias_default_kvs = await KeyValueStore.open( alias=StorageInstanceManager._DEFAULT_STORAGE_ALIAS, storage_client=storage_client, ) # Should be the same assert default_kvs.id == alias_default_kvs.id assert default_kvs.name is None assert alias_default_kvs.name is None # Data should be shared await default_kvs.set_value('source', 'default') value = await alias_default_kvs.get_value('source') assert value == 'default' await default_kvs.drop() async def test_multiple_alias_isolation( storage_client: StorageClient, ) -> None: """Test that different aliases create separate key-value stores.""" kvs_stores = [] for i in range(3): kvs = await KeyValueStore.open( alias=f'alias_{i}', storage_client=storage_client, ) await kvs.set_value('alias', f'alias_{i}') await kvs.set_value('index', i) kvs_stores.append(kvs) # All should be different for i in range(3): for j in range(i + 1, 3): assert kvs_stores[i].id != kvs_stores[j].id # Verify data isolation for i, kvs in enumerate(kvs_stores): alias_value = await kvs.get_value('alias') index_value = await kvs.get_value('index') assert alias_value == f'alias_{i}' # For memory storage, value is preserved as int; for filesystem it's converted to string assert index_value == i or index_value == str(i) await kvs.drop() async def test_purge_on_start_enabled(storage_client: StorageClient) -> None: """Test purge behavior when purge_on_start=True: named storages retain data, unnamed storages are purged.""" # Skip this test for memory storage since it doesn't persist data between client instances. if isinstance(storage_client, MemoryStorageClient): pytest.skip('Memory storage does not persist data between client instances.') configuration = Configuration(purge_on_start=True) # First, create all storage types with purge enabled and add data. 
default_kvs = await KeyValueStore.open(
        storage_client=storage_client,
        configuration=configuration,
    )
    alias_kvs = await KeyValueStore.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=configuration,
    )
    named_kvs = await KeyValueStore.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=configuration,
    )

    # The stored value states the outcome expected for each store after the reopen below.
    await default_kvs.set_value(key='data', value='should_be_purged')
    await alias_kvs.set_value(key='data', value='should_be_purged')
    await named_kvs.set_value(key='data', value='should_persist')

    # Verify data was added
    default_data = await default_kvs.get_value(key='data')
    alias_data = await alias_kvs.get_value(key='data')
    named_data = await named_kvs.get_value(key='data')

    assert default_data == 'should_be_purged'
    assert alias_data == 'should_be_purged'
    assert named_data == 'should_persist'

    # Verify that default and alias storages are unnamed
    default_metadata = await default_kvs.get_metadata()
    alias_metadata = await alias_kvs.get_metadata()
    named_metadata = await named_kvs.get_metadata()

    assert default_metadata.name is None
    assert alias_metadata.name is None
    assert named_metadata.name == 'purge-test-named'

    # Clear storage cache to simulate "reopening" storages
    service_locator.storage_instance_manager.clear_cache()

    # Now "reopen" all storages with the same client and configuration
    default_kvs_2 = await KeyValueStore.open(
        storage_client=storage_client,
        configuration=configuration,
    )
    alias_kvs_2 = await KeyValueStore.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=configuration,
    )
    named_kvs_2 = await KeyValueStore.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=configuration,
    )

    # Check the data after purge
    default_data_after = await default_kvs_2.get_value(key='data')
    alias_data_after = await alias_kvs_2.get_value(key='data')
    named_data_after = await named_kvs_2.get_value(key='data')

    # Unnamed storages (alias and default) should be purged (data removed)
    assert default_data_after is None
    assert alias_data_after is None

    # Named storage should retain data (not purged)
    assert named_data_after == 'should_persist'

    # Clean up
    await named_kvs_2.drop()
    await alias_kvs_2.drop()
    await default_kvs_2.drop()


async def test_purge_on_start_disabled(storage_client: StorageClient) -> None:
    """Test purge behavior when purge_on_start=False: all storages retain data regardless of type."""

    # Skip this test for memory storage since it doesn't persist data between client instances.
    if isinstance(storage_client, MemoryStorageClient):
        pytest.skip('Memory storage does not persist data between client instances.')

    configuration = Configuration(purge_on_start=False)

    # First, create all storage types with purge disabled and add data.
    default_kvs = await KeyValueStore.open(
        storage_client=storage_client,
        configuration=configuration,
    )
    alias_kvs = await KeyValueStore.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=configuration,
    )
    named_kvs = await KeyValueStore.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=configuration,
    )

    # With purging disabled, the same value is expected to survive in all three stores.
    await default_kvs.set_value('data', 'should_persist')
    await alias_kvs.set_value('data', 'should_persist')
    await named_kvs.set_value('data', 'should_persist')

    # Verify data was added
    default_data = await default_kvs.get_value('data')
    alias_data = await alias_kvs.get_value('data')
    named_data = await named_kvs.get_value('data')

    assert default_data == 'should_persist'
    assert alias_data == 'should_persist'
    assert named_data == 'should_persist'

    # Clear storage cache to simulate "reopening" storages
    service_locator.storage_instance_manager.clear_cache()

    # Now "reopen" all storages with the same client and configuration
    default_kvs_2 = await KeyValueStore.open(
        storage_client=storage_client,
        configuration=configuration,
    )
    alias_kvs_2 = await KeyValueStore.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=configuration,
    )
    named_kvs_2 = await KeyValueStore.open(
        name='purge-test-named',
        storage_client=storage_client,
configuration=configuration, ) # Check the data after reopen default_data_after = await default_kvs_2.get_value('data') alias_data_after = await alias_kvs_2.get_value('data') named_data_after = await named_kvs_2.get_value('data') # All storages should retain data when purge is disabled assert default_data_after == 'should_persist' assert alias_data_after == 'should_persist' assert named_data_after == 'should_persist' # Clean up await named_kvs_2.drop() await alias_kvs_2.drop() await default_kvs_2.drop() async def test_name_default_not_allowed(storage_client: StorageClient) -> None: """Test that storage can't have default alias as name, to prevent collisions with unnamed storage alias.""" with pytest.raises( ValueError, match=f'Storage name cannot be "{StorageInstanceManager._DEFAULT_STORAGE_ALIAS}" as ' f'it is reserved for default alias.', ): await KeyValueStore.open(name=StorageInstanceManager._DEFAULT_STORAGE_ALIAS, storage_client=storage_client) @pytest.mark.parametrize( ('name', 'is_valid'), [ pytest.param('F', True, id='single-char'), pytest.param('7', True, id='single-digit'), pytest.param('FtghdfseySds', True, id='mixed-case'), pytest.param('125673450', True, id='all-digits'), pytest.param('Ft2134Sfe0O1hf', True, id='mixed-alphanumeric'), pytest.param('name-with-dashes', True, id='dashes'), pytest.param('1-value', True, id='number start'), pytest.param('value-1', True, id='number end'), pytest.param('test-1-value', True, id='number middle'), pytest.param('test-------value', True, id='multiple-dashes'), pytest.param('test-VALUES-test', True, id='multiple-cases'), pytest.param('name_with_underscores', False, id='underscores'), pytest.param('name with spaces', False, id='spaces'), pytest.param('-test', False, id='dashes start'), pytest.param('test-', False, id='dashes end'), ], ) async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None: """Test name validation logic.""" if is_valid: # Should not raise dataset = await 
KeyValueStore.open(name=name, storage_client=storage_client) assert dataset.name == name await dataset.drop() else: with pytest.raises(ValueError, match=rf'Invalid storage name "{name}".*'): await KeyValueStore.open(name=name, storage_client=storage_client) @pytest.mark.parametrize( 'tested_storage_client_class', [ pytest.param(MemoryStorageClient, id='tested=MemoryStorageClient'), pytest.param(FileSystemStorageClient, id='tested=FileSystemStorageClient'), pytest.param(SqlStorageClient, id='tested=SqlStorageClient'), ], ) @pytest.mark.parametrize( 'global_storage_client_class', [ pytest.param(MemoryStorageClient, id='global=MemoryStorageClient'), pytest.param(FileSystemStorageClient, id='global=FileSystemStorageClient'), pytest.param(SqlStorageClient, id='global=SqlStorageClient'), ], ) async def test_get_auto_saved_value_various_global_clients( tmp_path: Path, tested_storage_client_class: type[StorageClient], global_storage_client_class: type[StorageClient] ) -> None: """Ensure that persistence is working for all clients regardless of what is set in service locator.""" tested_storage_client = tested_storage_client_class() global_storage_client = global_storage_client_class() service_locator.set_configuration( Configuration( storage_dir=str(tmp_path), purge_on_start=True, ) ) service_locator.set_storage_client(global_storage_client) kvs = await KeyValueStore.open(storage_client=tested_storage_client) values_kvs = {'key': 'some_value'} test_key = 'test_key' autosaved_value_kvs = await kvs.get_auto_saved_value(test_key) assert autosaved_value_kvs == {} autosaved_value_kvs.update(values_kvs) await kvs.persist_autosaved_values() assert await kvs.get_value(test_key) == autosaved_value_kvs async def test_record_with_noascii_chars(kvs: KeyValueStore) -> None: """Test storing and retrieving a record with non-ASCII characters.""" init_value = { 'record_1': 'Supermaxi El Jardín', 'record_2': 'záznam dva', 'record_3': '記録三', } key = 'non_ascii_key' # Save the record in the 
key-value store await kvs.set_value(key, init_value) # Get the record and verify value = await kvs.get_value(key) assert value is not None assert value == init_value ================================================ FILE: tests/unit/storages/test_request_manager_tandem.py ================================================ from __future__ import annotations from dataclasses import dataclass from unittest.mock import create_autospec import pytest from crawlee import Request from crawlee.request_loaders import RequestLoader, RequestManagerTandem from crawlee.storages import RequestQueue @dataclass class TestInput: __test__ = False request_loader_items: list[str | Request | None] request_manager_items: list[str | Request] discovered_items: list[Request] expected_result: set[str] @pytest.mark.parametrize( argnames='test_input', argvalues=[ pytest.param( TestInput( request_loader_items=['https://a.placeholder.com', 'https://b.placeholder.com'], request_manager_items=[], discovered_items=[Request.from_url('https://c.placeholder.com')], expected_result={ 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', }, ), id='basic_usage', ), pytest.param( TestInput( request_loader_items=[ Request.from_url('https://a.placeholder.com'), None, Request.from_url('https://c.placeholder.com'), ], request_manager_items=['https://b.placeholder.com', 'http://d.com'], discovered_items=[], expected_result={ 'https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com', 'http://d.com', }, ), id='wait_for_read_only_source', ), ], ) async def test_basic_functionality(test_input: TestInput) -> None: request_queue = await RequestQueue.open() if test_input.request_manager_items: await request_queue.add_requests(test_input.request_manager_items) mock_request_loader = create_autospec(RequestLoader, instance=True, spec_set=True) mock_request_loader.fetch_next_request.side_effect = lambda: test_input.request_loader_items.pop(0) 
mock_request_loader.is_finished.side_effect = lambda: len(test_input.request_loader_items) == 0 tandem = RequestManagerTandem(mock_request_loader, request_queue) processed = set[str]() while not await tandem.is_finished(): request = await tandem.fetch_next_request() assert request is not None processed.add(request.url) for new_request in test_input.discovered_items: await tandem.add_request(new_request) await tandem.mark_request_as_handled(request) assert processed == test_input.expected_result ================================================ FILE: tests/unit/storages/test_request_queue.py ================================================ from __future__ import annotations import asyncio from datetime import timedelta from typing import TYPE_CHECKING import pytest from crawlee import Request, service_locator from crawlee.configuration import Configuration from crawlee.storage_clients import MemoryStorageClient, StorageClient from crawlee.storages import RequestQueue from crawlee.storages._storage_instance_manager import StorageInstanceManager if TYPE_CHECKING: from collections.abc import AsyncGenerator from crawlee.storage_clients import StorageClient @pytest.fixture async def rq( storage_client: StorageClient, ) -> AsyncGenerator[RequestQueue, None]: """Fixture that provides a request queue instance for each test.""" rq = await RequestQueue.open( storage_client=storage_client, ) yield rq await rq.drop() async def test_open_creates_new_rq( storage_client: StorageClient, ) -> None: """Test that open() creates a new request queue with proper metadata.""" rq = await RequestQueue.open( name='new-request-queue', storage_client=storage_client, ) # Verify request queue properties assert rq.id is not None assert rq.name == 'new-request-queue' metadata = await rq.get_metadata() assert metadata.pending_request_count == 0 assert metadata.handled_request_count == 0 assert metadata.total_request_count == 0 await rq.drop() async def test_open_existing_rq( rq: RequestQueue, 
storage_client: StorageClient, ) -> None: """Test that open() loads an existing request queue correctly.""" # Open the same request queue again reopened_rq = await RequestQueue.open( name=rq.name, storage_client=storage_client, ) # Verify request queue properties assert rq.id == reopened_rq.id assert rq.name == reopened_rq.name # Verify they are the same object (from cache) assert id(rq) == id(reopened_rq) async def test_open_with_id_and_name( storage_client: StorageClient, ) -> None: """Test that open() raises an error when both id and name are provided.""" with pytest.raises( ValueError, match=r'Only one of "id", "name", "alias" can be specified, but following arguments ' r'were specified: "id", "name".', ): await RequestQueue.open( id='some-id', name='some-name', storage_client=storage_client, ) async def test_open_by_id( storage_client: StorageClient, ) -> None: """Test opening a request queue by its ID.""" # First create a request queue by name rq1 = await RequestQueue.open( name='rq-by-id-test', storage_client=storage_client, ) # Add a request to identify it await rq1.add_request('https://example.com/open-by-id-test') # Open the request queue by ID rq2 = await RequestQueue.open( id=rq1.id, storage_client=storage_client, ) # Verify it's the same request queue assert rq2.id == rq1.id assert rq2.name == 'rq-by-id-test' # Verify the request is still there request = await rq2.fetch_next_request() assert request is not None assert request.url == 'https://example.com/open-by-id-test' # Clean up await rq2.drop() async def test_add_request_string_url(rq: RequestQueue) -> None: """Test adding a request with a string URL.""" # Add a request with a string URL url = 'https://example.com' result = await rq.add_request(url) # Verify request was added assert result is not None assert result.unique_key is not None assert result.was_already_present is False assert result.was_already_handled is False # Verify the queue stats were updated metadata = await rq.get_metadata() 
assert metadata.total_request_count == 1 assert metadata.pending_request_count == 1 async def test_add_request_object(rq: RequestQueue) -> None: """Test adding a request object.""" # Create and add a request object request = Request.from_url(url='https://example.com', user_data={'key': 'value'}) result = await rq.add_request(request) # Verify request was added assert result is not None assert result.unique_key is not None assert result.was_already_present is False assert result.was_already_handled is False # Verify the queue stats were updated metadata = await rq.get_metadata() assert metadata.total_request_count == 1 assert metadata.pending_request_count == 1 async def test_add_duplicate_request(rq: RequestQueue) -> None: """Test adding a duplicate request to the queue.""" # Add a request url = 'https://example.com' first_result = await rq.add_request(url) assert first_result is not None # Add the same request again second_result = await rq.add_request(url) # Verify the second request was detected as duplicate assert second_result is not None assert second_result.was_already_present is True assert second_result.unique_key == first_result.unique_key # Verify the queue stats weren't incremented twice metadata = await rq.get_metadata() assert metadata.total_request_count == 1 assert metadata.pending_request_count == 1 async def test_add_requests_batch(rq: RequestQueue) -> None: """Test adding multiple requests in a batch.""" # Create a batch of requests urls = [ 'https://example.com/page1', 'https://example.com/page2', 'https://example.com/page3', ] # Add the requests await rq.add_requests(urls) # Wait for all background tasks to complete await asyncio.sleep(0.1) # Verify the queue stats metadata = await rq.get_metadata() assert metadata.total_request_count == 3 assert metadata.pending_request_count == 3 async def test_add_requests_batch_with_forefront(rq: RequestQueue) -> None: """Test adding multiple requests in a batch with forefront option.""" # Add some initial 
requests await rq.add_request('https://example.com/page1') await rq.add_request('https://example.com/page2') # Add a batch of priority requests at the forefront await rq.add_requests( [ 'https://example.com/priority1', 'https://example.com/priority2', 'https://example.com/priority3', ], forefront=True, ) # Wait for all background tasks to complete await asyncio.sleep(0.1) # Fetch requests - they should come out in priority order first next_request1 = await rq.fetch_next_request() assert next_request1 is not None assert next_request1.url.startswith('https://example.com/priority') next_request2 = await rq.fetch_next_request() assert next_request2 is not None assert next_request2.url.startswith('https://example.com/priority') next_request3 = await rq.fetch_next_request() assert next_request3 is not None assert next_request3.url.startswith('https://example.com/priority') # Now we should get the original requests next_request4 = await rq.fetch_next_request() assert next_request4 is not None assert next_request4.url == 'https://example.com/page1' next_request5 = await rq.fetch_next_request() assert next_request5 is not None assert next_request5.url == 'https://example.com/page2' # Queue should be empty now next_request6 = await rq.fetch_next_request() assert next_request6 is None async def test_add_requests_with_forefront(rq: RequestQueue) -> None: """Test adding requests to the front of the queue.""" # Add some initial requests await rq.add_request('https://example.com/page1') await rq.add_request('https://example.com/page2') # Add a priority request at the forefront await rq.add_request('https://example.com/priority', forefront=True) # Fetch the next request - should be the priority one next_request = await rq.fetch_next_request() assert next_request is not None assert next_request.url == 'https://example.com/priority' async def test_add_requests_mixed_forefront(rq: RequestQueue) -> None: """Test the ordering when adding requests with mixed forefront values.""" # Add 
normal requests await rq.add_request('https://example.com/normal1') await rq.add_request('https://example.com/normal2') # Add a batch with forefront=True await rq.add_requests( ['https://example.com/priority1', 'https://example.com/priority2'], forefront=True, ) # Add another normal request await rq.add_request('https://example.com/normal3') # Add another priority request await rq.add_request('https://example.com/priority3', forefront=True) # Wait for background tasks await asyncio.sleep(0.1) # The expected order should be: # 1. priority3 (most recent forefront) # 2. priority1 (from batch, forefront) # 3. priority2 (from batch, forefront) # 4. normal1 (oldest normal) # 5. normal2 # 6. normal3 (newest normal) requests = [] while True: req = await rq.fetch_next_request() if req is None: break requests.append(req) await rq.mark_request_as_handled(req) assert len(requests) == 6 assert requests[0].url == 'https://example.com/priority3' # The next two should be from the forefront batch (exact order within batch may vary) batch_urls = {requests[1].url, requests[2].url} assert 'https://example.com/priority1' in batch_urls assert 'https://example.com/priority2' in batch_urls # Then the normal requests in order assert requests[3].url == 'https://example.com/normal1' assert requests[4].url == 'https://example.com/normal2' assert requests[5].url == 'https://example.com/normal3' async def test_fetch_next_request_and_mark_handled(rq: RequestQueue) -> None: """Test fetching and marking requests as handled.""" # Add some requests await rq.add_request('https://example.com/page1') await rq.add_request('https://example.com/page2') # Fetch first request request1 = await rq.fetch_next_request() assert request1 is not None assert request1.url == 'https://example.com/page1' # Mark the request as handled result = await rq.mark_request_as_handled(request1) assert result is not None assert result.was_already_handled is True # Fetch next request request2 = await rq.fetch_next_request() 
assert request2 is not None assert request2.url == 'https://example.com/page2' # Mark the second request as handled await rq.mark_request_as_handled(request2) # Verify counts metadata = await rq.get_metadata() assert metadata.total_request_count == 2 assert metadata.handled_request_count == 2 assert metadata.pending_request_count == 0 # Verify queue is empty empty_request: Request | None = await rq.fetch_next_request() assert empty_request is None async def test_get_request_by_id(rq: RequestQueue) -> None: """Test retrieving a request by its ID.""" # Add a request added_result = await rq.add_request('https://example.com') assert added_result is not None unique_key = added_result.unique_key # Retrieve the request by ID retrieved_request = await rq.get_request(unique_key) assert retrieved_request is not None assert retrieved_request.unique_key == unique_key assert retrieved_request.url == 'https://example.com' async def test_handled_request_records_persistence(rq: RequestQueue) -> None: request = Request.from_url('https://example.com/1') await rq.add_request(request) fetched_request = await rq.fetch_next_request() assert isinstance(fetched_request, Request) await rq.mark_request_as_handled(fetched_request) fetched_request = await rq.get_request(request.unique_key) assert isinstance(fetched_request, Request) assert fetched_request.unique_key == request.unique_key async def test_get_non_existent_request(rq: RequestQueue) -> None: """Test retrieving a request that doesn't exist.""" non_existent_request = await rq.get_request('non-existent-id') assert non_existent_request is None async def test_reclaim_request(rq: RequestQueue) -> None: """Test reclaiming a request that failed processing.""" # Add a request await rq.add_request('https://example.com') # Fetch the request request = await rq.fetch_next_request() assert request is not None # Reclaim the request result = await rq.reclaim_request(request) assert result is not None assert result.was_already_handled is False # 
Verify we can fetch it again reclaimed_request = await rq.fetch_next_request() assert reclaimed_request is not None assert reclaimed_request.unique_key == request.unique_key assert reclaimed_request.url == 'https://example.com' async def test_reclaim_request_with_forefront(rq: RequestQueue) -> None: """Test reclaiming a request to the front of the queue.""" # Add requests await rq.add_request('https://example.com/first') await rq.add_request('https://example.com/second') # Fetch the first request first_request = await rq.fetch_next_request() assert first_request is not None assert first_request.url == 'https://example.com/first' # Reclaim it to the forefront await rq.reclaim_request(first_request, forefront=True) # The reclaimed request should be returned first (before the second request) next_request = await rq.fetch_next_request() assert next_request is not None assert next_request.url == 'https://example.com/first' async def test_is_empty(rq: RequestQueue) -> None: """Test checking if a request queue is empty.""" # Initially the queue should be empty assert await rq.is_empty() is True # Add a request await rq.add_request('https://example.com') assert await rq.is_empty() is False # Fetch and handle the request request = await rq.fetch_next_request() assert request is not None await rq.mark_request_as_handled(request) # Queue should be empty again assert await rq.is_empty() is True @pytest.mark.parametrize( ('wait_for_all'), [ pytest.param(True, id='wait for all'), pytest.param(False, id='do not wait for all'), ], ) async def test_add_requests_wait_for_all( rq: RequestQueue, *, wait_for_all: bool, ) -> None: """Test adding requests with wait_for_all_requests_to_be_added option.""" urls = [f'https://example.com/{i}' for i in range(15)] # Add requests without waiting await rq.add_requests( urls, batch_size=5, wait_for_all_requests_to_be_added=wait_for_all, wait_time_between_batches=timedelta(milliseconds=50), ) if not wait_for_all: # Immediately after adding, the 
async def test_is_finished(rq: RequestQueue) -> None:
    """Verify `is_finished` before, during, and after processing of directly and background-added requests."""
    # Nothing queued and nothing running in the background: finished.
    finished_initially = await rq.is_finished()
    assert finished_initially is True

    # A pending request means the queue is not finished.
    await rq.add_request('https://example.com')
    finished_with_pending = await rq.is_finished()
    assert finished_with_pending is False

    # Schedule more requests without waiting for all of them to be added.
    background_urls = ['https://example.com/1', 'https://example.com/2']
    await rq.add_requests(background_urls, wait_for_all_requests_to_be_added=False)

    # Still not finished right after scheduling the background additions.
    finished_while_adding = await rq.is_finished()
    assert finished_while_adding is False

    # Give the background additions time to complete.
    await asyncio.sleep(0.2)

    # Drain the queue, marking every fetched request as handled.
    while (pending := await rq.fetch_next_request()) is not None:
        await rq.mark_request_as_handled(pending)

    # Everything handled: the queue is finished again.
    finished_at_end = await rq.is_finished()
    assert finished_at_end is True
async def test_reopen_default(
    storage_client: StorageClient,
) -> None:
    """Open the default request queue twice and check that both handles refer to the same storage."""
    # Forget previously cached storage instances before opening anything.
    service_locator.storage_instance_manager.clear_cache()

    first_rq = await RequestQueue.open(storage_client=storage_client)

    # Leftovers from an earlier run may exist: empty the queue, or drop and reopen it if purging fails.
    try:
        await first_rq.purge()
    except Exception:
        await first_rq.drop()
        first_rq = await RequestQueue.open(storage_client=storage_client)

    # The queue must start without pending requests.
    first_meta = await first_rq.get_metadata()
    assert first_meta.pending_request_count == 0

    # Enqueue one request and confirm it is counted as pending.
    await first_rq.add_request('https://example.com/')
    first_meta = await first_rq.get_metadata()
    assert first_meta.pending_request_count == 1

    # Open the default queue a second time.
    second_rq = await RequestQueue.open(storage_client=storage_client)

    # Both handles must share identity and report identical counters.
    assert first_rq.id == second_rq.id
    assert first_rq.name == second_rq.name
    first_meta = await first_rq.get_metadata()
    second_meta = await second_rq.get_metadata()
    assert first_meta.total_request_count == second_meta.total_request_count
    assert first_meta.pending_request_count == second_meta.pending_request_count
    assert first_meta.handled_request_count == second_meta.handled_request_count

    # The request added through the first handle is fetchable through the second.
    fetched = await second_rq.fetch_next_request()
    assert fetched is not None
    assert fetched.url == 'https://example.com/'

    # Remove the queue so it does not leak into other tests.
    await first_rq.drop()
async def test_alias_caching(
    storage_client: StorageClient,
) -> None:
    """Opening a request queue twice under one alias must return the very same object."""
    first_handle = await RequestQueue.open(alias='cache_test', storage_client=storage_client)
    second_handle = await RequestQueue.open(alias='cache_test', storage_client=storage_client)

    # Identity, not just equality: the second call must return the already opened instance.
    assert first_handle is second_handle
    assert first_handle.id == second_handle.id

    # Remove the queue created for this test.
    await first_handle.drop()
async def test_alias_request_operations(
    storage_client: StorageClient,
) -> None:
    """Run the add / fetch / mark-handled cycle against a queue opened by alias and check its counters."""
    queue = await RequestQueue.open(alias='request_ops_test', storage_client=storage_client)

    urls = [
        'https://example.com/page1',
        'https://example.com/page2',
        'https://example.com/page3',
    ]

    # Every URL is new to the queue, so none may be reported as already present.
    for url in urls:
        add_info = await queue.add_request(url)
        assert add_info is not None
        assert add_info.was_already_present is False

    # All three requests are pending, none handled yet.
    counters = await queue.get_metadata()
    assert counters.total_request_count == 3
    assert counters.pending_request_count == 3
    assert counters.handled_request_count == 0

    # Drain the queue, recording each URL in the order it was served.
    seen_urls = []
    while not await queue.is_empty():
        current = await queue.fetch_next_request()
        if not current:
            continue
        seen_urls.append(current.url)
        await queue.mark_request_as_handled(current)

    # Each URL was served exactly once (order is not checked).
    assert len(seen_urls) == 3
    assert set(seen_urls) == set(urls)

    # Final counters: nothing pending, everything handled.
    counters = await queue.get_metadata()
    assert counters.pending_request_count == 0
    assert counters.handled_request_count == 3
    assert await queue.is_empty() is True

    # Remove the queue created for this test.
    await queue.drop()
async def test_named_vs_alias_conflict_detection(
    storage_client: StorageClient,
) -> None:
    """Test that conflicts between named and alias storages are detected.

    Three scenarios are exercised, all against the storage client supplied by the fixture:

    1. A named queue exists, then an alias with the same identifier is requested -> `ValueError`.
    2. An alias queue exists, then a named queue with the same identifier is requested -> `ValueError`.
    3. A named queue and an alias queue with different identifiers can be opened together.
    """
    # Test 1: Create named storage first, then try alias with same name
    named_rq = await RequestQueue.open(
        name='conflict-test',
        storage_client=storage_client,
    )
    assert named_rq.name == 'conflict-test'

    # Try to create alias with same name - should raise error
    with pytest.raises(ValueError, match=r'Cannot create alias storage "conflict-test".*already exists'):
        await RequestQueue.open(alias='conflict-test', storage_client=storage_client)

    # Clean up
    await named_rq.drop()

    # Test 2: Create alias first, then try named with same name
    alias_rq = await RequestQueue.open(alias='conflict-test2', storage_client=storage_client)
    assert alias_rq.name is None  # Alias storages have no name

    # Try to create named with same name - should raise error
    with pytest.raises(ValueError, match=r'Cannot create named storage "conflict-test2".*already exists'):
        await RequestQueue.open(name='conflict-test2', storage_client=storage_client)

    # Clean up
    await alias_rq.drop()

    # Test 3: Different names should work fine.
    # Pass the fixture's storage client explicitly so this scenario runs against the same client
    # as the two scenarios above instead of whatever client is resolved when none is given.
    named_rq_ok = await RequestQueue.open(name='different-name', storage_client=storage_client)
    alias_rq_ok = await RequestQueue.open(alias='different-alias', storage_client=storage_client)

    assert named_rq_ok.name == 'different-name'
    assert alias_rq_ok.name is None

    # Clean up
    await named_rq_ok.drop()
    await alias_rq_ok.drop()
async def test_multiple_alias_isolation(
    storage_client: StorageClient,
) -> None:
    """Queues opened under distinct aliases must have distinct IDs and must not share requests."""
    opened = []
    for index in range(3):
        queue = await RequestQueue.open(
            alias=f'alias_{index}',
            storage_client=storage_client,
        )
        await queue.add_request(f'https://example.com/alias_{index}')
        opened.append(queue)

    # A set collapses duplicates, so the sizes match only when every ID differs from all others.
    assert len({queue.id for queue in opened}) == len(opened)

    # Each queue must serve the request that was added to it, then it is removed.
    for index, queue in enumerate(opened):
        head = await queue.fetch_next_request()
        assert head is not None
        assert head.url == f'https://example.com/alias_{index}'
        await queue.drop()
if isinstance(storage_client, MemoryStorageClient): pytest.skip('Memory storage does not persist data between client instances.') configuration = Configuration(purge_on_start=True) # First, create all storage types with purge enabled and add data. default_rq = await RequestQueue.open( storage_client=storage_client, configuration=configuration, ) alias_rq = await RequestQueue.open( alias='purge-test-alias', storage_client=storage_client, configuration=configuration, ) named_rq = await RequestQueue.open( name='purge-test-named', storage_client=storage_client, configuration=configuration, ) await default_rq.add_requests( [ 'https://default.example.com/1', 'https://default.example.com/2', 'https://default.example.com/3', ] ) await alias_rq.add_requests( [ 'https://alias.example.com/1', 'https://alias.example.com/2', 'https://alias.example.com/3', ] ) await named_rq.add_requests( [ 'https://named.example.com/1', 'https://named.example.com/2', 'https://named.example.com/3', ] ) default_request = await default_rq.fetch_next_request() alias_request = await alias_rq.fetch_next_request() named_request = await named_rq.fetch_next_request() assert default_request is not None assert alias_request is not None assert named_request is not None await default_rq.mark_request_as_handled(default_request) await alias_rq.mark_request_as_handled(alias_request) await named_rq.mark_request_as_handled(named_request) # Verify data was added default_metadata = await default_rq.get_metadata() alias_metadata = await alias_rq.get_metadata() named_metadata = await named_rq.get_metadata() assert default_metadata.pending_request_count == 2 assert alias_metadata.pending_request_count == 2 assert named_metadata.pending_request_count == 2 assert default_metadata.handled_request_count == 1 assert alias_metadata.handled_request_count == 1 assert named_metadata.handled_request_count == 1 assert default_metadata.total_request_count == 3 assert alias_metadata.total_request_count == 3 assert 
async def test_purge_on_start_disabled(storage_client: StorageClient) -> None:
    """Test purge behavior when purge_on_start=False: all storages retain data regardless of type.

    The default, alias and named queues are each filled with three requests, one of which is
    handled. After the storage instance cache is cleared and the queues are opened again with the
    same configuration, every queue must report the same counters as before.
    """
    # Skip this test for memory storage since it doesn't persist data between client instances.
    if isinstance(storage_client, MemoryStorageClient):
        pytest.skip('Memory storage does not persist data between client instances.')

    configuration = Configuration(purge_on_start=False)

    # First, create all storage types with purge disabled and add data.
    default_rq = await RequestQueue.open(
        storage_client=storage_client,
        configuration=configuration,
    )
    alias_rq = await RequestQueue.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=configuration,
    )
    named_rq = await RequestQueue.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=configuration,
    )

    await default_rq.add_requests(
        [
            'https://default.example.com/1',
            'https://default.example.com/2',
            'https://default.example.com/3',
        ]
    )
    await alias_rq.add_requests(
        [
            'https://alias.example.com/1',
            'https://alias.example.com/2',
            'https://alias.example.com/3',
        ]
    )
    await named_rq.add_requests(
        [
            'https://named.example.com/1',
            'https://named.example.com/2',
            'https://named.example.com/3',
        ]
    )

    # Handle one request per queue so that pending and handled counters differ.
    default_request = await default_rq.fetch_next_request()
    alias_request = await alias_rq.fetch_next_request()
    named_request = await named_rq.fetch_next_request()

    assert default_request is not None
    assert alias_request is not None
    assert named_request is not None

    await default_rq.mark_request_as_handled(default_request)
    await alias_rq.mark_request_as_handled(alias_request)
    await named_rq.mark_request_as_handled(named_request)

    # Verify data was added
    default_metadata = await default_rq.get_metadata()
    alias_metadata = await alias_rq.get_metadata()
    named_metadata = await named_rq.get_metadata()

    assert default_metadata.pending_request_count == 2
    assert alias_metadata.pending_request_count == 2
    assert named_metadata.pending_request_count == 2

    assert default_metadata.handled_request_count == 1
    assert alias_metadata.handled_request_count == 1
    assert named_metadata.handled_request_count == 1

    assert default_metadata.total_request_count == 3
    assert alias_metadata.total_request_count == 3
    assert named_metadata.total_request_count == 3

    # Verify that default and alias storages are unnamed
    assert default_metadata.name is None
    assert alias_metadata.name is None
    assert named_metadata.name == 'purge-test-named'

    # Clear storage cache to simulate "reopening" storages
    service_locator.storage_instance_manager.clear_cache()

    # Now "reopen" all storages
    default_rq_2 = await RequestQueue.open(
        storage_client=storage_client,
        configuration=configuration,
    )
    alias_rq_2 = await RequestQueue.open(
        alias='purge-test-alias',
        storage_client=storage_client,
        configuration=configuration,
    )
    named_rq_2 = await RequestQueue.open(
        name='purge-test-named',
        storage_client=storage_client,
        configuration=configuration,
    )

    # Check the data after reopening; with purge_on_start=False nothing is expected to be purged
    default_metadata_after = await default_rq_2.get_metadata()
    alias_metadata_after = await alias_rq_2.get_metadata()
    named_metadata_after = await named_rq_2.get_metadata()

    # All storages (default, alias and named) should retain their data
    assert default_metadata_after.pending_request_count == 2
    assert alias_metadata_after.pending_request_count == 2
    assert named_metadata_after.pending_request_count == 2

    assert default_metadata_after.handled_request_count == 1
    assert alias_metadata_after.handled_request_count == 1
    assert named_metadata_after.handled_request_count == 1

    assert default_metadata_after.total_request_count == 3
    assert alias_metadata_after.total_request_count == 3
    assert named_metadata_after.total_request_count == 3

    # Clean up
    await named_rq_2.drop()
    await alias_rq_2.drop()
    await default_rq_2.drop()
@pytest.mark.parametrize(
    ('name', 'is_valid'),
    [
        pytest.param('F', True, id='single-char'),
        pytest.param('7', True, id='single-digit'),
        pytest.param('FtghdfseySds', True, id='mixed-case'),
        pytest.param('125673450', True, id='all-digits'),
        pytest.param('Ft2134Sfe0O1hf', True, id='mixed-alphanumeric'),
        pytest.param('name-with-dashes', True, id='dashes'),
        pytest.param('1-value', True, id='number start'),
        pytest.param('value-1', True, id='number end'),
        pytest.param('test-1-value', True, id='number middle'),
        pytest.param('test-------value', True, id='multiple-dashes'),
        pytest.param('test-VALUES-test', True, id='multiple-cases'),
        pytest.param('name_with_underscores', False, id='underscores'),
        pytest.param('name with spaces', False, id='spaces'),
        pytest.param('-test', False, id='dashes start'),
        pytest.param('test-', False, id='dashes end'),
    ],
)
async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None:
    """Check which storage names `RequestQueue.open` accepts and which it rejects with `ValueError`."""
    if not is_valid:
        # Rejected names must be reported in the error message.
        with pytest.raises(ValueError, match=rf'Invalid storage name "{name}".*'):
            await RequestQueue.open(name=name, storage_client=storage_client)
        return

    # Accepted names open without error and are reported back unchanged.
    queue = await RequestQueue.open(name=name, storage_client=storage_client)
    assert queue.name == name
    await queue.drop()
async def test_request_with_noascii_chars(rq: RequestQueue) -> None:
    """Check that non-ASCII text stored in a request's user data comes back unchanged from the queue."""
    # Values with accented Latin and CJK characters.
    payload = dict(
        record_1='Supermaxi El Jardín',
        record_2='záznam dva',
        record_3='記録三',
    )
    original = Request.from_url('https://crawlee.dev', user_data=payload)

    # Store the request, then read it back.
    await rq.add_request(original)
    fetched = await rq.fetch_next_request()

    assert fetched is not None
    assert fetched.url == 'https://crawlee.dev'
    assert fetched.user_data == original.user_data
async def test_unique_storage_by_name(storage_type: type[Storage]) -> None:
    """Two storages opened through one client under different names must be different instances."""
    client = MemoryStorageClient()
    first = await storage_type.open(storage_client=client, name='kvs1')
    second = await storage_type.open(storage_client=client, name='kvs2')
    assert first is not second
async def test_identical_storage_default_storage(storage_type: type[Storage]) -> None:
    """Opening a storage twice with no arguments at all must return the same instance."""
    first = await storage_type.open()
    second = await storage_type.open()
    assert first is second
@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.')
async def test_concurrent_open_datasets() -> None:
    """Two coroutines that open the same named dataset at the same time must both write into it."""
    from asyncio import Barrier  # type:ignore[attr-defined] # noqa: PLC0415

    start_gate = Barrier(2)

    async def push_data(data: dict) -> None:
        # Each writer waits here until the other arrives, so both reach `Dataset.open` together.
        await start_gate.wait()
        opened = await Dataset.open(name='concurrent-storage')
        await opened.push_data(data)

    payloads = [{'test_1': '1'}, {'test_2': '2'}]
    await asyncio.gather(*(push_data(payload) for payload in payloads))

    # Both pushed items must be present in the dataset of that name.
    dataset = await Dataset.open(name='concurrent-storage')
    stored = await dataset.get_data()
    assert len(stored.items) == 2

    await dataset.drop()
None: await barrier.wait() try: await Dataset.open(name=name, alias=alias) valid_kwargs['name'] = name valid_kwargs['alias'] = alias except ValueError: await exception_calls() await asyncio.gather( open_dataset(name=None, alias='concurrent-storage'), open_dataset(name='concurrent-storage', alias=None), ) # Ensure that a ValueError was raised due to name/alias conflict exception_calls.assert_called_once() dataset = await Dataset.open(name=valid_kwargs.get('name'), alias=valid_kwargs.get('alias')) await dataset.drop() ================================================ FILE: tests/unit/test_cli.py ================================================ from __future__ import annotations import os from unittest.mock import ANY, Mock import pytest import readchar from typer.testing import CliRunner import crawlee._cli runner = CliRunner() @pytest.fixture def mock_cookiecutter(monkeypatch: pytest.MonkeyPatch) -> Mock: mock_cookiecutter = Mock() monkeypatch.setattr(target=crawlee._cli, name='cookiecutter', value=mock_cookiecutter) return mock_cookiecutter def test_create_interactive(mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch) -> None: mock_input = iter( [ *'my_project', readchar.key.ENTER, readchar.key.ENTER, readchar.key.ENTER, readchar.key.ENTER, readchar.key.ENTER, readchar.key.ENTER, readchar.key.ENTER, ] ) monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input)) result = runner.invoke(crawlee._cli.cli, ['create']) assert 'Your project "my_project" was created.' 
def test_create_existing_folder(
    mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory
) -> None:
    """A project name given on the command line that already exists as a folder is rejected.

    The CLI output must mention the clash, and cookiecutter must end up being
    called with the replacement name typed through the mocked keyboard input.
    """
    mock_input = iter(
        [
            *'my_project',
            readchar.key.ENTER,
        ]
    )
    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))

    tmp = tmp_path_factory.mktemp('workdir')
    # monkeypatch.chdir restores the previous working directory at teardown;
    # a bare os.chdir would leak the change into every test that runs afterwards.
    monkeypatch.chdir(tmp)
    (tmp / 'existing_project').mkdir()

    result = runner.invoke(
        crawlee._cli.cli,
        [
            'create',
            'existing_project',
            '--crawler-type',
            'playwright',
            '--http-client',
            'httpx',
            '--package-manager',
            'pip',
            '--start-url',
            'https://yr.no',
            '--no-apify',
            '--install',
        ],
    )
    assert 'existing_project already exists' in result.output

    mock_cookiecutter.assert_called_with(
        template=ANY,
        no_input=True,
        extra_context={
            'project_name': 'my_project',
            'package_manager': 'pip',
            'crawler_type': 'playwright',
            'http_client': 'httpx',
            'start_url': 'https://yr.no',
            'enable_apify_integration': False,
            'install_project': True,
        },
    )


def test_create_existing_folder_interactive(
    mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory
) -> None:
    """An already existing folder name typed at the interactive prompt is rejected once.

    The second typed name is free, and the remaining prompts are answered with ENTER.
    """
    mock_input = iter(
        [
            *'existing_project',
            readchar.key.ENTER,
            *'my_project',
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
        ]
    )
    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))

    tmp = tmp_path_factory.mktemp('workdir')
    # Scoped to this test: undone automatically by the monkeypatch fixture.
    monkeypatch.chdir(tmp)
    (tmp / 'existing_project').mkdir()

    result = runner.invoke(crawlee._cli.cli, ['create', '--template', 'playwright'])
    assert 'existing_project already exists' in result.output

    mock_cookiecutter.assert_called_with(
        template=ANY,
        no_input=True,
        extra_context={
            'project_name': 'my_project',
            'package_manager': 'poetry',
            'crawler_type': 'playwright',
            'http_client': 'impit',
            'start_url': 'https://crawlee.dev',
            'enable_apify_integration': False,
            'install_project': True,
        },
    )


def test_create_existing_folder_interactive_multiple_attempts(
    mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory
) -> None:
    """Two already existing folder names are typed in a row before a free name is entered."""
    mock_input = iter(
        [
            *'existing_project',
            readchar.key.ENTER,
            *'existing_project_2',
            readchar.key.ENTER,
            *'my_project',
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
            readchar.key.ENTER,
        ]
    )
    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))

    tmp = tmp_path_factory.mktemp('workdir')
    # Scoped to this test: undone automatically by the monkeypatch fixture.
    monkeypatch.chdir(tmp)
    (tmp / 'existing_project').mkdir()
    (tmp / 'existing_project_2').mkdir()

    result = runner.invoke(crawlee._cli.cli, ['create', '--crawler-type', 'playwright'])
    assert 'existing_project already exists' in result.output

    mock_cookiecutter.assert_called_with(
        template=ANY,
        no_input=True,
        extra_context={
            'project_name': 'my_project',
            'package_manager': 'poetry',
            'crawler_type': 'playwright',
            'http_client': 'impit',
            'start_url': 'https://crawlee.dev',
            'enable_apify_integration': False,
            'install_project': True,
        },
    )
async def test_storage_persisted_with_explicit_statistics_with_persistable_storage(
    tmp_path: Path, server_url: URL
) -> None:
    """Crawler gets a MemoryStorageClient, but its explicitly passed statistics have persistence enabled.

    A FileSystemStorageClient is registered globally beforehand; the test
    expects the run to leave at least one entry in the storage directory.
    """
    global_configuration = Configuration(storage_dir=str(tmp_path))
    service_locator.set_configuration(global_configuration)
    service_locator.set_storage_client(FileSystemStorageClient())

    memory_client = MemoryStorageClient()
    persisting_statistics = Statistics.with_default_state(persistence_enabled=True)
    crawler = HttpCrawler(storage_client=memory_client, statistics=persisting_statistics)

    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    start_urls = [str(server_url)]
    await crawler.run(start_urls)

    # Verify that files were created in the storage directory.
    created_entries = []
    async for entry in AnyioPath(tmp_path).iterdir():
        created_entries.append(entry)

    assert created_entries != [], 'Expected the storage directory to contain files, but it does not.'
def get_log_record(level: int, msg: str, exc_info: logging._SysExcInfoType | None = None) -> logging.LogRecord:
    """Build a minimal `logging.LogRecord` for the formatter tests.

    Args:
        level: Numeric logging level of the record.
        msg: Message of the record; no formatting arguments are attached.
        exc_info: Optional exception triple to attach to the record.

    Returns:
        A record named 'test', pointing at this file with line number 0.
    """
    record_fields = {
        'name': 'test',
        'level': level,
        'pathname': __file__,
        'lineno': 0,
        'msg': msg,
        'args': (),
        'exc_info': exc_info,
    }
    return logging.LogRecord(**record_fields)
class MockContext(BasicCrawlingContext):
    """Crawling context for router tests: a fixed example.com request carrying the given label.

    Every callable dependency of the base context is replaced by its own `AsyncMock`.
    """

    def __init__(self, *, label: str | None) -> None:
        request = Request.from_url(url='https://example.com/', user_data={'label': label})
        session = Session()

        # One independent AsyncMock per helper the base context expects.
        mocked_helpers = {
            helper_name: AsyncMock()
            for helper_name in (
                'send_request',
                'add_requests',
                'proxy_info',
                'push_data',
                'use_state',
                'get_key_value_store',
            )
        }

        super().__init__(request=request, session=session, log=logging.getLogger(), **mocked_helpers)
async def test_router_multi_labelled_handler() -> None:
    """A single handler registered under two labels is invoked for requests with either label."""
    router = Router[MockContext]()
    call_spy = Mock()

    @router.handler('A')
    @router.handler('B')
    async def handler(_context: MockContext) -> None:
        call_spy(_context.request.label)

    # Route one request per label and check the spy saw that label last.
    for label in ('A', 'B'):
        await router(MockContext(label=label))
        call_spy.assert_called_with(label)

    assert call_spy.call_count == 2
def test_configuration_conflict() -> None:
    """Setting a configuration after one has already been retrieved must raise a conflict error."""
    # The retrieval happens first on purpose; the return value itself is not needed.
    service_locator.get_configuration()

    replacement = Configuration(default_browser_path='custom_path')

    with pytest.raises(ServiceConflictError, match=r'Configuration is already in use.'):
        service_locator.set_configuration(replacement)
# Decorator for tests: on macOS it is the `run_alone` pytest mark,
# on every other platform it returns the decorated object untouched.
if sys.platform == 'darwin':
    run_alone_on_mac = pytest.mark.run_alone
else:

    def run_alone_on_mac(x):
        return x
// Preferred order of the API reference groups; anything not listed is sorted after these.
const GROUP_ORDER = [
    'Autoscaling',
    'Browser management',
    'Configuration',
    'Crawlers',
    'Crawling contexts',
    'Errors',
    'Event data',
    'Event managers',
    'Functions',
    'HTTP clients',
    'HTTP parsers',
    'Request loaders',
    'Session management',
    'Statistics',
    'Storage clients',
    'Storage data',
    'Storages',
    'Other',
];

/**
 * Comparator for API reference group names.
 *
 * Groups listed in GROUP_ORDER come first, in the order they are listed there.
 * Groups that are not listed follow, ordered alphabetically among themselves.
 *
 * Comparing a listed group with an unlisted one alphabetically (as was done
 * before) produced cycles such as 'Other' < 'Parsers' < 'Storages' < 'Other',
 * which makes the result of a sort depend on the engine and the input order.
 *
 * @param {string} g1 first group name
 * @param {string} g2 second group name
 * @returns {number} negative when g1 sorts first, positive when g2 sorts first, 0 when equal
 */
const groupSort = (g1, g2) => {
    const rank1 = GROUP_ORDER.indexOf(g1);
    const rank2 = GROUP_ORDER.indexOf(g2);
    const isKnown1 = rank1 !== -1;
    const isKnown2 = rank2 !== -1;

    if (isKnown1 && isKnown2) {
        return rank1 - rank2;
    }

    // Exactly one of the two is listed: the listed one wins.
    if (isKnown1 !== isKnown2) {
        return isKnown1 ? -1 : 1;
    }

    return g1.localeCompare(g2);
};
title: 'Crawlee for Python · Fast, reliable Python web crawlers.', url: 'https://crawlee.dev', baseUrl: '/python/', trailingSlash: false, organizationName: 'apify', projectName: 'crawlee-python', scripts: [ '/python/js/custom.js', '/crawlee-python/js/custom.js', ], githubHost: 'github.com', future: { experimental_faster: true, v4: { removeLegacyPostBuildHeadAttribute: true, useCssCascadeLayers: false, // this breaks styles on homepage and link colors everywhere }, }, headTags: [ // Intercom messenger { tagName: 'script', innerHTML: `window.intercomSettings={api_base:"https://api-iam.intercom.io",app_id:"kod1r788"};`, attributes: {}, }, // Intercom messenger { tagName: 'script', innerHTML: `(function(){var w=window;var ic=w.Intercom;if(typeof ic==="function"){ic('reattach_activator');ic('update',w.intercomSettings);}else{var d=document;var i=function(){i.c(arguments);};i.q=[];i.c=function(args){i.q.push(args);};w.Intercom=i;var l=function(){var s=d.createElement('script');s.type='text/javascript';s.async=true;s.src='https://widget.intercom.io/widget/kod1r788';var x=d.getElementsByTagName('script')[0];x.parentNode.insertBefore(s,x);};if(document.readyState==='complete'){l();}else if(w.attachEvent){w.attachEvent('onload',l);}else{w.addEventListener('load',l,false);}}})()`, attributes: {}, }, ], favicon: 'img/favicon.ico', customFields: { markdownOptions: { html: true, }, gaGtag: true, repoUrl: 'https://github.com/apify/crawlee-python', }, onBrokenLinks: 'throw', markdown: { mermaid: true, hooks: { onBrokenMarkdownLinks: 'throw', }, }, themes: [ '@docusaurus/theme-mermaid', ], presets: /** @type {import('@docusaurus/types').PresetConfig[]} */ ([ [ '@docusaurus/preset-classic', /** @type {import('@docusaurus/preset-classic').Options} */ ({ docs: { showLastUpdateAuthor: true, showLastUpdateTime: true, path: '../docs', sidebarPath: './sidebars.js', rehypePlugins: [externalLinkProcessor], // disableVersioning: true, editUrl: (doc) => { return 
/**
 * Docusaurus plugin that skips SVGO for the animated Crawlee logo.
 *
 * The first entry of the `/\.svg$/i` rule's `oneOf` list is duplicated. The
 * copy is limited to files matching `animated-crawlee-logo` and has
 * `svgo` switched off; the original entry excludes those files and is
 * otherwise left as it was.
 *
 * @returns {Promise<{name: string, configureWebpack: Function}>} the plugin object
 */
async function doNotUseSVGO() {
    return {
        name: 'docusaurus-svgo',
        /**
         * @param {object} config current webpack config; `config.module.rules` is modified in place
         * @returns {object} webpack config fragment that replaces `module.rules`
         * @throws {Error} when the SVG rule or its first `oneOf` entry cannot be found
         */
        configureWebpack(config) {
            // find the svg rule; entries that are null or carry no `test` are skipped instead of throwing
            const svgRule = config.module.rules.find((r) => r?.test?.toString() === '/\\.svg$/i');

            // find the svgr loader
            const svgrLoader = svgRule?.oneOf?.[0];
            if (!svgrLoader) {
                // Fail with a readable message instead of the SyntaxError JSON.parse(undefined) would raise.
                throw new Error('docusaurus-svgo: could not find the SVGR loader (first `oneOf` entry of the SVG rule) in the webpack config.');
            }

            // make copy of svgr loader and disable svgo
            // (JSON round-trip: values that JSON cannot represent are not carried over to the copy)
            const svgrLoaderCopy = JSON.parse(JSON.stringify(svgrLoader));

            // include only animated logo
            svgrLoaderCopy.include = /animated-crawlee-logo/;

            // turn off svgo
            svgrLoaderCopy.use[0].options.svgo = false;

            // insert the copy after the original svgr loader
            svgRule.oneOf.splice(1, 0, svgrLoaderCopy);

            // exclude animated logo from the first svgr loader (with svgo enabled)
            svgrLoader.exclude = /animated-crawlee-logo/;

            return {
                mergeStrategy: {
                    'module.rules': 'replace',
                },
                module: {
                    rules: config.module.rules,
                },
            };
        },
    };
}
def get_module_shortcuts(module: ModuleType, parent_classes: list | None = None) -> dict:
    """Traverse a module and its submodules to identify and register shortcuts for classes.

    A shortcut is recorded for every class of `module` that is also present in
    `parent_classes`; it maps `<module>.<Class>` to `<parent of module>.<Class>`.

    Args:
        module: The module to inspect.
        parent_classes: Classes found in the module this call was reached from;
            `None` (the default) is treated as an empty list.

    Returns:
        Mapping from the longer dotted class path to the shorter one, for
        `module` and every submodule nested under it.
    """
    shortcuts = {}

    if parent_classes is None:
        parent_classes = []

    parent_module_name = '.'.join(module.__name__.split('.')[:-1])
    module_classes = []

    for classname, cls in inspect.getmembers(module, inspect.isclass):
        module_classes.append(cls)
        if cls in parent_classes:
            shortcuts[f'{module.__name__}.{classname}'] = f'{parent_module_name}.{classname}'

    # Descend only into modules nested under the current one. The previous
    # hard-coded 'apify' prefix never matched anything while traversing
    # 'crawlee', so no submodule was visited and no shortcut was produced.
    # Requiring the '<module>.' prefix also guarantees termination, because
    # every recursive call receives a strictly longer module name.
    submodule_prefix = f'{module.__name__}.'
    for _, submodule in inspect.getmembers(module, inspect.ismodule):
        if submodule.__name__.startswith(submodule_prefix):
            shortcuts.update(get_module_shortcuts(submodule, module_classes))

    return shortcuts


def resolve_shortcuts(shortcuts: dict) -> None:
    """Resolve linked shortcuts in place.

    For example, if there are shortcuts A -> B and B -> C,
    resolve them to A -> C.
    """
    # Only values of existing keys are reassigned, so iterating the dict is safe.
    for source in shortcuts:
        target = shortcuts[source]
        while target in shortcuts:
            target = shortcuts[target]
        shortcuts[source] = target
"prettify": "prettier --write --config ./tools/docs-prettier.config.js ../docs/guides/*.md", "swizzle": "docusaurus swizzle", "deploy": "rimraf .docusaurus && node --max_old_space_size=16000 node_modules/@docusaurus/core/bin/docusaurus.mjs deploy", "docusaurus": "docusaurus", "clean": "rimraf .docusaurus build", "lint": "yarn lint:code", "lint:fix": "yarn lint:code:fix", "lint:code": "eslint .", "lint:code:fix": "eslint . --fix" }, "dependencies": { "@apify/docusaurus-plugin-typedoc-api": "^5.1.0", "@apify/utilities": "^2.8.0", "@docusaurus/core": "^3.9.2", "@docusaurus/faster": "^3.9.2", "@docusaurus/mdx-loader": "^3.9.2", "@docusaurus/plugin-client-redirects": "^3.9.2", "@docusaurus/preset-classic": "^3.9.2", "@docusaurus/theme-mermaid": "^3.9.2", "@giscus/react": "^3.0.0", "@mdx-js/react": "^3.0.1", "@mermaid-js/layout-elk": "^0.2.0", "@signalwire/docusaurus-plugin-llms-txt": "^1.2.1", "axios": "^1.5.0", "buffer": "^6.0.3", "clsx": "^2.0.0", "crypto-browserify": "^3.12.0", "docusaurus-gtm-plugin": "^0.0.2", "prism-react-renderer": "^2.1.0", "process": "^0.11.10", "prop-types": "^15.8.1", "raw-loader": "^4.0.2", "react": "^19.0.0", "react-dom": "^19.0.0", "react-github-btn": "^1.4.0", "react-lite-youtube-embed": "^3.0.0", "stream-browserify": "^3.0.0", "unist-util-visit": "^5.0.0" }, "devDependencies": { "@apify/eslint-config-ts": "^0.4.0", "@apify/tsconfig": "^0.1.0", "@apify/ui-icons": "^1.23.0", "@docusaurus/module-type-aliases": "^3.9.2", "@docusaurus/types": "^3.9.2", "@types/react": "^19.0.0", "@typescript-eslint/eslint-plugin": "^8.46.0", "@typescript-eslint/parser": "^8.46.0", "eslint": "^10.0.0", "eslint-plugin-react": "^7.37.5", "eslint-plugin-react-hooks": "^7.0.0", "fs-extra": "^11.1.0", "patch-package": "^8.0.0", "path-browserify": "^1.0.1", "prettier": "^3.0.0", "rimraf": "^6.0.0", "typescript": "^5.9.3" }, "packageManager": "yarn@4.13.0" } ================================================ FILE: website/patches/@docusaurus+core+3.4.0.patch 
================================================ diff --git a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js index 903f8dc..b6b60bf 100644 --- a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js +++ b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js @@ -30,9 +30,11 @@ function scrollAfterNavigation({ location, previousLocation, }) { window.scrollTo(0, 0); } else { - const id = decodeURIComponent(hash.substring(1)); - const element = document.getElementById(id); - element?.scrollIntoView(); + setTimeout(() => { + const id = decodeURIComponent(hash.substring(1)); + const element = document.getElementById(id); + element?.scrollIntoView(); + }, 100); } } function ClientLifecyclesDispatcher({ children, location, previousLocation, }) { ================================================ FILE: website/patches/@docusaurus+core+3.5.2.patch ================================================ diff --git a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js index 903f8dc..b6b60bf 100644 --- a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js +++ b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js @@ -30,9 +30,11 @@ function scrollAfterNavigation({ location, previousLocation, }) { window.scrollTo(0, 0); } else { - const id = decodeURIComponent(hash.substring(1)); - const element = document.getElementById(id); - element?.scrollIntoView(); + setTimeout(() => { + const id = decodeURIComponent(hash.substring(1)); + const element = document.getElementById(id); + element?.scrollIntoView(); + }, 100); } } function ClientLifecyclesDispatcher({ children, location, previousLocation, }) { ================================================ FILE: website/roa-loader/index.js ================================================ const { 
createHash } = require('node:crypto'); const { inspect } = require('node:util'); const { urlToRequest } = require('loader-utils'); const signingUrl = new URL('https://api.apify.com/v2/tools/encode-and-sign'); signingUrl.searchParams.set('token', process.env.APIFY_SIGNING_TOKEN); const queue = []; const cache = {}; let working = false; function hash(source) { return createHash('sha1').update(source).digest('hex'); } async function getHash(source) { const cacheKey = hash(source); if (cache[cacheKey]) { return cache[cacheKey]; } const memory = source.match(/playwright|puppeteer/i) ? 4096 : 1024; const res = await (await fetch(signingUrl, { method: 'POST', body: JSON.stringify({ input: JSON.stringify({ code: source }), options: { build: 'latest', contentType: 'application/json; charset=utf-8', memory, timeout: 180, }, }), headers: { 'Content-Type': 'application/json; charset=utf-8', }, })); if (!res.ok) { console.error(`Signing failed: ${res.status} ${res.statusText}`, await res.text()); return 'invalid-token'; } const body = await res.json(); if (!body.data || !body.data.encoded) { console.error(`Signing failed:' ${inspect(body.error) || 'Unknown error'}`, body); return 'invalid-token'; } cache[cacheKey] = body.data.encoded; await new Promise((resolve) => setTimeout(resolve, 100)); return body.data.encoded; } async function encodeAndSign(source) { if (!process.env.APIFY_SIGNING_TOKEN) { return 'invalid-token'; } if (working) { return new Promise((resolve, reject) => { queue.push(() => { return getHash(source).then(resolve, reject); }); }); } let res; try { working = true; res = await getHash(source); while (queue.length) { await queue.shift()(); } } finally { working = false; } return res; } module.exports = async function (code) { if (process.env.CRAWLEE_DOCS_FAST) { return { code, hash: 'fast' }; } console.log(`Signing ${urlToRequest(this.resourcePath)}...`, { working, queue: queue.length }); const codeHash = await encodeAndSign(code); return { code, hash: codeHash 
}; }; ================================================ FILE: website/roa-loader/package.json ================================================ { "name": "roa-loader", "version": "1.0.0", "description": "", "main": "index.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" }, "keywords": [], "author": "", "license": "ISC", "dependencies": { "loader-utils": "^3.2.1" } } ================================================ FILE: website/sidebars.js ================================================ module.exports = { docs: [ 'quick-start/quick-start', { type: 'category', label: 'Introduction', collapsed: false, link: { type: 'doc', id: 'introduction/introduction', }, items: [ 'introduction/setting-up', 'introduction/first-crawler', 'introduction/adding-more-urls', 'introduction/real-world-project', 'introduction/crawling', 'introduction/scraping', 'introduction/saving-data', 'introduction/refactoring', 'introduction/deployment', ], }, { type: 'category', label: 'Guides', collapsed: true, link: { type: 'generated-index', title: 'Guides', slug: '/guides', keywords: ['guides'], }, items: [ { type: 'autogenerated', dirName: 'guides', }, ], }, { type: 'category', label: 'Deployment', collapsed: true, link: { type: 'generated-index', title: 'Deployment guides', description: 'Here you can find guides on how to deploy your crawlers to various cloud providers.', slug: '/deployment', }, items: [ { type: 'doc', id: 'deployment/apify-platform', label: 'Deploy on Apify', }, { type: 'doc', id: 'deployment/aws-lambda', label: 'Deploy on AWS Lambda' }, { type: 'category', label: 'Deploy to Google Cloud', items: [ 'deployment/gcp-cloud-run-functions', 'deployment/gcp-cloud-run', ], }, ], }, { type: 'category', label: 'Examples', collapsed: true, link: { type: 'generated-index', title: 'Examples', slug: '/examples', keywords: ['examples'], }, items: [ { type: 'autogenerated', dirName: 'examples', }, ], }, // { // type: 'category', // label: 'Experiments', // link: { // 
type: 'generated-index', // title: 'Experiments', // slug: '/experiments', // keywords: ['experiments', 'experimental-features'], // }, // items: [ // { // type: 'autogenerated', // dirName: 'experiments', // }, // ], // }, { type: 'category', label: 'Upgrading', collapsed: true, link: { type: 'generated-index', title: 'Upgrading', slug: '/upgrading', keywords: ['upgrading'], }, items: [ { type: 'autogenerated', dirName: 'upgrading', }, ], }, { type: 'doc', label: 'Changelog', id: 'changelog', }, ], }; ================================================ FILE: website/src/components/ApiLink.jsx ================================================ import React from 'react'; import Link from '@docusaurus/Link'; // eslint-disable-next-line import/no-extraneous-dependencies import { useDocsVersion } from '@docusaurus/theme-common/internal'; import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; // const pkg = require('../../../packages/crawlee/package.json'); // // const [v1, v2] = pkg.version.split('.'); // const stable = [v1, v2].join('.'); const ApiLink = ({ to, children }) => { return ( {children} ); // const version = useDocsVersion(); // const { siteConfig } = useDocusaurusContext(); // // // if (siteConfig.presets[0][1].docs.disableVersioning || version.version === stable) { // if (siteConfig.presets[0][1].docs.disableVersioning) { // return ( // {children} // ); // } // // return ( // {children} // ); }; export default ApiLink; ================================================ FILE: website/src/components/Button.jsx ================================================ import Link from '@docusaurus/Link'; import clsx from 'clsx'; import React from 'react'; import styles from './Button.module.css'; import CrawleeSvg from '../../static/img/crawlee-logo-monocolor.svg'; export default function Button({ children, to, withIcon, type = 'primary', className, isBig }) { return ( {withIcon && } {children} ); } ================================================ FILE: 
website/src/components/Button.module.css ================================================ .button { display: inline-flex; align-items: center; text-align: center; padding: 8px 16px; border-radius: 8px; /* var() is required to read a custom property; a bare (--name) is an invalid value and the declaration is dropped */ font-family: var(--ifm-font-family-base); font-size: 16px; font-style: normal; font-weight: 500; line-height: 24px; cursor: pointer; transition: background-color 0.2s; svg { margin-right: 8px; } } .buttonPrimary { background-color: var(--color-black-action); color: var(--color-text-on-primary); border: none; path { stroke: var(--color-text-on-primary); &:first-child { fill: var(--color-text-on-primary); } } } .buttonPrimary:hover { background-color: var(--color-primary-action-hover); } .buttonSecondary { background-color: var(--color-background); color: var(--color-text); border: 1px solid var(--color-border); path { stroke: var(--color-black-action); &:first-child { fill: var(--color-black-action); } } } .buttonSecondary:hover { border: 1px solid var(--color-text); } .big { padding: 12px 24px; } /* TABLET */ @media (min-width: 768px) { .button { width: auto; } } ================================================ FILE: website/src/components/CopyButton.jsx ================================================ /* eslint-disable max-len */ import clsx from 'clsx'; import React, { useState } from 'react'; import styles from './CopyButton.module.css'; export default function CopyButton({ copyText, compact = false, className }) { const [copied, setCopied] = useState(false); const copy = async () => { await navigator.clipboard.writeText(copyText); setCopied(true); setTimeout(() => setCopied(false), 2000); }; return ; } ================================================ FILE: website/src/components/CopyButton.module.css ================================================ .copyButton { all: unset; display: inline-flex; align-items: center; justify-content: center; box-sizing: border-box; cursor: pointer; fill: var(--color-icon); svg { flex-shrink: 0; } } .copyButtonDefault { width: 28px;
height: 28px; background-color: var(--color-background-muted); border: 1px solid var(--color-border); border-radius: 6px; transition: background-color 0.12s ease-out; &:hover { background-color: var(--color-hover); } svg { padding: 1px; } } .copyButtonCompact { svg { width: 16px; height: 16px; } } ================================================ FILE: website/src/components/Gradients.jsx ================================================ import React from 'react'; export default function Gradients() { return ( ); } ================================================ FILE: website/src/components/Highlights.jsx ================================================ import React from 'react'; import clsx from 'clsx'; import styles from './Highlights.module.css'; import Gradients from './Gradients'; const FeatureList = [ { title: 'Python with type hints', Svg: require('../../static/img/features/runs-on-py.svg').default, description: ( <> Crawlee for Python is written in a modern way using type hints, providing code completion in your IDE and helping you catch bugs early on build time. ), }, // { // title: 'HTTP scraping', // Svg: require('../../static/img/features/fingerprints.svg').default, // description: ( // <> // Crawlee makes HTTP requests that mimic browser headers and TLS fingerprints. // It also rotates them automatically based on data about real-world traffic. Popular HTML // parsers Cheerio  // and JSDOM are included. // // ), // }, { title: 'Headless browsers', Svg: require('../../static/img/features/works-everywhere.svg').default, description: ( <> Switch your crawlers from HTTP to a headless browser in 3 lines of code. Crawlee builds on top of Playwright and adds its own features. Chrome, Firefox and more. ), // TODO: this is not true yet // Crawlee builds on top of Playwright and adds its own anti-blocking features and human-like fingerprints. Chrome, Firefox and more. 
}, { title: 'Automatic scaling and proxy management', Svg: require('../../static/img/features/auto-scaling.svg').default, description: ( <> Crawlee automatically manages concurrency based on available system resources and  smartly rotates proxies. Proxies that often time-out, return network errors or bad HTTP codes like 401 or 403 are discarded. ), }, // { // title: 'Queue and Storage', // Svg: require('../../static/img/features/storage.svg').default, // description: ( // <> // You can save files, screenshots and JSON results to disk with one line of code // or plug an adapter for your DB. Your URLs are kept in a queue that ensures their // uniqueness and that you don't lose progress when something fails. // // ), // }, // { // title: 'Helpful utils and configurability', // Svg: require('../../static/img/features/node-requests.svg').default, // description: ( // <> // Crawlee includes tools for extracting social handles or phone numbers, infinite scrolling, blocking // unwanted assets and many more. It works great out of the box, but also provides  // rich configuration options. // // ), // }, ]; function Feature({ Svg, title, description }) { return (
    {Svg ? : null}

    {title}

    {description}

    ); } export default function Highlights() { return (
    {FeatureList.map((props, idx) => ( ))}
    ); } ================================================ FILE: website/src/components/Highlights.module.css ================================================ .features { display: flex; align-items: center; width: 100%; font-size: 18px; line-height: 32px; color: #41465d; } html[data-theme="dark"] .features { color: #b3b8d2; } .feature svg { height: 60px; width: 60px; } .features svg path:nth-child(1) { fill: url(#gradient-1) !important; } .features svg path:nth-child(n + 1) { fill: url(#gradient-2) !important; } html[data-theme="dark"] .featureIcon { background: #272c3d; } .featureIcon { display: flex; justify-content: center; align-items: center; margin-bottom: 24px; border-radius: 8px; background-color: #f2f3fb; width: 48px; height: 48px; } .features h3 { font-weight: 700; font-size: 18px; line-height: 32px; } ================================================ FILE: website/src/components/Homepage/HomepageCliExample.jsx ================================================ import React from 'react'; import CopyButton from '../CopyButton'; import styles from './HomepageCliExample.module.css'; const cliCommand = `uvx 'crawlee[cli]' create my-crawler`; export default function CliExample() { return (
    Or start with a template from our CLI
                        $
                        {cliCommand}
                        
                    
    Built with 🤍 by Apify. Forever free and open-source.
    ); } ================================================ FILE: website/src/components/Homepage/HomepageCliExample.module.css ================================================ .cliExampleSection { display: flex; flex-direction: column; justify-content: center; align-items: center; text-align: center; padding: 16px; } .cliExampleTitle { color: var(--color-text-muted); font-size: 18px; font-style: normal; font-weight: 400; line-height: 28px; margin-bottom: 16px; } .cliExampleCodeBlock { width: fit-content; height: fit-content; padding: 0; border: 0; margin-bottom: 18px; width: 100%; pre { margin: 0; width: 100%; padding: 8px 16px; background-color: var(--color-background-muted); border: 1px solid var(--color-border); display: flex; align-items: center; gap: 16px; font-size: 14px; line-height: 20px; button { margin-left: auto; } } .cliCommandPrefix { color: var(--color-text-muted); user-select: none; } /* TABLET */ @media (min-width: 768px) { max-width: 526px; } } .cliExampleSubtitle { color: var(--color-text-subtle); font-size: 16px; font-style: normal; font-weight: 400; line-height: 24px; } /* TABLET */ @media (min-width: 768px) { .cliExampleSection { padding: 64px 0; } } ================================================ FILE: website/src/components/Homepage/HomepageCtaSection.jsx ================================================ import { useColorMode } from '@docusaurus/theme-common'; import React from 'react'; import AnimatedLogoDark from './animated-crawlee-logo-dark.svg'; import AnimatedLogoLight from './animated-crawlee-logo-light.svg'; import styles from './HomepageCtaSection.module.css'; import homepageStyles from '../../pages/index.module.css'; import Button from '../Button'; export default function HomepageCtaSection() { const { colorMode } = useColorMode(); return (

    Get started now!

    Crawlee won’t fix broken selectors for you (yet), but it makes building and maintaining reliable crawlers faster and easier—so you can focus on what matters most.
    {colorMode === 'dark' ? ( ) : ( )}
    ); } ================================================ FILE: website/src/components/Homepage/HomepageCtaSection.module.css ================================================ .ctaSection { position: relative; display: flex; flex-direction: column; justify-content: center; align-items: center; text-align: center; padding: 16px; padding-bottom: 0; gap: 24px; overflow: clip; } .ctaTitle { color: var(--color-text); font-family: 'Lota Grotesque'; font-size: 36px; font-style: normal; font-weight: 400; line-height: 46px; margin: 0; } .ctaDescription { color: var(--color-text-muted); font-size: 18px; font-style: normal; font-weight: 400; line-height: 28px; max-width: 780px; } .ctaButtonContainer { display: flex; flex-direction: column; justify-content: center; align-items: center; text-align: center; gap: 16px; width: 100%; } .ctaImage { z-index: -1; margin-top: -90px; margin-bottom: -30px; min-height: 400px; } #ctaFadedOutSeparator { position: absolute; top: 370px; width: 100%; z-index: -2; } #fadedOutSeparatorVerticalLeft { position: absolute; left: 190px; bottom: 0; height: 100%; z-index: -2; } #fadedOutSeparatorVerticalRight { position: absolute; right: 190px; bottom: 0; height: 100%; z-index: -2; } #ctaDashedCircleRight { position: absolute; right: -120px; top: 370px; z-index: -2; } /* TABLET */ @media (min-width: 768px) { .ctaSection { padding-top: 80px; } .ctaTitle { font-size: 48px; line-height: 56px; } .ctaButtonContainer { flex-direction: row; } } ================================================ FILE: website/src/components/Homepage/HomepageHeroSection.jsx ================================================ import React from 'react'; import styles from './HomepageHeroSection.module.css'; import homepageStyles from '../../pages/index.module.css'; export default function HomepageHeroSection() { return (

    Build reliable web scrapers. Fast.

    Crawlee is a web scraping library for JavaScript and Python. It handles blocking, crawling, proxies, and browsers for you.

    ); } ================================================ FILE: website/src/components/Homepage/HomepageHeroSection.module.css ================================================ .hero { display: flex; flex-direction: column; align-items: center; justify-content: center; padding: 32px 0; h1 { padding-inline: 12px; } } .heroTitle { color: var(--color-text); font-size: 52px; line-height: 60px; font-weight: 400; text-align: center; margin: 0 0 16px 0; } .heroSubtitle { color: var(--color-text-muted); font-size: 18px; line-height: 28px; font-weight: 400; text-align: center; margin: 0 16px; max-width: 792px; } #separatorHeroHeader { display: none; } #separatorHeroHeader2 { display: none; } #heroDecorativeCircle { width: 60px; height: 60px; right: -60px; top: 0px; } /* TABLET */ @media (min-width: 768px) { .hero { padding: 64px 0 0 0; h1 { padding-inline: 24px; } } .heroTitle { font-size: 54px; line-height: 64px; margin: 0 16px 24px 16px; } .heroSubtitle { margin: 0 16px 30px 16px; } #separatorHeroHeader { display: none; } #separatorHeroHeader2 { display: block; } } /* DESKTOP */ @media (min-width: 1024px) { .hero { padding: 120px 0 0 0; } .heroSubtitle { margin: 30px 16px; } #separatorHeroHeader { display: block; } } ================================================ FILE: website/src/components/Homepage/LanguageInfoWidget.jsx ================================================ import { useColorMode } from '@docusaurus/theme-common'; import ThemedImage from '@theme/ThemedImage'; import clsx from 'clsx'; import React from 'react'; import GitHubButton from 'react-github-btn'; import Button from '../Button'; import CopyButton from '../CopyButton'; import styles from './LanguageInfoWidget.module.css'; export default function LanguageInfoWidget({ language, command, to, githubUrl, }) { const { colorMode } = useColorMode(); return (
    {language === 'JavaScript' && ( )} {language === 'Python' && ( )}
    Star
    {command && ( {command} )}
    ); } ================================================ FILE: website/src/components/Homepage/LanguageInfoWidget.module.css ================================================ .languageGetStartedContainer { margin: 0; display: flex; flex-direction: column; align-items: center; padding-inline: 12px; } .languageGetStartedContainer img { height: 40px; margin-bottom: 16px; } .buttonContainer { display: flex; flex-direction: column; align-items: center; gap: 16px; & > span { line-height: 0; min-height: 28px; } a, a span { min-width: 190px; text-align: center; justify-content: center; } } .buttonContainer:has(+ code) { margin-bottom: 16px; gap: 12px; } .commandContainer { margin: 0; padding: 0; color: var(--color-text); font-size: 12px; font-style: normal; font-weight: 400; line-height: 16px; background-color: transparent; border: 0; display: flex; align-items: center; } .commandContainer button { opacity: 0; transition: opacity var(--ifm-transition-fast) ease-in; } .commandContainer:hover button, .commandContainer button:hover { opacity: 1; } /* TABLET */ @media (min-width: 768px) { .languageGetStartedContainer { margin: 24px 0 40px 0; } .buttonContainer:has(+ code) { flex-direction: row; } .buttonContainer:has(+ code) { a, a span { min-width: 0; } } } ================================================ FILE: website/src/components/Homepage/LanguageSwitch.jsx ================================================ import React, { useCallback, useEffect, useRef, useState } from 'react'; import styles from './LanguageSwitch.module.css'; import clsx from 'clsx'; export default function LanguageSwitch({ options = ['JavaScript', 'Python'], defaultOption = 'JavaScript', onChange, }) { const [activeOption, setActiveOption] = useState(defaultOption) const [backgroundStyle, setBackgroundStyle] = useState({}) const optionRefs = useRef < (HTMLButtonElement | null)[] > ([]) const updateBackgroundStyle = useCallback(() => { const activeIndex = options.indexOf(activeOption) const activeElement 
= optionRefs.current[activeIndex] if (activeElement) { const { offsetLeft, offsetWidth } = activeElement setBackgroundStyle({ transform: `translateX(${offsetLeft}px)`, width: `${offsetWidth}px`, }) } }, [activeOption, options]) useEffect(() => { updateBackgroundStyle() }, [updateBackgroundStyle]) const handleOptionClick = (option) => { setActiveOption(option) onChange?.(option) } return (
    {options.map((option, index) => ( ))}
    ) } ================================================ FILE: website/src/components/Homepage/LanguageSwitch.module.css ================================================ .languageSwitch { z-index: 1; display: inline-flex; position: relative; background-color: var(--color-background-subtle); border-radius: 6px; padding: 4px; } .switchOption { position: relative; z-index: 1; padding: 6px 16px; font-size: 14px; font-weight: 500; color: var(--color-text-muted); background: none; border: none; cursor: pointer; transition: color 0.3s ease; } .switchOption:hover { color: var(--color-text); } .switchOption.active { color: var(--color-text); } .switchBackground { position: absolute; top: 4px; bottom: 4px; left: 0; border-radius: 6px; background-color: var(--color-background); transition: transform 0.3s ease, width 0.3s ease; } ================================================ FILE: website/src/components/Homepage/RiverSection.jsx ================================================ import Link from '@docusaurus/Link'; import clsx from 'clsx'; import React from 'react'; import styles from './RiverSection.module.css'; export default function RiverSection({ title, description, content, reversed, to }) { return (

    {title}

    {description}

    Learn more
    {content}
    ); } ================================================ FILE: website/src/components/Homepage/RiverSection.module.css ================================================ /* Base styles */ .riverWrapper { width: 100%; border-top: 1px solid var(--color-separator); border-bottom: 1px solid var(--color-separator); } .riverContainer { max-width: 1200px; margin: 0 auto; display: flex; flex-direction: column; /* Tablet layout */ @media (min-width: 768px) { flex-direction: row; &.riverReversed { flex-direction: row-reverse; } } } .riverSection { width: 100%; /* Tablet layout */ @media (min-width: 768px) { min-width: 0; flex-basis: 50%; flex-grow: 0; } } .riverText { padding: 24px 16px; /* Tablet layout */ @media (min-width: 768px) { padding: 40px 32px; } /* Desktop layout */ @media (min-width: 1024px) { padding: 48px 80px; } } /* Text styles */ .riverTitle { flex: 1; margin-top: 0; margin-bottom: 12px; font-size: 32px; font-weight: 400; line-height: 40px; /* Desktop layout */ @media (min-width: 1024px) { max-width: 440px; } } .riverDescription { margin-bottom: 24px; color: var(--color-text-muted); font-size: 16px; line-height: 24px; /* Desktop layout */ @media (min-width: 1024px) { max-width: 440px; } } .riverButton { cursor: pointer; padding: 8px 12px; background-color: transparent; border: 1px solid var(--color-border); border-radius: 12px; display: flex; align-items: center; justify-content: center; font-size: 16px; line-height: 24px; transition: background-color 0.12s ease-out; width: fit-content; color: var(--color-text); &:hover { background-color: var(--color-hover); color: var(--color-text); } path { stroke: var(--color-icon); } } .riverButton::after { content: '→'; margin-inline: 4px; transition: margin 0.3s ease; } .riverButton:hover { color: var(--color-text); &::after { margin: 0 0 0 8px; } } .riverContent { min-height: 180px; background-color: var(--color-background-muted); border-top: 1px solid var(--color-separator); display: flex; flex-direction: column; 
overflow: hidden; img { max-height: 284px; object-fit: cover; height: 100%; width: 100%; margin-block: auto; } :global(.code-block) { flex-grow: 1; margin-bottom: 0; border-radius: 0; box-shadow: none; :global(div[class*="codeBlockContent"]) { height: 100%; pre { height: 100%; display: flex; align-items: center; background: var(--color-background-muted) !important; } code { height: auto; font-size: 14px; background: var(--color-background-muted); min-width: initial; padding: 16px 8px 16px 4px; span::before { margin-right: 16px; left: unset !important; color: var(--color-text-subtle); opacity: 1; } } } } /* Tablet layout */ @media (min-width: 768px) { border-top: none; border-left: 1px solid var(--color-separator); } .riverReversed & { /* Tablet layout */ @media (min-width: 768px) { border-left: none; border-right: 1px solid var(--color-separator); } } } ================================================ FILE: website/src/components/Homepage/ThreeCardsWithIcon.jsx ================================================ import Link from '@docusaurus/Link'; import clsx from 'clsx'; import React from 'react'; import styles from './ThreeCardsWithIcon.module.css'; export default function ThreeCardsWithIcon({ cards }) { return (
    {cards?.map((card, index) => { const content = ( <>
    {card.icon}

    {card.title}

    {card.description}

    {card.actionLink && ( {card.actionLink.text} )} ); if (card.to) { return ( {content} ); } return (
    {content}
    ); })}
    ); } ================================================ FILE: website/src/components/Homepage/ThreeCardsWithIcon.module.css ================================================ .cardsWrapper { display: flex; flex-direction: column; border-block: 1px solid var(--color-separator); @media (min-width: 768px) { flex-direction: row; } } /* Card styles */ .cardItem { display: flex; flex: 1; flex-direction: column; padding: 40px 24px; background: var(--color-card-background); transition: background 0.1s ease; border-bottom: 1px solid var(--color-separator); &:last-child { border-bottom: 0; } @media (min-width: 768px) { border-bottom: 0; border-right: 1px solid var(--color-separator); &:last-child { border-right: 0; } } } a.cardItem:hover { background: var(--color-card-background-hover); } .cardItem:has(:local(.cardAction)) { padding: 24px; } .cardIcon { margin-bottom: 16px; display: flex; align-items: center; justify-content: center; width: 72px; height: 72px; border-radius: 6px; border: 1px solid var(--color-separator); background: var(--color-background); } .cardIcon img { width: 50px; } .cardTitle { margin: 0; margin-bottom: 8px; color: var(--color-text); font-size: 26px; font-style: normal; font-weight: 400; line-height: 34px; } .cardDescription { color: var(--color-text-muted); font-family: var(--ifm-font-family-base); font-size: 16px; font-style: normal; font-weight: 400; line-height: 24px; margin: 0; margin-bottom: 12px; } .cardAction { color: var(--color-text-muted); font-family: var(--ifm-font-family-base); font-size: 16px; font-style: normal; font-weight: 650; line-height: 24px; width: fit-content; margin-top: auto; } .cardAction::after { content: "→"; margin-left: 4px; transition: margin 0.3s ease; } .cardAction:hover { color: var(--color-text); &::after { margin-left: 8px; } } ================================================ FILE: website/src/components/LLMButtons.jsx ================================================ import { AnthropicIcon, ChatGptIcon, CheckIcon, 
ChevronDownIcon, CopyIcon, ExternalLinkIcon, LoaderIcon, MarkdownIcon, PerplexityIcon, } from '@apify/ui-icons'; import { useLocation } from '@docusaurus/router'; import clsx from 'clsx'; import React, { useCallback, useEffect, useMemo, useRef, useState, } from 'react'; import styles from './LLMButtons.module.css'; const DROPDOWN_OPTIONS = [ { label: 'Copy for LLM', description: 'Copy page as Markdown for LLMs', showExternalIcon: false, icon: CopyIcon, value: 'copyForLLM', analytics: { buttonText: 'Copy for LLM', element: 'llm-buttons.copyForLLM', }, }, { label: 'View as Markdown', description: 'View this page as plain text', icon: MarkdownIcon, value: 'viewAsMarkdown', showExternalIcon: true, analytics: { buttonText: 'View as Markdown', element: 'llm-buttons.viewAsMarkdown', }, }, { label: 'Open in ChatGPT', description: 'Ask questions about this page', icon: ChatGptIcon, value: 'openInChatGPT', showExternalIcon: true, analytics: { buttonText: 'Open in ChatGPT', element: 'llm-buttons.openInChatGPT', }, }, { label: 'Open in Claude', description: 'Ask questions about this page', icon: AnthropicIcon, value: 'openInClaude', showExternalIcon: true, analytics: { buttonText: 'Open in Claude', element: 'llm-buttons.openInClaude', }, }, { label: 'Open in Perplexity', description: 'Ask questions about this page', icon: PerplexityIcon, value: 'openInPerplexity', showExternalIcon: true, analytics: { buttonText: 'Open in Perplexity', element: 'llm-buttons.openInPerplexity', }, }, ]; const CHAT_GPT_BASE = 'https://chatgpt.com/?hints=search&q='; const CLAUDE_BASE = 'https://claude.ai/new?q='; const PERPLEXITY_BASE = 'https://www.perplexity.ai/search/new?q='; const getPrompt = (currentUrl) => `Read from ${currentUrl} so I can ask questions about it.`; const getMarkdownUrl = (currentUrl) => { const url = new URL(currentUrl); url.pathname = `${url.pathname.replace(/\/$/, '')}.md`; return url.toString(); }; const trackClick = (buttonText, element) => { if (typeof window !== 
'undefined' && window.analytics) { window.analytics.track('Clicked', { app: 'crawlee', button_text: buttonText, element, }); } }; const getOptionHref = (value, currentUrl) => { if (!currentUrl) { return undefined; } switch (value) { case 'viewAsMarkdown': return getMarkdownUrl(currentUrl); case 'openInChatGPT': return `${CHAT_GPT_BASE}${encodeURIComponent(getPrompt(currentUrl))}`; case 'openInClaude': return `${CLAUDE_BASE}${encodeURIComponent(getPrompt(currentUrl))}`; case 'openInPerplexity': return `${PERPLEXITY_BASE}${encodeURIComponent(getPrompt(currentUrl))}`; default: return undefined; } }; const Menu = ({ className, components = {}, onMenuOpen, onSelect, options = [], }) => { const [isOpen, setIsOpen] = useState(false); const [focusedIndex, setFocusedIndex] = useState(0); const menuRef = useRef(null); const menuItemRefs = useRef([]); const MenuBaseComponent = components.MenuBase; const closeMenu = useCallback(() => { setIsOpen(false); setFocusedIndex(0); }, []); const toggleMenu = useCallback(() => { setIsOpen((prev) => { if (!prev) { setFocusedIndex(0); } return !prev; }); }, []); const handleKeyDown = useCallback( (event) => { if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); toggleMenu(); } else if (event.key === 'ArrowDown') { event.preventDefault(); if (!isOpen) { toggleMenu(); } else { setFocusedIndex((prev) => (prev + 1) % options.length); } } else if (event.key === 'ArrowUp') { event.preventDefault(); if (isOpen) { setFocusedIndex((prev) => (prev - 1 + options.length) % options.length); } } }, [toggleMenu, isOpen, options.length], ); const handleOptionSelect = useCallback( (option, event) => { onSelect?.(option, event); closeMenu(); }, [closeMenu, onSelect], ); const handleMenuItemKeyDown = useCallback( (event, option, index) => { if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); event.currentTarget.click(); return; } if (event.key === 'ArrowDown') { event.preventDefault(); setFocusedIndex((index + 1) % 
options.length); return; } if (event.key === 'ArrowUp') { event.preventDefault(); setFocusedIndex((index - 1 + options.length) % options.length); return; } if (event.key === 'Escape') { event.preventDefault(); closeMenu(); } }, [options.length, closeMenu], ); useEffect(() => { onMenuOpen?.(isOpen); }, [isOpen, onMenuOpen]); useEffect(() => { if (isOpen && menuItemRefs.current[focusedIndex]) { menuItemRefs.current[focusedIndex].focus(); } }, [isOpen, focusedIndex]); useEffect(() => { if (!isOpen) { return undefined; } const handleClickOutside = (event) => { if (!menuRef.current?.contains(event.target)) { closeMenu(); } }; const handleEscape = (event) => { if (event.key === 'Escape') { closeMenu(); } }; document.addEventListener('mousedown', handleClickOutside); document.addEventListener('keydown', handleEscape); return () => { document.removeEventListener('mousedown', handleClickOutside); document.removeEventListener('keydown', handleEscape); }; }, [closeMenu, isOpen]); return (
    {isOpen && ( )}
    ); }; function getButtonText({ status }) { switch (status) { case 'loading': return 'Copying...'; case 'copied': return 'Copied'; default: return 'Copy for LLM'; } } const onCopyAsMarkdownClick = async ({ setCopyingStatus, currentUrl }) => { const sourceUrl = currentUrl || (typeof window !== 'undefined' ? window.location.href : ''); if (!sourceUrl) { return; } trackClick('Copy for LLM', 'llm-buttons.copyForLLM'); const markdownUrl = getMarkdownUrl(sourceUrl); try { setCopyingStatus('loading'); // Safari requires clipboard writes to be created synchronously inside the user gesture. // We therefore pass a Promise that resolves to a Blob into ClipboardItem instead of // awaiting fetch first — otherwise Safari would reject the clipboard operation. const markdownContent = new ClipboardItem({ 'text/plain': fetch(markdownUrl) .then((response) => { if (!response.ok) { throw new Error(`Failed to fetch markdown: ${response.status}`); } return response.text(); }) .then((content) => new Blob([content], { type: 'text/plain' })), }); await navigator.clipboard.write([markdownContent]); // Show success feedback setCopyingStatus('copied'); } catch (error) { console.error('Failed to copy markdown content:', error); } finally { setTimeout(() => setCopyingStatus('idle'), 2000); } }; const COPYING_STATUS_ICON = { loading: , copied: , idle: , } const MenuBase = React.forwardRef(({ copyingStatus, setCopyingStatus, chevronIconRef, currentUrl, ...buttonProps }, ref) => { const mergedButtonProps = { ...buttonProps, tabIndex: buttonProps.tabIndex ?? 0, }; return (
    { event.stopPropagation(); onCopyAsMarkdownClick({ setCopyingStatus, currentUrl }); }} > {COPYING_STATUS_ICON[copyingStatus]}
    { event.stopPropagation(); onCopyAsMarkdownClick({ setCopyingStatus, currentUrl }); }} className={styles.llmButtonText} > {getButtonText({ status: copyingStatus })}
    ); }); MenuBase.displayName = 'MenuBase';
/**
 * Content of a single dropdown entry, showing its `label` and `description`.
 * Uses `icon` when given and falls back to `CopyIcon` otherwise; an extra
 * element is rendered only when `showExternalIcon` is truthy (presumably an
 * external-link indicator — the markup is not visible here).
 */
const Option = ({ label, description, showExternalIcon, icon }) => { const Icon = icon ?? CopyIcon; return (
    {label} {description}
    {showExternalIcon && ( )}
    ); }; /** LLM actions widget: keeps the copy status, checks whether a markdown version of the current page exists, and builds the dropdown options from DROPDOWN_OPTIONS. */ export default function LLMButtons() { const [copyingStatus, setCopyingStatus] = useState('idle'); const [isMarkdownAvailable, setIsMarkdownAvailable] = useState(false); const chevronIconRef = useRef(null); const location = useLocation(); /* Empty string when `window` is undefined; otherwise origin + router pathname, search and hash. */ const currentUrl = typeof window !== 'undefined' ? `${window.location.origin}${location.pathname}${location.search}${location.hash}` : ''; /* Probe the markdown URL with a HEAD request whenever currentUrl changes; the request is aborted on cleanup and abort errors are ignored. */ useEffect(() => { if (!currentUrl) { // TODO: Feel free to tell me how to fix this 🤦‍♂️ // eslint-disable-next-line react-hooks/set-state-in-effect setIsMarkdownAvailable(false); return undefined; } const controller = new AbortController(); const markdownUrl = getMarkdownUrl(currentUrl); const checkMarkdownAvailability = async () => { try { const response = await fetch(markdownUrl, { method: 'HEAD', signal: controller.signal, }); setIsMarkdownAvailable(response.ok); } catch (error) { if (error.name === 'AbortError') { return; } setIsMarkdownAvailable(false); } }; checkMarkdownAvailability(); return () => { controller.abort(); }; }, [currentUrl]); const menuOptions = useMemo( () => DROPDOWN_OPTIONS.map((option) => { const href = getOptionHref(option.value, currentUrl); if (option.value === 'viewAsMarkdown') { if (!isMarkdownAvailable) { return null; } } return { ...option, href, target: href ? '_blank' : undefined, rel: href ?
'noopener noreferrer' : undefined, }; }).filter(Boolean), [isMarkdownAvailable, currentUrl], ); const onMenuOptionClick = useCallback( (option, event) => { if (!option) { return; } if (option.analytics) { trackClick(option.analytics.buttonText, option.analytics.element); } if (option.value === 'copyForLLM') { event?.preventDefault(); onCopyAsMarkdownClick({ setCopyingStatus, currentUrl }); } }, [setCopyingStatus, currentUrl], ); return ( chevronIconRef.current?.classList.toggle( styles.chevronIconOpen, isOpen, )} components={{ MenuBase: (props) => ( ), }} onSelect={onMenuOptionClick} options={menuOptions} /> ); } ================================================ FILE: website/src/components/LLMButtons.module.css ================================================ .llmMenu { display: flex; justify-content: flex-end; flex: 0 0 auto; } @media (max-width: 996px) { .llmMenu { width: 100%; justify-content: flex-start; } } .llmButtonWrapper { display: flex; justify-content: flex-end; width: auto; } .llmButton { display: flex; align-items: center; border-radius: 0.5rem; border: 1px solid var(--color-separator); background-color: var(--color-background-subtle); cursor: pointer; transition: background-color 0.2s ease-in-out, border-color 0.2s ease-in-out; } .copyUpIconWrapper { display: flex; align-items: center; justify-content: center; padding: 0.6rem 0.5rem 0.6rem 0.8rem; } .llmButtonText { display: flex; align-items: center; padding-right: 0.8rem; border-right: 1px solid var(--color-separator); margin: 0; font: 400 0.875rem/1.4 Inter, sans-serif; } .chevronIconWrapper { display: flex; align-items: center; justify-content: center; padding-inline: 0.25rem; } .chevronIcon { transition: transform 0.2s ease-in-out; } .chevronIconOpen { transform: rotate(180deg); } .menu { position: relative; width: fit-content; } .menuDropdown { position: absolute; right: 0; top: calc(100% + 0.5rem); padding: 0.375rem; border-radius: 0.75rem; border: 1px solid var(--color-separator); 
background-color: var(--color-background); box-shadow: 0 12px 32px rgb(10 11 36 / 20%); min-width: 17rem; max-width: min(20rem, calc(100vw - 1.5rem)); z-index: 2; display: flex; flex-direction: column; gap: 0.25rem; } @media (max-width: 996px) { .menuDropdown { left: 0; right: auto; width: min(20rem, calc(100vw - 1.5rem)); } } .menuOption { display: flex; gap: 0.5rem; padding: 0.25rem 0.5rem; border-radius: 0.5rem; transition: background-color 0.15s ease-in-out; } .menuOption:hover { background: var(--color-hover); } .menuOptionWrapper { border: none; background: transparent; padding: 0; text-align: left; width: 100%; display: block; text-decoration: none; color: inherit; cursor: pointer; outline: none; } .menuOptionWrapper:focus-visible .menuOption { background: var(--color-hover); outline-offset: -2px; } .menuOptionIcon, .menuOptionExternalIcon { flex-shrink: 0; } .menuOptionIcon { margin-top: 0.2rem; } .menuOptionText { flex: 1; display: flex; flex-direction: column; gap: 0.125rem; line-height: 1rem; padding: 4px 0; } .menuOptionLabel { margin: 0; font-size: 0.875rem; line-height: 1rem; font-weight: 400; color: var(--ifm-font-color-base); } .menuOptionDescription { margin: 0; font-size: 0.8rem; color: var(--color-text-subtle); } ================================================ FILE: website/src/components/RunnableCodeBlock.jsx ================================================ import React from 'react'; import clsx from 'clsx'; import CodeBlock from '@theme/CodeBlock'; import Link from '@docusaurus/Link'; import styles from './RunnableCodeBlock.module.css'; const PYTHON_ACTOR_RUNNER = 'HH9rhkFXiZbheuq1V' const RunnableCodeBlock = ({ children, actor, hash, ...props }) => { hash = hash ?? 
children.hash; if (!children.code) { throw new Error(`RunnableCodeBlock requires "code" and "hash" props Make sure you are importing the code block contents with the roa-loader.`); } if (!hash) { return ( { children.code } ); } const href = `https://console.apify.com/actors/${actor ?? PYTHON_ACTOR_RUNNER}?runConfig=${hash}&asrc=run_on_apify`; return (
    Run on { children.code }
    ); }; export default RunnableCodeBlock; ================================================ FILE: website/src/components/RunnableCodeBlock.module.css ================================================ .button { display: inline-block; padding: 3px 10px; position: absolute; top: calc(var(--ifm-pre-padding) / 2); right: 9px; z-index: 1; font-size: 16px; line-height: 28px; background: var(--prism-background-color); color: var(--prism-color); border: 1px solid var(--ifm-color-emphasis-300); border-radius: var(--ifm-global-radius); opacity: 0.7; font-weight: 600; width: 155px; } @media screen and (max-width: 768px) { .button { display: none; } } .button svg { height: 20px; position: absolute; top: 7.5px; right: 0; } .button:hover { opacity: 1; color: var(--prism-color); } .container { position: relative; } ================================================ FILE: website/src/css/custom.css ================================================ @import url('https://fonts.googleapis.com/css2?family=Be+Vietnam+Pro:wght@400;600;700&display=swap'); html[data-theme='dark'] { --ifm-navbar-background-color: #1a1b23; --ifm-background-color: #1a1b21; --ifm-background-surface-color: #242736; --ifm-font-color-base: #f2f3fb; --ifm-pre-background: #242736; --ifm-color-primary: #5d9df1; --ifm-link-color: #5d9df1; --ifm-heading-color: #f2f3fb; --ifm-navbar-link-color: #f2f3fb; --ifm-menu-color-active: #b2b8cc; --docusaurus-highlighted-code-line-bg: rgba(255, 255, 255, 0.1); --docsearch-text-color: #8c93a8; --docsearch-highlight-color: #f3f4fa; --color-background: #1a1b21; --color-background-subtle: #2a2d39; --color-background-muted: #252832; --color-field-background: #101114; --color-separator: #343847; --color-border: #414758; --color-card-background: #1e2027; --color-card-background-hover: #252832; --color-text: #f3f4fa; --color-text-subtle: #8c93a8; --color-text-muted: #b2b8cc; --color-text-on-primary: #1a1b21; --color-text-placeholder: #6e758a; --color-black-action: #fff; --color-icon: 
#b2b8cc; --color-hover: #2d313e; --color-primary-action-hover: #d1d5e4; } :root { /* use default system font based on https://devhints.io/css-system-font-stack */ --ifm-font-family-base: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif; --ifm-heading-font-family: 'Lota Grotesque', sans-serif; --ifm-font-weight-semibold: 600; --ifm-font-color-base: #242736; --ifm-navbar-item-padding-horizontal: 0; --ifm-navbar-item-padding-vertical: 0; --ifm-navbar-sidebar-width: 100%; --ifm-navbar-link-color: #41465d; --ifm-navbar-shadow: none; --ifm-heading-margin-top: var(--ifm-heading-margin-bottom); --ifm-hero-background-color: transparent; --ifm-code-background: var(--ifm-pre-background) !important; --ifm-code-padding-horizontal: 0.4rem; --ifm-code-padding-vertical: 0.2rem; --ifm-color-primary-lightest: #5d9df1; --ifm-color-primary-lighter: #3a87ee; --ifm-color-primary-light: #2e80ed; --ifm-color-primary: #1672eb; --ifm-color-primary-dark: #1266d5; --ifm-color-primary-darker: #1161c9; --ifm-color-primary-darkest: #0e50a6; --ifm-link-color: hsl(214, 84%, 50%); --ifm-link-hover-color: hsl(214, 84%, 65%); --ifm-link-hover-decoration: none; --ifm-pre-padding: 1.6rem; --ifm-footer-background-color: #272c3d; --ifm-footer-title-color: #f2f3fb; --ifm-footer-link-color: #f2f3fb; --ifm-menu-color-active: #555d76; --max-layout-width: 1680px; --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); --docsearch-highlight-color: #242836; --ifm-heading-color: #242736; --docsearch-text-color: #6c7590; --docsearch-highlight-color: #242836; --color-background: #fff; --color-background-subtle: #f3f4fa; --color-background-muted: #f8f9fc; --color-field-background: #f8f9fc; --color-separator: #e0e3f2; --color-border: #d0d5e9; --color-card-background: #fff; --color-card-background-hover: #f8f9fc; --color-text: #242836; --color-text-subtle: #6c7590; --color-text-muted: #555d76; --color-text-on-primary: 
#fff; --color-text-placeholder: #969eb8; --color-black-action: #272d3e; --color-icon: #555d76; --color-hover: #eef0f8; --color-primary-action-hover: #2b3143; } footer, nav { --max-layout-width: 1200px; } @font-face { font-family: 'Lota Grotesque'; src: url('/font/lota.woff2') format('woff2'), url('/font/lota.woff') format('woff'); font-weight: 600; } .footer__title { font-size: 1.25rem; font-weight: 600; } html .DocSearch-Button { border-radius: 6px !important; font-weight: 400 !important; background: var(--color-field-background) !important; border: 1px solid var(--color-border) !important; width: 256px; height: 40px; padding: 0; padding-inline: 4px; /* Annoying, but needed */ /* https://stackoverflow.com/questions/26140050/why-is-font-family-not-inherited-in-button-tags-automatically/26140154 */ font-family: inherit; color: var(--color-text-placeholder); &:hover { color: var(--color-text-muted); box-shadow: none !important; background: var(--color-field-background) !important; } } .DocSearch-Button-Placeholder { display: block !important; font-size: 16px !important; } .DocSearch-Search-Icon { display: none; } div[class*="navbarSearchContainer"] { position: static; } html[data-theme="dark"] .DocSearch-Button { background: none; border: 1px solid var(--docsearch-muted-color); } html[data-theme="dark"] .DocSearch-Button .DocSearch-Search-Icon { color: var(--docsearch-muted-color); } html.plugin-pages .main-wrapper { overflow-x: hidden; } .main-wrapper > div { max-width: var(--max-layout-width); } aside > div > a { padding-left: 16px; } aside > div > a > b { display: none; } @media (max-width: 1200px) { .navbar__toggle { display: inherit; } .navbar__item { display: none; } } @media (max-width: 767px) { .navbar__items--right > div, .navbar__items--right > a { display: none; } } .navbar__toggle { margin: 0; padding: 8px !important; svg { color: var(--color-icon); width: 20px; height: 20px; } } .navbar__title { /* Replaced by SVG */ display: none; } .navbar__inner { /* 
.container */ max-width: var(--max-layout-width); margin: auto; width: 100%; } .navbar__items { height: 28px; @media (min-width: 768px) { height: 40px; } } .navbar__items--right { gap: 16px; } .navbar__item, .navbar__link { font-size: 16px; font-weight: 500; line-height: 24px; /* 150% */ padding: 0; color: var(--color-text); border-radius: 12px; &:hover, &:focus { color: var(--color-text-muted); background: var(--color-background-muted); } } .navbar__item { padding: 4px 8px; } .navbar__item.dropdown { padding: 4px 16px 4px 8px; a { display: inline-flex; } } .navbar__link--active { color: var(--color-text-muted); background: var(--color-background-muted); } .dropdown > .navbar__link::after { border-color: currentColor; border-style: solid; border-width: 0.1em 0.1em 0 0; content: ''; display: inline-block; height: 0.3em; left: 0.3em; position: relative; vertical-align: top; width: 0.3em; top: 8px; transform: rotate(135deg); } .navbar { border-bottom: 1px solid var(--color-separator); height: auto; background: var(--color-background); padding: 16px; @media (min-width: 768px) { padding: 20px 40px; } @media (min-width: 1024px) { padding: 20px 64px; } } nav[class*='navbarHidden'] { div[class*='navbarLogo'] { display: none; } } .navbar .icon { font-size: 0; padding: 4px; margin-left: 20px; line-height: 0; } .navbar .icon::before { content: ''; display: block; width: 24px; height: 24px; background-size: cover; } .navbar svg[class*="iconExternalLink"], aside svg[class*="iconExternalLink"] { display: none; } header.hero div[class^="heroButtons"] { justify-content: inherit; } article .card h2 { margin-top: 0; } .tsd-kind-icon, .menu__link, .table-of-contents__link { text-overflow: ellipsis; width: 100%; overflow: hidden; white-space: nowrap; } .tsd-flag { user-select: none; } .menu__caret:before, .menu__link--sublist:after { float: right; } .table-of-contents__link { height: 20px; } nav.navbar .dropdown__menu { top: 32px; min-width: 6rem; background: 
var(--color-card-background); border: 1px solid var(--color-border); } .dropdown__menu .dropdown__link { width: 100%; border-radius: 8px; } .dropdown__menu .dropdown__link--active { color: var(--color-text-muted); background: var(--color-background-muted); } .dropdown__menu .dropdown__link:hover, .dropdown__menu .dropdown__link--active:hover { background: var(--color-background-muted); color: var(--color-text-muted); } .navbar__logo { height: 2rem; } .navbar__logo_appendix { margin-left: -30px; font-weight: bold; } .navbar__logo_appendix_sidebar { display: block; position: absolute; top: 18px; left: 213px; } .main-wrapper { align-items: safe center; } .main-wrapper > div { width: calc(min(100%, var(--max-layout-width))) !important; } .main-wrapper a[class*="sidebarLogo"] { margin: 0; b { display: none; } img { height: 28px; margin-top: 4px; margin-bottom: 24px; margin-left: 24px; } } div[class*="sidebarViewport"] { top: 22px; } html.plugin-pages { font-size: 18px; line-height: 32px; } html.plugin-pages h2 { font-size: 36px; line-height: 48px; } html.plugin-docs .theme-doc-markdown { font-size: 18px; line-height: 32px; } html.plugin-docs .theme-doc-markdown h1 { font-weight: 600; font-size: 48px; line-height: 64px; color: #000; } html[data-theme="dark"].plugin-docs .theme-doc-markdown h1 { color: #fff; } html.plugin-typedoc-api .theme-doc-markdown h1 { color: #000; } html[data-theme="dark"].plugin-typedoc-api .theme-doc-markdown h1 { color: #fff; } html.plugin-docs .theme-doc-markdown h2 { font-size: 36px; line-height: 48px; } html.plugin-docs .theme-doc-markdown h3 { font-size: 28px; line-height: 36px; /*color: #242736;*/ } .theme-doc-toc-desktop .table-of-contents { font-size: 16px; line-height: 24px; } .theme-doc-sidebar-menu .menu__link, .theme-doc-toc-desktop .table-of-contents .toc-highlight { height: auto; color: #6f7490; background: none; } .theme-doc-sidebar-menu .menu__link:hover { background: inherit; } .theme-doc-sidebar-menu .menu__link { font-weight: 
400; } .theme-doc-sidebar-menu .menu__link--active { font-weight: 700; color: var(--color-text-muted); } .theme-doc-sidebar-menu .menu__list-item-collapsible, .theme-doc-sidebar-menu .menu__list-item-collapsible--active { background: none; } .theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active { font-weight: 700; } html[data-theme='dark'] .theme-doc-sidebar-menu .menu__link, html[data-theme='dark'] .theme-doc-toc-desktop .table-of-contents .toc-highlight { color: #b3b8d2; } html[data-theme='dark'] .theme-doc-sidebar-menu .menu__link--active, html[data-theme='dark'] .theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active { color: #f2f3fb; } .theme-doc-sidebar-menu .menu__link:hover, .theme-doc-sidebar-menu .menu__link--active, .theme-doc-toc-desktop .table-of-contents .table-of-contents__link:hover, .theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active { color: #242736; } .hero { position: relative; } .apiItemContainer .tsd-readme h1:first-child { display: none; } html .theme-doc-sidebar-container { border: 0; } html .theme-doc-sidebar-container button { border: 0; border-radius: 10px; } html .table-of-contents { border-left: 0; } html .table-of-contents ul { border-left: 2px solid #dfe2f5; } html.plugin-typedoc-api .theme-doc-sidebar-menu > li:first-child::before, html.plugin-typedoc-api .theme-doc-sidebar-menu > li:nth-child(6)::before { text-transform: uppercase; font-size: 18px; line-height: 28px; color: #6f7490; padding: 20px 12px; } /* html.plugin-typedoc-api .theme-doc-sidebar-menu > li:first-child::before { display: block; content: 'Core'; } html.plugin-typedoc-api .theme-doc-sidebar-menu > li:nth-child(6)::before { display: block; content: 'Advanced'; padding-top: 60px; } */ #giscus-comments { display: block; margin-top: 50px; } .video-container { margin: 85px auto 0; max-width: 560px; overflow: hidden; position: relative; width: 100%; border-radius: 10px; } .yt-lite > .lty-playbtn { border: 0; 
cursor: pointer; } @media screen and (min-width: 768px) { .runnable-code-block .code-block.no-title pre + div { position: absolute; right: 170px; line-height: 28px; } } .runnable-code-block .code-block button { height: 36px; margin-top: 1px; } .runnable-code-block:hover .code-block button { opacity: 0.4; } html[data-theme='dark'] .runnable-code-block svg .apify-logo { fill: #fff; } /* * Reset the line-number counter for each .prism-code scope */ .prism-code { counter-reset: line-number; } /* * Notice the chained .language-ts class name to .prism-code * You can chain more languages in order to add line numbers */ .prism-code.language-ts .token-line::before, .prism-code.language-typescript .token-line::before, .prism-code.language-javascript .token-line::before, .prism-code.language-json .token-line::before, .prism-code.language-json5 .token-line::before, .prism-code.language-python .token-line::before, .prism-code.language-dockerfile .token-line::before, .prism-code.language-XML .token-line::before, .prism-code.language-js .token-line::before, .prism-code.language-python .token-line::before { counter-increment: line-number; content: counter(line-number); margin-right: calc(var(--ifm-pre-padding) * 0.8); text-align: right; min-width: 1.5rem; display: inline-block; opacity: .3; position: sticky; left: var(--ifm-pre-padding); } div[class^="announcementBar_"] { background: #4585b6; color: #fff; } div[class^="announcementBar_"] button { color: #fff; } .markdown blockquote { --ifm-alert-background-color: var(--ifm-color-info-contrast-background); --ifm-alert-background-color-highlight: rgba(84,199,236,.15); --ifm-alert-foreground-color: var(--ifm-color-info-contrast-foreground); --ifm-alert-border-color: var(--ifm-color-info-dark); --ifm-code-background: var(--ifm-alert-background-color-highlight); --ifm-link-color: var(--ifm-alert-foreground-color); --ifm-link-hover-color: var(--ifm-alert-foreground-color); --ifm-link-decoration: underline; --ifm-tabs-color: 
var(--ifm-alert-foreground-color); --ifm-tabs-color-active: var(--ifm-alert-foreground-color); --ifm-tabs-color-active-border: var(--ifm-alert-border-color); background-color: var(--ifm-alert-background-color); border: var(--ifm-alert-border-width) solid var(--ifm-alert-border-color); border-left-width: var(--ifm-alert-border-left-width); border-radius: var(--ifm-alert-border-radius); box-shadow: var(--ifm-alert-shadow); padding: var(--ifm-alert-padding-vertical) var(--ifm-alert-padding-horizontal); } .tsd-parameters li { margin-bottom: 16px; } .tsd-parameters-title { font-size: 16px; margin-bottom: 16px !important; } .tsd-returns-title { font-size: 16px; } .DocSearch-Button-Key { background: var(--color-background-subtle) !important; box-shadow: none !important; border: 1px solid var(--color-border) !important; padding: 0 !important; color: var(--color-text-muted) !important; } .navbar-sidebar__brand { border-bottom: 1px solid var(--color-separator); flex-direction: column; height: auto; padding: 0; } .menu-primary { padding: 0; .menu__list-item { border-bottom: 1px solid var(--color-separator); margin: 0px 24px !important; a { margin: 8px 0px 4px; padding: 8px; } display: flex; } .menu__link { font-size: 16px; font-weight: 500; line-height: 24px; } } .navbar-sidebar__close { margin-left: 16px; svg { g { stroke: var(--color-icon); } width: 32px; height: 32px; padding: 8px; } } .DocSearch-Modal { font-family: var(--ifm-font-family-base); border-radius: 8px !important; border: 1px solid var(--color-border) !important; background: var(--color-card-background) !important; box-shadow: none !important; button { font-family: var(--ifm-font-family-base); } .DocSearch-Logo { display: none; } .DocSearch-Footer { flex-direction: row; border-top: 1px solid var(--color-border); background: var(--color-background); box-shadow: none; } .DocSearch-Label { color: var(--color-text-subtle); font-size: 14px; font-weight: 400; line-height: 20px; } .DocSearch-Commands-Key { 
border-radius: 4px; border: 1px solid var(--color-border); background: var(--color-background-subtle); box-shadow: none; g { stroke: var(--color-text-subtle); } } .DocSearch-Clear { color: var(--color-text-subtle); } .DocSearch-Form { border-radius: 6px; border-radius: var(--Radius-6, 6px); border: 1px solid var(--color-border); background: var(--color-background); box-shadow: none; height: 40px; padding: 8px 12px; } .DocSearch-Input { color: var(--color-text); font-size: 14px; line-height: 20px; padding: 0; } .DocSearch-Input::placeholder { color: var(--color-text-placeholder); font-style: italic; } .DocSearch-Search-Icon { width: 16px; height: 16px; path { stroke: var(--color-text-muted); } } .DocSearch-Reset { display: none; } .DocSearch-Help { color: var(--color-text-subtle); } .DocSearch-Hit-source { color: var(--color-text-subtle); font-size: 14px; font-weight: 400; line-height: 20px; padding-bottom: 4px; padding-left: 12px; background: var(--color-card-background); } .DocSearch-Hit { background: transparent; a { background: transparent !important; padding: 0; box-shadow: none; } a:hover { background: var(--color-hover) !important; } } .DocSearch-Hit[aria-selected='true'] a { background: var(--color-hover) !important; } .DocSearch-Hit-Container { background: transparent; height: 50px; } .DocSearch-Screen-Icon { display: none; } .DocSearch-NoResults { margin: 0; display: flex; flex-direction: column; width: 100%; padding: 16px 8px; gap: 24px; .DocSearch-Title { color: var(--color-text); font-size: 16px; font-weight: 500; line-height: 24px; width: fit-content; margin: 0; } } .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-title, .DocSearch-Hit-title { color: var(--color-text) !important; font-size: 16px; font-style: normal; font-weight: 500; line-height: 24px; /* 150% */ } .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-path, .DocSearch-Hit-path, .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-action, .DocSearch-Hit-action, 
.DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-icon, .DocSearch-Hit-icon, .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-Tree, .DocSearch-Hit-Tree { color: var(--color-text-muted) !important; } .DocSearch-Hit[aria-selected='true'] mark, .DocSearch-Hit mark { color: var(--color-text-subtle) !important; } .DocSearch-Help { color: var(--color-text-subtle); font-size: 14px; font-weight: 400; line-height: 16px; } .DocSearch-NoResults-Prefill-List { padding: 0; li { list-style-type: none; margin-top: 4px; } } .DocSearch-Prefill { color: var(--color-text); font-size: 14px; font-weight: 500; line-height: 20px; &:hover { color: var(--color-text-subtle); text-decoration: none; } } .DocSearch-HitsFooter { color: var(--color-text-subtle); font-size: 14px; font-weight: 400; line-height: 16px; a { border: none; } a:hover { color: var(--color-text); } } .DocSearch-Hit-icon { margin-left: 8px; width: auto; height: auto; svg { width: 16px; height: 16px; } } li[id*='recentSearches'] { .DocSearch-Hit-icon { display: none; } } .DocSearch-SearchBar { padding: 16px 16px 8px; } .DocSearch-Hit-Select-Icon { display: none !important; } .DocSearch-Dropdown { padding: 0 8px; } .DocSearch-Cancel { color: var(--color-text-subtle); font-size: 14px; font-weight: 500; line-height: 20px; &:hover { color: var(--color-text); } } .DocSearch-NoResults-Prefill-List ul { padding: 0; } } ================================================ FILE: website/src/pages/home_page_example.py ================================================ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext async def main() -> None: crawler = PlaywrightCrawler( max_requests_per_crawl=10, # Limit the max requests per crawl. headless=True, # Run in headless mode (set to False to see the browser). browser_type='firefox', # Use Firefox browser. ) # Define the default request handler, which will be called for every request. 
@crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: context.log.info(f'Processing {context.request.url} ...') # Extract data from the page using Playwright API. data = { 'url': context.request.url, 'title': await context.page.title(), } # Push the extracted data to the default dataset. await context.push_data(data) # Extract all links on the page and enqueue them. await context.enqueue_links() # Run the crawler with the initial list of URLs. await crawler.run(['https://crawlee.dev']) # Export the entire dataset to a CSV file. await crawler.export_data('results.csv') # Or access the data directly. data = await crawler.get_data() crawler.log.info(f'Extracted data: {data.items}') if __name__ == '__main__': asyncio.run(main()) ================================================ FILE: website/src/pages/index.js ================================================ /* eslint-disable max-len */ import Link from '@docusaurus/Link'; import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import CodeBlock from '@theme/CodeBlock'; import Layout from '@theme/Layout'; import ThemedImage from '@theme/ThemedImage'; import clsx from 'clsx'; import React from 'react'; import styles from './index.module.css'; import Button from '../components/Button'; import HomepageCliExample from '../components/Homepage/HomepageCliExample'; import HomepageCtaSection from '../components/Homepage/HomepageCtaSection'; import HomepageHeroSection from '../components/Homepage/HomepageHeroSection'; import LanguageInfoWidget from '../components/Homepage/LanguageInfoWidget'; import RiverSection from '../components/Homepage/RiverSection'; import RunnableCodeBlock from '../components/RunnableCodeBlock'; import ThreeCardsWithIcon from '../components/Homepage/ThreeCardsWithIcon'; import HomePageExample from '!!raw-loader!roa-loader!./home_page_example.py'; function GetStartedSection() { return (
    ); }
/**
 * Homepage section that displays the `HomePageExample` source imported at the
 * top of this file (the contents of `home_page_example.py`).
 */
function CodeExampleSection() { return (
    {HomePageExample}
    ); } /** Python snippets (benefitsCodeBlockCrawler and benefitsCodeBlockHeadless) displayed as code samples inside the "What are the benefits?" section rendered by BenefitsSection. */ const benefitsCodeBlockCrawler = `fingerprint_generator = DefaultFingerprintGenerator( header_options=HeaderGeneratorOptions( browsers=['chromium', 'firefox'], devices=['mobile'], locales=['en-US'] ), )`; // TODO: const benefitsCodeBlockHeadless = `crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser() @crawler.router.default_handler async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: prices = await context.query_selector_all('span.price') await context.enqueue_links()`; function BenefitsSection() { return (

    What are the benefits?

    {benefitsCodeBlockCrawler} } to="/docs/guides/avoid-blocking" />
    } reversed to="/docs/quick-start#choose-your-crawler" />
    {benefitsCodeBlockHeadless} } to="/api" />
    ); }
/**
 * Homepage section headed "What else is in Crawlee?". It presents the
 * "Auto scaling" and "Smart proxy rotation" highlights followed by a list of
 * linked feature cards (queue and storage, scraping utils, routing).
 */
function OtherFeaturesSection() { return (

    What else is in Crawlee?

    Auto scaling

    Crawlers automatically adjust concurrency based on available system resources. Avoid memory errors in small containers and run faster in large ones.

    Smart proxy rotation

    Crawlee uses a pool of sessions represented by different proxies to maintain the proxy performance and keep IPs healthy. Blocked proxies are removed from the pool automatically.
    ), title: 'Queue and storage', description: 'Pause and resume crawlers thanks to a persistent queue of URLs and storage for structured data.', to: '/docs/guides/storages', }, { icon: ( ), title: 'Handy scraping utils', description: 'Sitemaps, infinite scroll, contact extraction, large asset blocking and many more utils included.', to: '/docs/guides/avoid-blocking', }, { icon: ( ), title: 'Routing & middleware', description: 'Keep your code clean and organized while managing complex crawls with a built-in router that streamlines the process.', to: '/api/class/Router', }, ]} />
    ); } function DeployToCloudSection() { return (

    Deploy to cloud

    Crawlee, by Apify, works anywhere, but Apify offers the best experience. Easily turn your project into an{' '} Actor —a serverless micro-app with built-in infra, proxies, and storage.
    1
    Install Apify SDK and Apify CLI.
    2
    Add
    Actor.init()
    to the beginning and{' '}
    Actor.exit()
    to the end of your code.
    3
    Use the Apify CLI to push the code to the Apify platform.
    ); } function BuildFastScrapersSection() { return (

    Crawlee helps you build scrapers faster

    ), title: 'Zero setup required', description: 'Copy code example, install Crawlee and go. No CLI required, no complex file structure, no boilerplate.', actionLink: { text: 'Get started', href: '/docs/quick-start', }, }, { icon: ( ), title: 'Reasonable defaults', description: 'Unblocking, proxy rotation and other core features are already turned on. But also very configurable.', actionLink: { text: 'Learn more', href: '/docs/examples', }, }, { icon: ( ), title: 'Helpful community', description: 'Join our Discord community of over 10k developers and get fast answers to your web scraping questions.', actionLink: { text: 'Join Discord', href: 'https://discord.gg/jyEM2PRvMU', }, }, ]} />
    ); } export default function JavascriptHomepage() { const { siteConfig } = useDocusaurusContext(); return (
    ); } ================================================ FILE: website/src/pages/index.module.css ================================================ /************* PAGE LAYOUT *************/ #homepageContainer { width: calc(100% - 48px) !important; max-width: 1200px !important; border-left: 1px solid var(--color-separator); border-right: 1px solid var(--color-separator); margin: 0 24px; } .dashedSeparator { position: relative; width: 100%; border-bottom: 1px dashed var(--color-separator); } .dashedSeparatorVertical { position: relative; border-right: 1px dashed var(--color-separator); } .dashedDecorativeCircle { width: 120px; height: 120px; border: 1px dashed var(--color-separator); border-radius: 50%; position: absolute; transform: translate(-50%, -50%); } .fadedOutSeparator { border: none; height: 1px; background-image: linear-gradient( 90deg, transparent, transparent 50%, var(--color-background) 50%, var(--color-background) 100% ), linear-gradient( 90deg, var(--color-separator) 0%, transparent 50%, var(--color-separator) 100% ); background-size: 6px 1px, 100% 1px; } .fadedOutSeparatorVertical { border: none; width: 1px; background-image: linear-gradient( 180deg, transparent, transparent 50%, var(--color-background) 50%, var(--color-background) 100% ), linear-gradient( 180deg, var(--color-separator) 0%, transparent 50%, var(--color-separator) 100% ); background-size: 1px 6px, 1px 100%; } .trianglesSeparator { width: 100%; height: 32px; background-position: center; background-repeat: repeat-x; background-image: url("../../static/img/triangles_light.svg"); html[data-theme="dark"] & { background-image: url("../../static/img/triangles_dark.svg"); } /* TABLET */ @media (min-width: 768px) { background-position: unset; background-repeat: repeat; height: 52px; } } /* most separators and decorations are not displayed on mobile */ .dashedSeparatorVertical, .dashedDecorativeCircle, .fadedOutSeparator, .fadedOutSeparatorVertical { display: none; } /* TABLET */ @media 
(min-width: 768px) { .dashedSeparatorVertical, .dashedDecorativeCircle, .fadedOutSeparator, .fadedOutSeparatorVertical { display: block; } #homepageContainer { width: calc(100% - 80px) !important; margin: 0 40px; } } /* DESKTOP */ @media (min-width: 1024px) { .dashedSeparatorVertical, .dashedDecorativeCircle, .fadedOutSeparator, .fadedOutSeparatorVertical { display: block; } #homepageContainer { width: calc(100% - 128px) !important; margin: 0 64px; } } /************* LANGUAGE GET STARTED SECTION *************/ .languageGetStartedSection { display: flex; flex-direction: column; gap: 32px; margin: 0 0 32px 0; div[class^="languageGetStartedContainer"] { flex: 1; } } /* TABLET */ @media (min-width: 768px) { .languageGetStartedSection { flex-direction: row; align-items: stretch; justify-content: space-around; gap: 0; margin: 0; } } /************* CODE EXAMPLE SECTION *************/ .codeExampleTopSeparator { display: none; } @media (min-width: 768px) { .codeExampleTopSeparator { display: block; } } .languageSwitchContainer { place-self: center; margin: 32px 0 16px 0; } .codeBlockContainer { :global(.theme-code-block) { margin-bottom: 32px; border-radius: 0; box-shadow: none; border-bottom: 1px dashed var(--color-separator); border-top: 1px dashed var(--color-separator); code { font-size: 14px; background: var(--color-background-muted); padding: 16px 8px 16px 4px; span::before { margin-right: 16px !important; left: unset !important; margin-right: 16px !important; color: var(--color-text-subtle) !important; opacity: 1 !important; } } } } /* TABLET */ @media (min-width: 768px) { .codeBlockContainer :global(.theme-code-block) { margin-bottom: 0; border-bottom: none; border-top: none; } .codeExampleSection { position: relative; } .languageSwitchContainer { margin: 0; position: absolute; top: calc(46px - 18px); left: calc(50% - 90px); } .decorativeRow { position: relative; height: 46px; border-bottom: 1px dashed var(--color-separator); &::before { content: " "; position: 
absolute; left: 40px; height: 100%; border-right: 1px dashed var(--color-separator); } &::after { content: " "; position: absolute; right: 40px; height: 100%; border-left: 1px dashed var(--color-separator); } } .codeBlockContainer { margin: 0 40px; border-left: 1px dashed var(--color-separator); border-right: 1px dashed var(--color-separator); } } @media (min-width: 1024px) { .decorativeRow { &::before { left: 60px; } &::after { right: 60px; } } .codeBlockContainer { margin: 0 60px; } } #ctaDecorativeCircle { width: 120px; height: 120px; } /************** BENEFITS SECTION ***********/ .benefitsSection { margin-bottom: 60px; h2 { margin: 32px 0; text-align: center; padding: 0 12px; /* TABLET */ @media (min-width: 768px) { margin: 80px 0; } } } /************** OTHER FEATURES SECTION ***********/ .otherFeaturesSection { display: flex; flex-direction: column; h2 { padding: 32px 12px; text-align: center; color: var(--color-text); font-weight: 400; line-height: 46px !important; font-size: 36px !important; @media (min-width: 768px) { line-height: 56px !important; font-size: 48px !important; margin: 80px 0 64px; padding: 32px 24px; } } margin-bottom: 40px; @media (min-width: 768px) { margin-bottom: 80px; } } .cardsWithContentContainer { display: flex; flex-direction: column; gap: 20px; background-position-x: 5px; background-image: url("../../static/img/triangles_light.svg"); html[data-theme="dark"] & { background-image: url("../../static/img/triangles_dark.svg"); } @media (min-width: 768px) { gap: 48px; } } .cardsWithImageContainer { display: flex; flex-direction: column; gap: 20px; width: 100%; @media (min-width: 768px) { gap: 32px; flex-direction: row; } } .cardWithImage { flex: 1; display: flex; flex-direction: column; overflow: hidden; background: var(--color-card-background); border-block: 1px solid var(--color-separator); transition: background 0.1s ease; @media (min-width: 768px) { border: 1px solid var(--color-separator); } &:first-child { border-left: 0; } 
&:last-child { border-right: 0; } &:hover { background: var(--color-card-background-hover); } } .cardWithImage img { width: 100%; height: 250px; object-fit: cover; } .cardWithImage:last-child img { object-position: left 90%; } .cardWithImageText { padding: 40px 24px; border-top: 1px solid var(--color-separator); } .cardWithImageTitle { margin: 0; color: var(--color-text); font-size: 26px; font-style: normal; font-weight: 400; line-height: 34px; } .cardWithImageDescription { margin-top: 12px; color: var(--color-text-muted); font-family: var(--ifm-font-family-base); font-size: 16px; font-style: normal; font-weight: 400; line-height: 24px; } /************** DEPLOY TO CLOUD SECTION ***********/ .deployToCloudSection { padding: 32px 16px; display: flex; flex-direction: column; align-items: center; gap: 48px; } .deployToCloudLeftSide { display: flex; flex-direction: column; flex-basis: 50%; gap: 24px; text-align: center; font-style: normal; font-weight: 400; a { width: fit-content; margin: auto; } h2 { color: var(--color-text); font-family: "Lota Grotesque"; font-size: 38px; line-height: 46px; } } .deployToCloudDescription { color: var(--color-text-muted); font-size: 16px; line-height: 24px; a { color: inherit; text-decoration: underline; } } .deployToCloudRightSide { display: flex; flex-direction: column; gap: 24px; flex-basis: 50%; position: relative; } .deployToCloudStep { display: flex; flex-direction: row; gap: 16px; align-items: center; } .deployToCloudStepNumber { display: flex; justify-content: center; align-items: center; width: 72px; height: 72px; padding: 16px; border-radius: 8px; border: 1px solid var(--color-separator); background: var(--color-background); color: var(--color-text-muted); font-size: 16px; font-style: normal; font-weight: 400; line-height: 24px; z-index: 1; div { display: flex; justify-content: center; align-items: center; height: 40px; width: 40px; border-radius: 50%; border: 1px dashed var(--color-separator); flex-shrink: 0; } } 
.deployToCloudStepText { display: inline-flex; align-items: baseline; flex-wrap: wrap; gap: 4px; color: var(--color-text); font-size: 14px; font-style: normal; font-weight: 500; line-height: 20px; pre { margin: 0; padding: 0; background-color: transparent; } } #verticalStepLine { position: absolute; left: 36px; height: 100%; z-index: 0; } /* TABLET */ @media (min-width: 768px) { .deployToCloudSection { padding: 96px 40px; flex-direction: row; } .deployToCloudLeftSide { text-align: left; a { margin: 0; } h2 { color: var(--color-text); font-family: "Lota Grotesque"; font-size: 48px; line-height: 58px; } } .deployToCloudDescription { font-size: 18px; line-height: 28px; } } /************** BUILD SCRAPERS FAST SECTION ***********/ .buildFastScrapersSection { position: relative; padding: 40px 0 32px; border-bottom: 1px solid var(--color-separator); h2 { margin: 0; padding: 32px 0; text-align: center; color: var(--color-text); font-weight: 400; padding-inline: 12px; line-height: 46px !important; font-size: 36px !important; @media (min-width: 768px) { padding-inline: 24px; line-height: 56px !important; font-size: 48px !important; padding: 80px 0 64px; } } div[class*="dashedDecorativeCircle"] { display: none; } @media (min-width: 1024px) { padding: 80px 0 60px; div[class*="dashedDecorativeCircle"] { display: block; } } } .buildFastScrapersContent { border-block: 1px solid var(--color-separator); } ================================================ FILE: website/src/plugins/docusaurus-plugin-segment/index.js ================================================ const path = require('path'); module.exports = function (context, options) { const { writeKey, allowedInDev = false } = options; return { name: 'docusaurus-plugin-segment', getClientModules() { return [path.resolve(__dirname, './segment')]; }, injectHtmlTags() { if (process.env.NODE_ENV !== 'production' && !allowedInDev) { return {}; } if (!writeKey) { console.warn('You need to specify a Segment writeKey in the plugin 
options'); return {}; } return { headTags: [ { tagName: 'script', innerHTML: ` !function(){var i="analytics",analytics=window[i]=window[i]||[];if(!analytics.initialize)if(analytics.invoked)window.console&&console.error&&console.error("Segment snippet included twice.");else{analytics.invoked=!0;analytics.methods=["trackSubmit","trackClick","trackLink","trackForm","pageview","identify","reset","group","track","ready","alias","debug","page","screen","once","off","on","addSourceMiddleware","addIntegrationMiddleware","setAnonymousId","addDestinationMiddleware","register"];analytics.factory=function(e){return function(){if(window[i].initialized)return window[i][e].apply(window[i],arguments);var n=Array.prototype.slice.call(arguments);if(["track","screen","alias","group","page","identify"].indexOf(e)>-1){var c=document.querySelector("link[rel='canonical']");n.push({__t:"bpc",c:c&&c.getAttribute("href")||void 0,p:location.pathname,u:location.href,s:location.search,t:document.title,r:document.referrer})}n.unshift(e);analytics.push(n);return analytics}};for(var n=0;n { // Don't track page views on development if (process.env.NODE_ENV === 'production' && window.analytics) { window.analytics.page({ app: 'crawlee', path: window.location.pathname, url: window.location.href, search: window.location.search, }); } }, 0); }, } : null; ================================================ FILE: website/src/theme/ColorModeToggle/index.js ================================================ import { translate } from '@docusaurus/Translate'; import useIsBrowser from '@docusaurus/useIsBrowser'; import clsx from 'clsx'; import React from 'react'; import IconDarkMode from './dark-mode-icon.svg'; import IconLightMode from './light-mode-icon.svg'; import styles from './styles.module.css'; function ColorModeToggle({ className, value, onChange, }) { const isBrowser = useIsBrowser(); const title = translate( { message: 'Switch between dark and light mode (currently {mode})', id: 
'theme.colorToggle.ariaLabel', description: 'The ARIA label for the navbar color mode toggle', }, { mode: value === 'dark' ? translate({ message: 'dark mode', id: 'theme.colorToggle.ariaLabel.mode.dark', description: 'The name for the dark color mode', }) : translate({ message: 'light mode', id: 'theme.colorToggle.ariaLabel.mode.light', description: 'The name for the light color mode', }), }, ); return (
    ); } export default React.memo(ColorModeToggle); ================================================ FILE: website/src/theme/ColorModeToggle/styles.module.css ================================================ .toggleButton { padding: 4px; display: flex; gap: 4px; align-items: center; transition: all var(--ifm-transition-fast); position: relative; border-radius: 150px; background-color: var(--color-background-subtle); } .toggleButton span { width: 44px; height: 36px; border-radius: 50%; background: #fff; position: absolute; transition: all var(--ifm-transition-fast); left: 0; margin: 4px; border-radius: 150px; background-color: var(--color-background); /* Light/L1 */ box-shadow: 0px 0.5px 1.5px 0px rgba(63, 71, 93, 0.15), 0.4px 0.8px 1px -1.2px rgba(63, 71, 93, 0.14), 1px 2px 2.5px -2.5px rgba(63, 71, 93, 0.13); } .toggleButton svg { z-index: 1; margin: 8px 12px; width: 20px; height: 20px; path { stroke: var(--color-icon); } } [data-theme='dark'] .toggleButton span { left: 48px; } .toggleButtonDisabled { cursor: not-allowed; } ================================================ FILE: website/src/theme/DocItem/Content/index.js ================================================ import { useDoc } from '@docusaurus/plugin-content-docs/client'; import LLMButtons from '@site/src/components/LLMButtons'; import Heading from '@theme/Heading'; import MDXContent from '@theme/MDXContent'; import clsx from 'clsx'; import React from 'react'; import styles from './styles.module.css'; function useSyntheticTitle() { const { metadata, frontMatter, contentTitle } = useDoc(); const shouldRender = !frontMatter.hide_title && typeof contentTitle === 'undefined'; if (!shouldRender) { return null; } return metadata.title; } export default function DocItemContent({ children }) { const syntheticTitle = useSyntheticTitle(); return (
    {syntheticTitle && (
    {syntheticTitle && {syntheticTitle}}
    )} {children}
    ); } ================================================ FILE: website/src/theme/DocItem/Content/styles.module.css ================================================ .docItemContent { display: flex; align-items: center; justify-content: space-between; gap: 1rem; flex-wrap: wrap; padding-bottom: calc( var(--ifm-h1-vertical-rhythm-bottom) * var(--ifm-leading) ); h1 { margin: 0 !important; flex: 1 1 auto; min-width: 12rem; } @media (max-width: 767px) { flex-direction: column; align-items: flex-start; gap: 0.75rem; } } ================================================ FILE: website/src/theme/DocItem/Layout/index.js ================================================ import { useDoc } from '@docusaurus/plugin-content-docs/client'; import { useWindowSize, useColorMode } from '@docusaurus/theme-common'; import Giscus from '@giscus/react'; import DocBreadcrumbs from '@theme/DocBreadcrumbs'; import DocItemContent from '@theme/DocItem/Content'; import DocItemFooter from '@theme/DocItem/Footer'; import DocItemPaginator from '@theme/DocItem/Paginator'; import DocItemTOCDesktop from '@theme/DocItem/TOC/Desktop'; import DocItemTOCMobile from '@theme/DocItem/TOC/Mobile'; import DocVersionBadge from '@theme/DocVersionBadge'; import DocVersionBanner from '@theme/DocVersionBanner'; import clsx from 'clsx'; import React from 'react'; import styles from './styles.module.css'; /** * Decide if the toc should be rendered, on mobile or desktop viewports */ function useDocTOC() { const { frontMatter, toc, } = useDoc(); const windowSize = useWindowSize(); const hidden = frontMatter.hide_table_of_contents; const canRender = !hidden && toc.length > 0; const mobile = canRender ? : undefined; const desktop = canRender && (windowSize === 'desktop' || windowSize === 'ssr') ? ( ) : undefined; return { hidden, mobile, desktop, }; } export default function DocItemLayout({ children }) { const docTOC = useDocTOC(); const { colorMode } = useColorMode(); return (
    {docTOC.mobile} {children}
    {docTOC.desktop &&
    {docTOC.desktop}
    }
    ); } ================================================ FILE: website/src/theme/DocItem/Layout/styles.module.css ================================================ .docItemContainer { margin-bottom: 50px; } .docItemContainer header + *, .docItemContainer article > *:first-child { margin-top: 0; } @media (min-width: 997px) { .docItemCol { max-width: 75% !important; } } ================================================ FILE: website/src/theme/Footer/LinkItem/index.js ================================================ import isInternalUrl from '@docusaurus/isInternalUrl'; import Link from '@docusaurus/Link'; import useBaseUrl from '@docusaurus/useBaseUrl'; import clsx from 'clsx'; import React from 'react'; import styles from './index.module.css'; export default function FooterLinkItem({ item }) { const ExternalLinkIcon = require('../../../../static/img/external-link.svg').default; const { to, href, label, prependBaseUrlToHref, className, ...props } = item; const toUrl = useBaseUrl(to); const normalizedHref = useBaseUrl(href, { forcePrependBaseUrl: true }); return ( {label} {href && !isInternalUrl(href) && } ); } ================================================ FILE: website/src/theme/Footer/LinkItem/index.module.css ================================================ .footerLink { color: var(--color-text); cursor: pointer; font-size: 14px; line-height: 20px; &:hover { color: var(--color-text-subtle); path { fill: var(--color-text-subtle); } } } .externalLinkIcon { margin-left: 5px; path { fill: var(--color-text); } } ================================================ FILE: website/src/theme/Footer/index.js ================================================ import Link from '@docusaurus/Link'; import { useThemeConfig } from '@docusaurus/theme-common'; import useBaseUrl from '@docusaurus/useBaseUrl'; import LinkItem from '@theme/Footer/LinkItem'; import NavbarColorModeToggle from '@theme/Navbar/ColorModeToggle'; import ThemedImage from '@theme/ThemedImage'; import clsx from 
'clsx'; import React from 'react'; import styles from './index.module.css'; function FooterLinksColumn({ column }) { return (
    {column.title}
      {column.items.map((item, i) => (
    • ))}
    ); } function Footer() { const { footer } = useThemeConfig(); const { links, style } = footer; const HearthIcon = require('../../../static/img/hearth.svg').default; const logoSources = { light: useBaseUrl('/img/crawlee-light.svg'), dark: useBaseUrl('/img/crawlee-dark.svg'), }; if (!footer) { return null; } return (
    Crawlee is forever free and open source
    © {new Date().getFullYear()} Apify
    ); } export default React.memo(Footer); ================================================ FILE: website/src/theme/Footer/index.module.css ================================================ .footer { background: var(--color-background); color: var(--color-text); } .footerBottom, .footerTop { border-top: 1px solid var(--color-separator); @media (min-width: 768px) { padding: 40px 40px; } @media (min-width: 1024px) { padding: 40px 64px; } } .footerTopRow { max-width: var(--max-layout-width); margin: auto; display: flex; flex-direction: column; @media (min-width: 768px) { flex-direction: row; } } .footerTopRowRight { flex-direction: column; display: flex; flex: 3; gap: 32px; padding: 16px 40px 40px; @media (min-width: 768px) { gap: 0; flex-direction: row; padding: 0; justify-content: space-between; } } .footerTopRowLeft { display: flex; flex-direction: column; justify-content: space-between; flex: 2; padding: 32px 40px 24px; gap: 32px; border-bottom: 1px solid var(--color-separator); img { display: block !important; } @media (min-width: 768px) { padding: 0; border: 0; gap: 0; } } .footerBottomRow { max-width: var(--max-layout-width); margin: auto; display: flex; flex-direction: column; align-items: center; gap: 24px; padding: 24px 40px; font-size: 14px; line-height: 20px; text-align: center; @media (min-width: 768px) { gap: 0; padding: 0; flex-direction: row; justify-content: space-between; } } .hearthIcon { margin-right: 8px; path { fill: var(--color-text-muted); } } .footerTitle { font-size: 16px; font-weight: 700; line-height: 24px; } .footerList { margin: 0; li { margin-top: 16px; height: 28px; } } .footerLogo { width: fit-content; } ================================================ FILE: website/src/theme/MDXComponents/A.js ================================================ /* eslint-disable react/prop-types */ import Link from '@docusaurus/Link'; import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import React from 'react'; export default function 
MDXA(props) { const { siteConfig } = useDocusaurusContext(); if (props.href?.startsWith(siteConfig.url)) { const { href, ...rest } = props; rest.to = props.href.replace(siteConfig.url + siteConfig.baseUrl, '/'); props = rest; } return ; } ================================================ FILE: website/src/theme/Navbar/Content/index.js ================================================ import Link from '@docusaurus/Link'; import { useLocation } from '@docusaurus/router'; import { useThemeConfig } from '@docusaurus/theme-common'; import { splitNavbarItems, useNavbarMobileSidebar, } from '@docusaurus/theme-common/internal'; import NavbarLogo from '@theme/Navbar/Logo'; import NavbarMobileSidebarToggle from '@theme/Navbar/MobileSidebar/Toggle'; import NavbarSearch from '@theme/Navbar/Search'; import NavbarItem from '@theme/NavbarItem'; import SearchBar from '@theme/SearchBar'; import clsx from 'clsx'; import React from 'react'; import styles from './styles.module.css'; function useNavbarItems() { return useThemeConfig().navbar.items; } function NavbarItems({ items, className }) { return (
    {items.map((item, i) => ( ))}
    ); } function NavbarContentLayout({ left, right }) { return (
    {left}
    {right}
    );
}

// Extra navbar entry of type `docsVersionDropdown`, placed on the left, that
// lists two additional links (labelled '2.2' and '1.3') to docs hosted on
// sdk.apify.com after the regular dropdown entries.
const VERSIONS_ITEM = {
    type: 'docsVersionDropdown',
    position: 'left',
    label: 'Versions',
    dropdownItemsAfter: [
        {
            href: 'https://sdk.apify.com/docs/guides/getting-started',
            label: '2.2',
        },
        {
            href: 'https://sdk.apify.com/docs/1.3.1/guides/getting-started',
            label: '1.3',
        },
    ],
    dropdownItemsBefore: [],
};

/**
 * Navbar content: picks the navbar items to show and splits them into the
 * left and right groups.
 *
 * NOTE(review): the returned markup is only partially visible in this view,
 * so how the groups are laid out is not documented here.
 */
export default function NavbarContent() {
    const location = useLocation();
    const mobileSidebar = useNavbarMobileSidebar();
    const items = useNavbarItems();
    // Paths ending in '/python' or '/python/' use the configured items as-is;
    // every other path gets VERSIONS_ITEM appended.
    const effectiveItems = location.pathname?.endsWith('/python/') || location.pathname?.endsWith('/python')
        ? items
        : [...items, VERSIONS_ITEM];
    const [leftItems, rightItems] = splitNavbarItems(effectiveItems);
    // Looked up in the configured `items`, not in `effectiveItems`.
    const searchBarItem = items.find((item) => item.type === 'search');
    return ( } right={ <> {rightItems?.length > 0 && ( )} {!searchBarItem && ( )} Get started {!mobileSidebar.disabled && } } /> ); }

================================================
FILE: website/src/theme/Navbar/Content/styles.module.css
================================================
.navbarItems {
    display: flex;
    align-items: center;
    margin-inline: auto;
    gap: 16px;
}

.navbarItems__leftMargin {
    margin-left: 40px;
}

.getStartedButton {
    color: var(--color-text-on-primary);
    background: var(--color-black-action);
    border-radius: 8px;
    font-size: 16px;
    font-weight: 500;
    line-height: 24px;
    padding: 8px 16px !important;
    border: none;
    transition: background-color 0.2s;
    &:hover {
        color: var(--color-text-on-primary);
        background-color: var(--color-primary-action-hover);
    }
}

================================================
FILE: website/src/theme/Navbar/Logo/index.js
================================================
import Link from '@docusaurus/Link';
import { useThemeConfig } from '@docusaurus/theme-common';
import useBaseUrl from '@docusaurus/useBaseUrl';
import Logo from '@theme/Logo';
import ThemedImage from '@theme/ThemedImage';
import React from 'react';

import styles from './index.module.css';

export default function LogoWrapper(props) {
    const
ArrowsIcon = require('../../../../static/img/menu-arrows.svg').default; const CheckIcon = require('../../../../static/img/check.svg').default; const { navbar: { logo } } = useThemeConfig(); const javascriptLogo = { light: useBaseUrl('img/crawlee-javascript-light.svg'), dark: useBaseUrl('img/crawlee-javascript-dark.svg'), }; const languageAgnosticLogo = { light: useBaseUrl('img/crawlee-light.svg'), dark: useBaseUrl('img/crawlee-dark.svg'), }; const pythonLogo = { light: useBaseUrl(logo.src), dark: useBaseUrl(logo.srcDark || logo.src), }; return (
    ); } ================================================ FILE: website/src/theme/Navbar/Logo/index.module.css ================================================ .navbarLogo { position: relative; cursor: pointer; /* do not display the other theme logo when loading */ a { img:nth-child(2) { display: none !important; } } } .logoWithArrows { display: flex; align-items: center; width: 220px; svg { margin: 0 2px; g { stroke: var(--color-icon); } } img { display: block !important; height: 28px; } } .menuWrapper { position: absolute; left: 0; top: 100%; z-index: 100; padding-top: 6px; } .menu { width: 230px; border-radius: 8px; border: 1px solid var(--color-border); box-shadow: 0px 4px 8px 0px rgba(36, 39, 54, 0.12), 0px 2px 4px 0px rgba(36, 39, 54, 0.08), 0px 0px 1px 0px rgba(36, 39, 54, 0.24); background: var(--color-card-background); padding: 8px 0; overflow: hidden; transition: all 0.3s; flex-direction: column; align-items: flex-start; padding: 8px; display: none; img { height: 24px; width: auto; display: block !important; } } .navbarLogo:hover { .menu { display: flex; } } .menuItem { padding: 8px; width: 100%; border-radius: 12px; display: flex; justify-content: space-between; align-items: center; path { fill: var(--color-icon); } &:hover { background: var(--color-hover); } } ================================================ FILE: website/src/theme/Navbar/MobileSidebar/Header/index.js ================================================ import Link from '@docusaurus/Link'; import { useLocation } from '@docusaurus/router'; import { useNavbarMobileSidebar } from '@docusaurus/theme-common/internal'; import { translate } from '@docusaurus/Translate'; import IconClose from '@theme/Icon/Close'; import NavbarLogo from '@theme/Navbar/Logo'; import SearchBar from '@theme/SearchBar'; import clsx from 'clsx'; import React from 'react'; import styles from './index.module.css'; function CloseButton() { const mobileSidebar = useNavbarMobileSidebar(); return ( ); } export default function 
NavbarMobileSidebarHeader() { const { toggle, shown } = useNavbarMobileSidebar(); const closeSidebar = () => shown && toggle(); return (
    Get started
    Get started
    ); } ================================================ FILE: website/src/theme/Navbar/MobileSidebar/Header/index.module.css ================================================ .getStartedButton { color: var(--color-text-on-primary); background: var(--color-black-action); border-radius: 8px; font-size: 16px; font-weight: 500; line-height: 24px; padding: 8px 16px !important; border: none; &:hover { color: var(--color-text-on-primary); } text-align: center; } .navbarHeader { display: flex; width: 100%; align-items: center; justify-content: space-between; padding: 16px; @media (min-width: 768px) { padding: 20px 40px; } @media (min-width: 1024px) { padding: 20px 64px; } } .navbarButtonsWrapper { display: flex; gap: 16px; margin-left: auto; } .navbarButtonsWrapperDesktop { display: flex; @media (max-width: 767px) { display: none; } } .navbarButtonsWrapperMobile { border-top: 1px solid var(--color-separator); display: none; @media (max-width: 767px) { display: flex; } width: 100%; margin: 0; flex-direction: column; gap: 16px; button { width: 100%; } padding: 16px 24px; } ================================================ FILE: website/src/theme/Navbar/MobileSidebar/Layout/index.js ================================================ import { useNavbarSecondaryMenu } from '@docusaurus/theme-common/internal'; import clsx from 'clsx'; import React from 'react'; export default function NavbarMobileSidebarLayout({ header, primaryMenu, secondaryMenu, }) { const { shown: secondaryMenuShown } = useNavbarSecondaryMenu(); return (
    {header}
    {primaryMenu}
    {secondaryMenu}
    ); } ================================================ FILE: website/src/theme/Navbar/MobileSidebar/PrimaryMenu/index.js ================================================ import { useThemeConfig } from '@docusaurus/theme-common'; import { useNavbarMobileSidebar } from '@docusaurus/theme-common/internal'; import NavbarItem from '@theme/NavbarItem'; import React from 'react'; function useNavbarItems() { return useThemeConfig().navbar.items; } // The primary menu displays the navbar items export default function NavbarMobilePrimaryMenu() { const mobileSidebar = useNavbarMobileSidebar(); const items = useNavbarItems(); return (
      {items.map((item, i) => ( mobileSidebar.toggle()} key={i} /> ))}
    ); } ================================================ FILE: website/src/theme/Navbar/MobileSidebar/index.js ================================================ import { useLockBodyScroll, useNavbarMobileSidebar, useWindowSize, } from '@docusaurus/theme-common/internal'; import NavbarMobileSidebarHeader from '@theme/Navbar/MobileSidebar/Header'; import NavbarMobileSidebarLayout from '@theme/Navbar/MobileSidebar/Layout'; import NavbarMobileSidebarPrimaryMenu from '@theme/Navbar/MobileSidebar/PrimaryMenu'; import NavbarMobileSidebarSecondaryMenu from '@theme/Navbar/MobileSidebar/SecondaryMenu'; import React from 'react'; export default function NavbarMobileSidebar() { const mobileSidebar = useNavbarMobileSidebar(); const windowSize = useWindowSize({ desktopBreakpoint: 1200, }); useLockBodyScroll(mobileSidebar.shown); const shouldRender = !mobileSidebar.disabled && windowSize === 'mobile'; if (!shouldRender) { return null; } return ( } primaryMenu={} secondaryMenu={} /> ); } ================================================ FILE: website/src/theme/NavbarItem/ComponentTypes.js ================================================ import { useActiveDocContext, useLayoutDoc } from '@docusaurus/plugin-content-docs/client'; import DefaultNavbarItem from '@theme/NavbarItem/DefaultNavbarItem'; import DocSidebarNavbarItem from '@theme/NavbarItem/DocSidebarNavbarItem'; import DocsVersionDropdownNavbarItem from '@theme/NavbarItem/DocsVersionDropdownNavbarItem'; import DocsVersionNavbarItem from '@theme/NavbarItem/DocsVersionNavbarItem'; import DropdownNavbarItem from '@theme/NavbarItem/DropdownNavbarItem'; import HtmlNavbarItem from '@theme/NavbarItem/HtmlNavbarItem'; import LocaleDropdownNavbarItem from '@theme/NavbarItem/LocaleDropdownNavbarItem'; import SearchNavbarItem from '@theme/NavbarItem/SearchNavbarItem'; import React from 'react'; // const versions = require('../../../versions.json'); // const stable = versions[0]; function DocNavbarItem({ docId, label: staticLabel, 
docsPluginId, ...props }) { const { activeDoc } = useActiveDocContext(docsPluginId); const doc = useLayoutDoc(docId, docsPluginId); // Draft items are not displayed in the navbar. if (doc === null) { return null; } return ( activeDoc?.path.startsWith(doc.path)} label={staticLabel ?? doc.id} to={doc.path} /> ); } function ApiNavbarItem(ctx) { return ( ); // let version = {}; // // try { // // eslint-disable-next-line react-hooks/rules-of-hooks // version = useDocsVersion(); // } catch { // version.version = stable; // } // // const { siteConfig } = useDocusaurusContext(); // // if (siteConfig.presets[0][1].docs.disableVersioning || version.version === stable) { // return ( // // ); // } // // return ( // // ); } const ComponentTypes = { 'default': DefaultNavbarItem, 'localeDropdown': LocaleDropdownNavbarItem, 'search': SearchNavbarItem, 'dropdown': DropdownNavbarItem, 'html': HtmlNavbarItem, 'custom-api': ApiNavbarItem, 'doc': DocNavbarItem, 'docSidebar': DocSidebarNavbarItem, 'docsVersion': DocsVersionNavbarItem, 'docsVersionDropdown': DocsVersionDropdownNavbarItem, }; export default ComponentTypes; ================================================ FILE: website/static/.nojekyll ================================================ ================================================ FILE: website/static/js/custom.js ================================================ function load() { const versions = document.querySelectorAll('.navbar .dropdown ul a'); const basePath = ''; const types = [`${basePath}/docs/next`, `${basePath}/docs`]; let i = 0; for (const el of versions) { const match = el.href.match(/\/docs\/(\d+\.\d+(\.\d+)?)$/) || el.href.match(/\/docs\/(\d+\.\d+(\.\d+)?)/); if (!types[i++] && !match) { continue; } const version = (types[i++] || match[0]).replace('/docs', '/api'); if (el.classList.contains('api-version-bound')) { continue; } el.addEventListener('click', (e) => { if (version && window.location.pathname.startsWith(`${basePath}/api`)) { window.location.href = 
version; e.preventDefault(); } }); el.classList.add('api-version-bound'); } } setInterval(() => { if (document.querySelectorAll('.navbar .dropdown ul a').length > 0) { load(); } }, 500); if (window.location.href.startsWith('https://apify.github.io/crawlee-python')) { window.location.href = window.location.href.replace('https://apify.github.io/crawlee-python', 'https://crawlee.dev/python'); } if (window.location.href.startsWith('https://crawlee.dev/crawlee-python')) { window.location.href = window.location.href.replace('https://crawlee.dev/crawlee-python', 'https://crawlee.dev/python'); } ================================================ FILE: website/static/robots.txt ================================================ User-agent: * Sitemap: https://crawlee.dev/python/sitemap.xml ================================================ FILE: website/tools/docs-prettier.config.js ================================================ /** * @type {import('prettier').Options} */ module.exports = { parser: 'markdown', arrowParens: 'avoid', trailingComma: 'all', singleQuote: true, tabWidth: 4, printWidth: 150, proseWrap: 'always', }; ================================================ FILE: website/tools/utils/externalLink.js ================================================ const { parse } = require('url'); const visit = import('unist-util-visit').then((m) => m.visit); const internalUrls = ['crawlee.dev']; /** * @param {import('url').UrlWithStringQuery} href */ function isInternal(href) { return internalUrls.some( (internalUrl) => href.host === internalUrl || (!href.protocol && !href.host && (href.pathname || href.hash)), ); } /** * @type {import('unified').Plugin} */ exports.externalLinkProcessor = () => { return async (tree) => { (await visit)(tree, 'element', (node) => { if ( node.tagName === 'a' && node.properties && typeof node.properties.href === 'string' ) { const href = parse(node.properties.href); if (!isInternal(href)) { node.properties.target = '_blank'; node.properties.rel = 
'noopener'; } else { node.properties.target = null; node.properties.rel = null; } } }); }; }; ================================================ FILE: website/tools/website_gif/website_gif.mjs ================================================ /** * How to generate the gifs: * * 1. Set a breakpoint on the marked line * 2. Run the crawler with the debugger * 3. Setup your chrome and recording * 4. Resume, record, ???, profit! */ import { PuppeteerCrawler, sleep } from 'crawlee'; /* Non-headless crawler limited to one concurrent request. NOTE(review): the very large timeout values presumably keep a debugger-paused run from timing out; confirm. */ const crawler = new PuppeteerCrawler({ headless: false, maxConcurrency: 1, navigationTimeoutSecs: 100000, requestHandlerTimeoutSecs: 10000, browserPoolOptions: { closeInactiveBrowserAfterSecs: 100000, operationTimeoutSecs: 100000, }, async requestHandler({ request }) { if (request.userData.label === 'start') { console.log('Waiting 5s, prepare recording!'); await sleep(5000); // <--- Set breakpoint here } else { await sleep(250); } }, }); /* The same six URLs are queued twice under distinct uniqueKey values ('dark-*' and 'light th-*'); the 'start' entries trigger the 5 second pause in the request handler. */ await crawler.run([ { url: 'https://crawlee.dev', userData: { label: 'start' }, uniqueKey: 'dark-start' }, { url: 'https://crawlee.dev/docs/quick-start', uniqueKey: 'dark-1' }, { url: 'https://crawlee.dev/docs/introduction/setting-up', uniqueKey: 'dark-2' }, { url: 'https://crawlee.dev/docs/introduction/first-crawler', uniqueKey: 'dark-3' }, { url: 'https://crawlee.dev/docs/introduction/adding-urls', uniqueKey: 'dark-4' }, { url: 'https://crawlee.dev/docs/introduction/real-world-project', uniqueKey: 'dark-5' }, // Light theme { url: 'https://crawlee.dev', userData: { label: 'start' }, uniqueKey: 'light th-start' }, { url: 'https://crawlee.dev/docs/quick-start', uniqueKey: 'light th-1' }, { url: 'https://crawlee.dev/docs/introduction/setting-up', uniqueKey: 'light th-2' }, { url: 'https://crawlee.dev/docs/introduction/first-crawler', uniqueKey: 'light th-3' }, { url: 'https://crawlee.dev/docs/introduction/adding-urls', uniqueKey: 'light th-4' }, { url: 'https://crawlee.dev/docs/introduction/real-world-project', uniqueKey: 'light th-5' } ]);
================================================ FILE: website/tsconfig.eslint.json ================================================ { "extends": "@apify/tsconfig", "compilerOptions": { "jsx": "preserve" }, "include": [ "src/**/*.js", "src/**/*.ts", "src/**/*.jsx", "src/**/*.tsx" ] }