Showing preview only (2,817K chars total). Download the full file or copy to clipboard to get everything.
Repository: apify/crawlee-python
Branch: master
Commit: 9becf12908f8
Files: 635
Total size: 2.6 MB
Directory structure:
gitextract_o1cy5s8w/
├── .editorconfig
├── .github/
│ ├── CODEOWNERS
│ ├── pull_request_template.md
│ └── workflows/
│ ├── _check_code.yaml
│ ├── _check_docs.yaml
│ ├── _release_docs.yaml
│ ├── _tests.yaml
│ ├── manual_release_stable.yaml
│ ├── on_issue.yaml
│ ├── on_master.yaml
│ ├── on_pull_request.yaml
│ └── on_schedule_tests.yaml
├── .gitignore
├── .markdownlint.yaml
├── .pre-commit-config.yaml
├── .rules.md
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── codecov.yaml
├── docs/
│ ├── deployment/
│ │ ├── apify_platform.mdx
│ │ ├── aws_lambda.mdx
│ │ ├── code_examples/
│ │ │ ├── apify/
│ │ │ │ ├── crawler_as_actor_example.py
│ │ │ │ ├── get_public_url.py
│ │ │ │ ├── log_with_config_example.py
│ │ │ │ ├── proxy_advanced_example.py
│ │ │ │ └── proxy_example.py
│ │ │ ├── aws/
│ │ │ │ ├── beautifulsoup_crawler_lambda.py
│ │ │ │ ├── playwright_crawler_lambda.py
│ │ │ │ └── playwright_dockerfile
│ │ │ └── google/
│ │ │ ├── cloud_run_example.py
│ │ │ └── google_example.py
│ │ ├── google_cloud.mdx
│ │ └── google_cloud_run.mdx
│ ├── examples/
│ │ ├── add_data_to_dataset.mdx
│ │ ├── beautifulsoup_crawler.mdx
│ │ ├── capture_screenshot_using_playwright.mdx
│ │ ├── capturing_page_snapshots_with_error_snapshotter.mdx
│ │ ├── code_examples/
│ │ │ ├── adaptive_playwright_crawler.py
│ │ │ ├── add_data_to_dataset_bs.py
│ │ │ ├── add_data_to_dataset_dataset.py
│ │ │ ├── add_data_to_dataset_pw.py
│ │ │ ├── beautifulsoup_crawler.py
│ │ │ ├── beautifulsoup_crawler_keep_alive.py
│ │ │ ├── beautifulsoup_crawler_stop.py
│ │ │ ├── capture_screenshot_using_playwright.py
│ │ │ ├── configure_json_logging.py
│ │ │ ├── crawl_all_links_on_website_bs.py
│ │ │ ├── crawl_all_links_on_website_pw.py
│ │ │ ├── crawl_multiple_urls_bs.py
│ │ │ ├── crawl_multiple_urls_pw.py
│ │ │ ├── crawl_specific_links_on_website_bs.py
│ │ │ ├── crawl_specific_links_on_website_pw.py
│ │ │ ├── crawl_website_with_relative_links_all_links.py
│ │ │ ├── crawl_website_with_relative_links_same_domain.py
│ │ │ ├── crawl_website_with_relative_links_same_hostname.py
│ │ │ ├── crawl_website_with_relative_links_same_origin.py
│ │ │ ├── export_entire_dataset_to_file_csv.py
│ │ │ ├── export_entire_dataset_to_file_json.py
│ │ │ ├── extract_and_add_specific_links_on_website_bs.py
│ │ │ ├── extract_and_add_specific_links_on_website_pw.py
│ │ │ ├── fill_and_submit_web_form_crawler.py
│ │ │ ├── fill_and_submit_web_form_request.py
│ │ │ ├── parsel_crawler.py
│ │ │ ├── parsel_crawler_with_error_snapshotter.py
│ │ │ ├── playwright_block_requests.py
│ │ │ ├── playwright_crawler.py
│ │ │ ├── playwright_crawler_with_camoufox.py
│ │ │ ├── playwright_crawler_with_error_snapshotter.py
│ │ │ ├── playwright_crawler_with_fingerprint_generator.py
│ │ │ ├── respect_robots_on_skipped_request.py
│ │ │ ├── respect_robots_txt_file.py
│ │ │ ├── resuming_paused_crawl.py
│ │ │ ├── run_parallel_crawlers.py
│ │ │ ├── using_browser_profiles_chrome.py
│ │ │ ├── using_browser_profiles_firefox.py
│ │ │ └── using_sitemap_request_loader.py
│ │ ├── crawl_all_links_on_website.mdx
│ │ ├── crawl_multiple_urls.mdx
│ │ ├── crawl_specific_links_on_website.mdx
│ │ ├── crawl_website_with_relative_links.mdx
│ │ ├── crawler_keep_alive.mdx
│ │ ├── crawler_stop.mdx
│ │ ├── export_entire_dataset_to_file.mdx
│ │ ├── fill_and_submit_web_form.mdx
│ │ ├── json_logging.mdx
│ │ ├── parsel_crawler.mdx
│ │ ├── playwright_crawler.mdx
│ │ ├── playwright_crawler_adaptive.mdx
│ │ ├── playwright_crawler_with_block_requests.mdx
│ │ ├── playwright_crawler_with_camoufox.mdx
│ │ ├── playwright_crawler_with_fingerprint_generator.mdx
│ │ ├── respect_robots_txt_file.mdx
│ │ ├── resuming_paused_crawl.mdx
│ │ ├── run_parallel_crawlers.mdx
│ │ ├── using_browser_profile.mdx
│ │ └── using_sitemap_request_loader.mdx
│ ├── guides/
│ │ ├── architecture_overview.mdx
│ │ ├── avoid_blocking.mdx
│ │ ├── code_examples/
│ │ │ ├── avoid_blocking/
│ │ │ │ ├── default_fingerprint_generator_with_args.py
│ │ │ │ └── playwright_with_fingerprint_generator.py
│ │ │ ├── creating_web_archive/
│ │ │ │ ├── manual_archiving_parsel_crawler.py
│ │ │ │ ├── manual_archiving_playwright_crawler.py
│ │ │ │ └── simple_pw_through_proxy_pywb_server.py
│ │ │ ├── error_handling/
│ │ │ │ ├── change_handle_error_status.py
│ │ │ │ ├── disable_retry.py
│ │ │ │ └── handle_proxy_error.py
│ │ │ ├── http_clients/
│ │ │ │ ├── parsel_curl_impersonate_example.py
│ │ │ │ ├── parsel_httpx_example.py
│ │ │ │ └── parsel_impit_example.py
│ │ │ ├── http_crawlers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── beautifulsoup_example.py
│ │ │ │ ├── custom_crawler_example.py
│ │ │ │ ├── http_example.py
│ │ │ │ ├── lexbor_parser.py
│ │ │ │ ├── lxml_parser.py
│ │ │ │ ├── lxml_saxonche_parser.py
│ │ │ │ ├── parsel_example.py
│ │ │ │ ├── pyquery_parser.py
│ │ │ │ ├── scrapling_parser.py
│ │ │ │ ├── selectolax_adaptive_run.py
│ │ │ │ ├── selectolax_context.py
│ │ │ │ ├── selectolax_crawler.py
│ │ │ │ ├── selectolax_crawler_run.py
│ │ │ │ └── selectolax_parser.py
│ │ │ ├── login_crawler/
│ │ │ │ ├── http_login.py
│ │ │ │ └── playwright_login.py
│ │ │ ├── playwright_crawler/
│ │ │ │ ├── browser_configuration_example.py
│ │ │ │ ├── browser_pool_page_hooks_example.py
│ │ │ │ ├── multiple_launch_example.py
│ │ │ │ ├── navigation_hooks_example.py
│ │ │ │ └── plugin_browser_configuration_example.py
│ │ │ ├── playwright_crawler_adaptive/
│ │ │ │ ├── handler.py
│ │ │ │ ├── init_beautifulsoup.py
│ │ │ │ ├── init_parsel.py
│ │ │ │ ├── init_prediction.py
│ │ │ │ └── pre_nav_hooks.py
│ │ │ ├── playwright_crawler_stagehand/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── browser_classes.py
│ │ │ │ ├── stagehand_run.py
│ │ │ │ └── support_classes.py
│ │ │ ├── proxy_management/
│ │ │ │ ├── inspecting_bs_example.py
│ │ │ │ ├── inspecting_pw_example.py
│ │ │ │ ├── integration_bs_example.py
│ │ │ │ ├── integration_pw_example.py
│ │ │ │ ├── quick_start_example.py
│ │ │ │ ├── session_bs_example.py
│ │ │ │ ├── session_pw_example.py
│ │ │ │ ├── tiers_bs_example.py
│ │ │ │ └── tiers_pw_example.py
│ │ │ ├── request_loaders/
│ │ │ │ ├── rl_basic_example.py
│ │ │ │ ├── rl_basic_example_with_persist.py
│ │ │ │ ├── rl_tandem_example.py
│ │ │ │ ├── rl_tandem_example_explicit.py
│ │ │ │ ├── sitemap_basic_example.py
│ │ │ │ ├── sitemap_example_with_persist.py
│ │ │ │ ├── sitemap_tandem_example.py
│ │ │ │ └── sitemap_tandem_example_explicit.py
│ │ │ ├── request_router/
│ │ │ │ ├── adaptive_crawler_handlers.py
│ │ │ │ ├── basic_request_handlers.py
│ │ │ │ ├── custom_router_default_only.py
│ │ │ │ ├── error_handler.py
│ │ │ │ ├── failed_request_handler.py
│ │ │ │ ├── http_pre_navigation.py
│ │ │ │ ├── playwright_pre_navigation.py
│ │ │ │ └── simple_default_handler.py
│ │ │ ├── running_in_web_server/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── crawler.py
│ │ │ │ └── server.py
│ │ │ ├── scaling_crawlers/
│ │ │ │ ├── max_tasks_per_minute_example.py
│ │ │ │ └── min_and_max_concurrency_example.py
│ │ │ ├── service_locator/
│ │ │ │ ├── service_conflicts.py
│ │ │ │ ├── service_crawler_configuration.py
│ │ │ │ ├── service_crawler_event_manager.py
│ │ │ │ ├── service_crawler_storage_client.py
│ │ │ │ ├── service_locator_configuration.py
│ │ │ │ ├── service_locator_event_manager.py
│ │ │ │ ├── service_locator_storage_client.py
│ │ │ │ ├── service_storage_configuration.py
│ │ │ │ └── service_storage_storage_client.py
│ │ │ ├── session_management/
│ │ │ │ ├── multi_sessions_http.py
│ │ │ │ ├── one_session_http.py
│ │ │ │ ├── sm_basic.py
│ │ │ │ ├── sm_beautifulsoup.py
│ │ │ │ ├── sm_http.py
│ │ │ │ ├── sm_parsel.py
│ │ │ │ ├── sm_playwright.py
│ │ │ │ └── sm_standalone.py
│ │ │ ├── storage_clients/
│ │ │ │ ├── custom_storage_client_example.py
│ │ │ │ ├── file_system_storage_client_basic_example.py
│ │ │ │ ├── file_system_storage_client_configuration_example.py
│ │ │ │ ├── memory_storage_client_basic_example.py
│ │ │ │ ├── redis_storage_client_basic_example.py
│ │ │ │ ├── redis_storage_client_configuration_example.py
│ │ │ │ ├── registering_storage_clients_example.py
│ │ │ │ ├── sql_storage_client_basic_example.py
│ │ │ │ └── sql_storage_client_configuration_example.py
│ │ │ ├── storages/
│ │ │ │ ├── cleaning_do_not_purge_example.py
│ │ │ │ ├── cleaning_purge_explicitly_example.py
│ │ │ │ ├── dataset_basic_example.py
│ │ │ │ ├── dataset_with_crawler_example.py
│ │ │ │ ├── dataset_with_crawler_explicit_example.py
│ │ │ │ ├── helper_add_requests_example.py
│ │ │ │ ├── helper_enqueue_links_example.py
│ │ │ │ ├── kvs_basic_example.py
│ │ │ │ ├── kvs_with_crawler_example.py
│ │ │ │ ├── kvs_with_crawler_explicit_example.py
│ │ │ │ ├── opening.py
│ │ │ │ ├── rq_basic_example.py
│ │ │ │ ├── rq_with_crawler_example.py
│ │ │ │ └── rq_with_crawler_explicit_example.py
│ │ │ └── trace_and_monitor_crawlers/
│ │ │ └── instrument_crawler.py
│ │ ├── crawler_login.mdx
│ │ ├── creating_web_archive.mdx
│ │ ├── error_handling.mdx
│ │ ├── http_clients.mdx
│ │ ├── http_crawlers.mdx
│ │ ├── playwright_crawler.mdx
│ │ ├── playwright_crawler_adaptive.mdx
│ │ ├── playwright_crawler_stagehand.mdx
│ │ ├── proxy_management.mdx
│ │ ├── request_loaders.mdx
│ │ ├── request_router.mdx
│ │ ├── running_in_web_server.mdx
│ │ ├── scaling_crawlers.mdx
│ │ ├── service_locator.mdx
│ │ ├── session_management.mdx
│ │ ├── storage_clients.mdx
│ │ ├── storages.mdx
│ │ └── trace_and_monitor_crawlers.mdx
│ ├── introduction/
│ │ ├── 01_setting_up.mdx
│ │ ├── 02_first_crawler.mdx
│ │ ├── 03_adding_more_urls.mdx
│ │ ├── 04_real_world_project.mdx
│ │ ├── 05_crawling.mdx
│ │ ├── 06_scraping.mdx
│ │ ├── 07_saving_data.mdx
│ │ ├── 08_refactoring.mdx
│ │ ├── 09_running_in_cloud.mdx
│ │ ├── code_examples/
│ │ │ ├── 02_bs.py
│ │ │ ├── 02_bs_better.py
│ │ │ ├── 02_request_queue.py
│ │ │ ├── 03_enqueue_strategy.py
│ │ │ ├── 03_finding_new_links.py
│ │ │ ├── 03_globs.py
│ │ │ ├── 03_original_code.py
│ │ │ ├── 03_transform_request.py
│ │ │ ├── 04_sanity_check.py
│ │ │ ├── 05_crawling_detail.py
│ │ │ ├── 05_crawling_listing.py
│ │ │ ├── 06_scraping.py
│ │ │ ├── 07_final_code.py
│ │ │ ├── 07_first_code.py
│ │ │ ├── 08_main.py
│ │ │ ├── 08_routes.py
│ │ │ ├── 09_apify_sdk.py
│ │ │ ├── __init__.py
│ │ │ └── routes.py
│ │ └── index.mdx
│ ├── pyproject.toml
│ ├── quick-start/
│ │ ├── code_examples/
│ │ │ ├── beautifulsoup_crawler_example.py
│ │ │ ├── parsel_crawler_example.py
│ │ │ ├── playwright_crawler_example.py
│ │ │ └── playwright_crawler_headful_example.py
│ │ └── index.mdx
│ └── upgrading/
│ ├── upgrading_to_v0x.md
│ └── upgrading_to_v1.md
├── pyproject.toml
├── renovate.json
├── src/
│ └── crawlee/
│ ├── __init__.py
│ ├── _autoscaling/
│ │ ├── __init__.py
│ │ ├── _types.py
│ │ ├── autoscaled_pool.py
│ │ ├── py.typed
│ │ ├── snapshotter.py
│ │ └── system_status.py
│ ├── _cli.py
│ ├── _consts.py
│ ├── _log_config.py
│ ├── _request.py
│ ├── _service_locator.py
│ ├── _types.py
│ ├── _utils/
│ │ ├── __init__.py
│ │ ├── blocked.py
│ │ ├── byte_size.py
│ │ ├── console.py
│ │ ├── context.py
│ │ ├── crypto.py
│ │ ├── docs.py
│ │ ├── file.py
│ │ ├── globs.py
│ │ ├── html_to_text.py
│ │ ├── models.py
│ │ ├── raise_if_too_many_kwargs.py
│ │ ├── recoverable_state.py
│ │ ├── recurring_task.py
│ │ ├── requests.py
│ │ ├── robots.py
│ │ ├── sitemap.py
│ │ ├── system.py
│ │ ├── time.py
│ │ ├── try_import.py
│ │ ├── urls.py
│ │ ├── wait.py
│ │ └── web.py
│ ├── browsers/
│ │ ├── __init__.py
│ │ ├── _browser_controller.py
│ │ ├── _browser_plugin.py
│ │ ├── _browser_pool.py
│ │ ├── _playwright_browser.py
│ │ ├── _playwright_browser_controller.py
│ │ ├── _playwright_browser_plugin.py
│ │ ├── _types.py
│ │ └── py.typed
│ ├── configuration.py
│ ├── crawlers/
│ │ ├── __init__.py
│ │ ├── _abstract_http/
│ │ │ ├── __init__.py
│ │ │ ├── _abstract_http_crawler.py
│ │ │ ├── _abstract_http_parser.py
│ │ │ ├── _http_crawling_context.py
│ │ │ └── py.typed
│ │ ├── _adaptive_playwright/
│ │ │ ├── __init__.py
│ │ │ ├── _adaptive_playwright_crawler.py
│ │ │ ├── _adaptive_playwright_crawler_statistics.py
│ │ │ ├── _adaptive_playwright_crawling_context.py
│ │ │ ├── _rendering_type_predictor.py
│ │ │ ├── _result_comparator.py
│ │ │ └── _utils.py
│ │ ├── _basic/
│ │ │ ├── __init__.py
│ │ │ ├── _basic_crawler.py
│ │ │ ├── _basic_crawling_context.py
│ │ │ ├── _context_pipeline.py
│ │ │ ├── _context_utils.py
│ │ │ ├── _logging_utils.py
│ │ │ └── py.typed
│ │ ├── _beautifulsoup/
│ │ │ ├── __init__.py
│ │ │ ├── _beautifulsoup_crawler.py
│ │ │ ├── _beautifulsoup_crawling_context.py
│ │ │ ├── _beautifulsoup_parser.py
│ │ │ ├── _utils.py
│ │ │ └── py.typed
│ │ ├── _http/
│ │ │ ├── __init__.py
│ │ │ ├── _http_crawler.py
│ │ │ └── _http_parser.py
│ │ ├── _parsel/
│ │ │ ├── __init__.py
│ │ │ ├── _parsel_crawler.py
│ │ │ ├── _parsel_crawling_context.py
│ │ │ ├── _parsel_parser.py
│ │ │ └── _utils.py
│ │ ├── _playwright/
│ │ │ ├── __init__.py
│ │ │ ├── _playwright_crawler.py
│ │ │ ├── _playwright_crawling_context.py
│ │ │ ├── _playwright_http_client.py
│ │ │ ├── _playwright_post_nav_crawling_context.py
│ │ │ ├── _playwright_pre_nav_crawling_context.py
│ │ │ ├── _types.py
│ │ │ └── _utils.py
│ │ ├── _types.py
│ │ └── py.typed
│ ├── errors.py
│ ├── events/
│ │ ├── __init__.py
│ │ ├── _event_manager.py
│ │ ├── _local_event_manager.py
│ │ ├── _types.py
│ │ └── py.typed
│ ├── fingerprint_suite/
│ │ ├── __init__.py
│ │ ├── _browserforge_adapter.py
│ │ ├── _consts.py
│ │ ├── _fingerprint_generator.py
│ │ ├── _header_generator.py
│ │ ├── _types.py
│ │ └── py.typed
│ ├── http_clients/
│ │ ├── __init__.py
│ │ ├── _base.py
│ │ ├── _curl_impersonate.py
│ │ ├── _httpx.py
│ │ └── _impit.py
│ ├── otel/
│ │ ├── __init__.py
│ │ └── crawler_instrumentor.py
│ ├── project_template/
│ │ ├── cookiecutter.json
│ │ ├── hooks/
│ │ │ ├── post_gen_project.py
│ │ │ └── pre_gen_project.py
│ │ ├── templates/
│ │ │ ├── main.py
│ │ │ ├── main_beautifulsoup.py
│ │ │ ├── main_parsel.py
│ │ │ ├── main_playwright.py
│ │ │ ├── main_playwright_camoufox.py
│ │ │ ├── main_playwright_chrome.py
│ │ │ ├── main_playwright_firefox.py
│ │ │ ├── main_playwright_webkit.py
│ │ │ ├── routes_beautifulsoup.py
│ │ │ ├── routes_parsel.py
│ │ │ └── routes_playwright.py
│ │ └── {{cookiecutter.project_name}}/
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ ├── requirements.txt
│ │ └── {{cookiecutter.__package_name}}/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── main.py
│ │ └── routes.py
│ ├── proxy_configuration.py
│ ├── py.typed
│ ├── request_loaders/
│ │ ├── __init__.py
│ │ ├── _request_list.py
│ │ ├── _request_loader.py
│ │ ├── _request_manager.py
│ │ ├── _request_manager_tandem.py
│ │ └── _sitemap_request_loader.py
│ ├── router.py
│ ├── sessions/
│ │ ├── __init__.py
│ │ ├── _cookies.py
│ │ ├── _models.py
│ │ ├── _session.py
│ │ ├── _session_pool.py
│ │ └── py.typed
│ ├── statistics/
│ │ ├── __init__.py
│ │ ├── _error_snapshotter.py
│ │ ├── _error_tracker.py
│ │ ├── _models.py
│ │ └── _statistics.py
│ ├── storage_clients/
│ │ ├── __init__.py
│ │ ├── _base/
│ │ │ ├── __init__.py
│ │ │ ├── _dataset_client.py
│ │ │ ├── _key_value_store_client.py
│ │ │ ├── _request_queue_client.py
│ │ │ ├── _storage_client.py
│ │ │ └── py.typed
│ │ ├── _file_system/
│ │ │ ├── __init__.py
│ │ │ ├── _dataset_client.py
│ │ │ ├── _key_value_store_client.py
│ │ │ ├── _request_queue_client.py
│ │ │ ├── _storage_client.py
│ │ │ ├── _utils.py
│ │ │ └── py.typed
│ │ ├── _memory/
│ │ │ ├── __init__.py
│ │ │ ├── _dataset_client.py
│ │ │ ├── _key_value_store_client.py
│ │ │ ├── _request_queue_client.py
│ │ │ ├── _storage_client.py
│ │ │ └── py.typed
│ │ ├── _redis/
│ │ │ ├── __init__.py
│ │ │ ├── _client_mixin.py
│ │ │ ├── _dataset_client.py
│ │ │ ├── _key_value_store_client.py
│ │ │ ├── _request_queue_client.py
│ │ │ ├── _storage_client.py
│ │ │ ├── _utils.py
│ │ │ ├── lua_scripts/
│ │ │ │ ├── atomic_bloom_add_requests.lua
│ │ │ │ ├── atomic_fetch_request.lua
│ │ │ │ ├── atomic_set_add_requests.lua
│ │ │ │ └── reclaim_stale_requests.lua
│ │ │ └── py.typed
│ │ ├── _sql/
│ │ │ ├── __init__.py
│ │ │ ├── _client_mixin.py
│ │ │ ├── _dataset_client.py
│ │ │ ├── _db_models.py
│ │ │ ├── _key_value_store_client.py
│ │ │ ├── _request_queue_client.py
│ │ │ ├── _storage_client.py
│ │ │ └── py.typed
│ │ ├── models.py
│ │ └── py.typed
│ └── storages/
│ ├── __init__.py
│ ├── _base.py
│ ├── _dataset.py
│ ├── _key_value_store.py
│ ├── _request_queue.py
│ ├── _storage_instance_manager.py
│ ├── _utils.py
│ └── py.typed
├── tests/
│ ├── __init__.py
│ ├── e2e/
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ └── project_template/
│ │ ├── test_static_crawlers_templates.py
│ │ └── utils.py
│ └── unit/
│ ├── README.md
│ ├── __init__.py
│ ├── _autoscaling/
│ │ ├── test_autoscaled_pool.py
│ │ ├── test_snapshotter.py
│ │ └── test_system_status.py
│ ├── _statistics/
│ │ ├── test_error_tracker.py
│ │ ├── test_periodic_logging.py
│ │ ├── test_persistence.py
│ │ ├── test_request_max_duration.py
│ │ └── test_request_processing_record.py
│ ├── _utils/
│ │ ├── test_byte_size.py
│ │ ├── test_console.py
│ │ ├── test_crypto.py
│ │ ├── test_file.py
│ │ ├── test_globs.py
│ │ ├── test_html_to_text.py
│ │ ├── test_measure_time.py
│ │ ├── test_raise_if_too_many_kwargs.py
│ │ ├── test_recurring_task.py
│ │ ├── test_requests.py
│ │ ├── test_robots.py
│ │ ├── test_shared_timeout.py
│ │ ├── test_sitemap.py
│ │ ├── test_system.py
│ │ ├── test_timedelta_ms.py
│ │ └── test_urls.py
│ ├── browsers/
│ │ ├── test_browser_pool.py
│ │ ├── test_playwright_browser.py
│ │ ├── test_playwright_browser_controller.py
│ │ └── test_playwright_browser_plugin.py
│ ├── conftest.py
│ ├── crawlers/
│ │ ├── _adaptive_playwright/
│ │ │ ├── test_adaptive_playwright_crawler.py
│ │ │ ├── test_adaptive_playwright_crawler_statistics.py
│ │ │ ├── test_adaptive_playwright_crawling_context.py
│ │ │ └── test_predictor.py
│ │ ├── _basic/
│ │ │ ├── test_basic_crawler.py
│ │ │ └── test_context_pipeline.py
│ │ ├── _beautifulsoup/
│ │ │ └── test_beautifulsoup_crawler.py
│ │ ├── _http/
│ │ │ └── test_http_crawler.py
│ │ ├── _parsel/
│ │ │ └── test_parsel_crawler.py
│ │ └── _playwright/
│ │ ├── test_playwright_crawler.py
│ │ └── test_utils.py
│ ├── events/
│ │ ├── test_event_manager.py
│ │ └── test_local_event_manager.py
│ ├── fingerprint_suite/
│ │ ├── test_adapters.py
│ │ └── test_header_generator.py
│ ├── http_clients/
│ │ ├── test_http_clients.py
│ │ └── test_httpx.py
│ ├── otel/
│ │ └── test_crawler_instrumentor.py
│ ├── proxy_configuration/
│ │ ├── test_new_proxy_info.py
│ │ └── test_tiers.py
│ ├── request_loaders/
│ │ ├── test_request_list.py
│ │ └── test_sitemap_request_loader.py
│ ├── server.py
│ ├── server_endpoints.py
│ ├── server_static/
│ │ └── test.js
│ ├── sessions/
│ │ ├── test_cookies.py
│ │ ├── test_models.py
│ │ ├── test_session.py
│ │ └── test_session_pool.py
│ ├── storage_clients/
│ │ ├── _file_system/
│ │ │ ├── test_fs_dataset_client.py
│ │ │ ├── test_fs_kvs_client.py
│ │ │ └── test_fs_rq_client.py
│ │ ├── _memory/
│ │ │ ├── test_memory_dataset_client.py
│ │ │ ├── test_memory_kvs_client.py
│ │ │ └── test_memory_rq_client.py
│ │ ├── _redis/
│ │ │ ├── test_redis_dataset_client.py
│ │ │ ├── test_redis_kvs_client.py
│ │ │ └── test_redis_rq_client.py
│ │ └── _sql/
│ │ ├── test_sql_dataset_client.py
│ │ ├── test_sql_kvs_client.py
│ │ └── test_sql_rq_client.py
│ ├── storages/
│ │ ├── conftest.py
│ │ ├── test_dataset.py
│ │ ├── test_key_value_store.py
│ │ ├── test_request_manager_tandem.py
│ │ ├── test_request_queue.py
│ │ └── test_storage_instance_manager.py
│ ├── test_cli.py
│ ├── test_configuration.py
│ ├── test_log_config.py
│ ├── test_router.py
│ ├── test_service_locator.py
│ └── utils.py
├── typos.toml
└── website/
├── .eslintrc.json
├── .yarnrc.yml
├── babel.config.js
├── build_api_reference.sh
├── docusaurus.config.js
├── generate_module_shortcuts.py
├── package.json
├── patches/
│ ├── @docusaurus+core+3.4.0.patch
│ └── @docusaurus+core+3.5.2.patch
├── roa-loader/
│ ├── index.js
│ └── package.json
├── sidebars.js
├── src/
│ ├── components/
│ │ ├── ApiLink.jsx
│ │ ├── Button.jsx
│ │ ├── Button.module.css
│ │ ├── CopyButton.jsx
│ │ ├── CopyButton.module.css
│ │ ├── Gradients.jsx
│ │ ├── Highlights.jsx
│ │ ├── Highlights.module.css
│ │ ├── Homepage/
│ │ │ ├── HomepageCliExample.jsx
│ │ │ ├── HomepageCliExample.module.css
│ │ │ ├── HomepageCtaSection.jsx
│ │ │ ├── HomepageCtaSection.module.css
│ │ │ ├── HomepageHeroSection.jsx
│ │ │ ├── HomepageHeroSection.module.css
│ │ │ ├── LanguageInfoWidget.jsx
│ │ │ ├── LanguageInfoWidget.module.css
│ │ │ ├── LanguageSwitch.jsx
│ │ │ ├── LanguageSwitch.module.css
│ │ │ ├── RiverSection.jsx
│ │ │ ├── RiverSection.module.css
│ │ │ ├── ThreeCardsWithIcon.jsx
│ │ │ └── ThreeCardsWithIcon.module.css
│ │ ├── LLMButtons.jsx
│ │ ├── LLMButtons.module.css
│ │ ├── RunnableCodeBlock.jsx
│ │ └── RunnableCodeBlock.module.css
│ ├── css/
│ │ └── custom.css
│ ├── pages/
│ │ ├── home_page_example.py
│ │ ├── index.js
│ │ └── index.module.css
│ ├── plugins/
│ │ └── docusaurus-plugin-segment/
│ │ ├── index.js
│ │ └── segment.js
│ └── theme/
│ ├── ColorModeToggle/
│ │ ├── index.js
│ │ └── styles.module.css
│ ├── DocItem/
│ │ ├── Content/
│ │ │ ├── index.js
│ │ │ └── styles.module.css
│ │ └── Layout/
│ │ ├── index.js
│ │ └── styles.module.css
│ ├── Footer/
│ │ ├── LinkItem/
│ │ │ ├── index.js
│ │ │ └── index.module.css
│ │ ├── index.js
│ │ └── index.module.css
│ ├── MDXComponents/
│ │ └── A.js
│ ├── Navbar/
│ │ ├── Content/
│ │ │ ├── index.js
│ │ │ └── styles.module.css
│ │ ├── Logo/
│ │ │ ├── index.js
│ │ │ └── index.module.css
│ │ └── MobileSidebar/
│ │ ├── Header/
│ │ │ ├── index.js
│ │ │ └── index.module.css
│ │ ├── Layout/
│ │ │ └── index.js
│ │ ├── PrimaryMenu/
│ │ │ └── index.js
│ │ └── index.js
│ └── NavbarItem/
│ └── ComponentTypes.js
├── static/
│ ├── .nojekyll
│ ├── js/
│ │ └── custom.js
│ └── robots.txt
├── tools/
│ ├── docs-prettier.config.js
│ ├── utils/
│ │ └── externalLink.js
│ └── website_gif/
│ └── website_gif.mjs
└── tsconfig.eslint.json
================================================
FILE CONTENTS
================================================
================================================
FILE: .editorconfig
================================================
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
[Makefile]
indent_style = tab
[{*.yaml,*.yml}]
indent_size = 2
================================================
FILE: .github/CODEOWNERS
================================================
# Documentation codeowner
/docs/*.md @TC-MO
/docs/*.mdx @TC-MO
================================================
FILE: .github/pull_request_template.md
================================================
### Description
<!-- The purpose of the PR, list of the changes, ... -->
- TODO
### Issues
<!-- If applicable, reference any related GitHub issues -->
- Closes: #TODO
### Testing
<!-- Describe the testing process for these changes -->
- TODO
### Checklist
- [ ] CI passed
================================================
FILE: .github/workflows/_check_code.yaml
================================================
name: Code checks
on:
# Runs when manually triggered from the GitHub UI.
workflow_dispatch:
# Runs when invoked by another workflow.
workflow_call:
permissions:
contents: read
jobs:
actions_lint_check:
name: Actions lint check
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Run actionlint
uses: rhysd/actionlint@v1.7.11
spell_check:
name: Spell check
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Check spelling with typos
uses: crate-ci/typos@v1
lint_check:
name: Lint check
uses: apify/workflows/.github/workflows/python_lint_check.yaml@main
with:
python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]'
type_check:
name: Type check
uses: apify/workflows/.github/workflows/python_type_check.yaml@main
with:
python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]'
================================================
FILE: .github/workflows/_check_docs.yaml
================================================
name: Doc checks
on:
# Runs when manually triggered from the GitHub UI.
workflow_dispatch:
# Runs when invoked by another workflow.
workflow_call:
permissions:
contents: read
jobs:
doc_checks:
name: Doc checks
uses: apify/workflows/.github/workflows/python_docs_check.yaml@main
================================================
FILE: .github/workflows/_release_docs.yaml
================================================
name: Doc release
on:
# Runs when manually triggered from the GitHub UI.
workflow_dispatch:
# Runs when invoked by another workflow.
workflow_call:
inputs:
ref:
required: true
type: string
permissions:
contents: read
env:
NODE_VERSION: 22
PYTHON_VERSION: 3.14
CHECKOUT_REF: ${{ inputs.ref || github.ref }}
jobs:
release_docs:
name: Doc release
environment:
name: github-pages
permissions:
contents: write
pages: write
id-token: write
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
with:
token: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}
ref: ${{ env.CHECKOUT_REF }}
- name: Set up Node
uses: actions/setup-node@v6
with:
node-version: ${{ env.NODE_VERSION }}
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Set up uv package manager
uses: astral-sh/setup-uv@v7
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Python dependencies
run: uv run poe install-dev
- name: Build Docusaurus docs
run: uv run poe build-docs
env:
APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }}
SEGMENT_TOKEN: ${{ secrets.SEGMENT_TOKEN }}
- name: Set up GitHub Pages
uses: actions/configure-pages@v5
- name: Upload GitHub Pages artifact
uses: actions/upload-pages-artifact@v4
with:
path: ./website/build
- name: Deploy artifact to GitHub Pages
uses: actions/deploy-pages@v4
- name: Invalidate CloudFront cache
run: |
gh workflow run invalidate-cloudfront.yml \
--repo apify/apify-docs-private \
--field deployment=crawlee-web
echo "✅ CloudFront cache invalidation workflow triggered successfully"
env:
GITHUB_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}
================================================
FILE: .github/workflows/_tests.yaml
================================================
name: Tests
on:
# Runs when manually triggered from the GitHub UI.
workflow_dispatch:
# Runs when invoked by another workflow.
workflow_call:
permissions:
contents: read
jobs:
unit_tests:
name: Unit tests
uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main
secrets: inherit
with:
python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]'
operating_systems: '["ubuntu-latest", "windows-latest", "macos-latest"]'
python_version_for_codecov: "3.14"
operating_system_for_codecov: ubuntu-latest
tests_concurrency: "8"
================================================
FILE: .github/workflows/manual_release_stable.yaml
================================================
name: Stable release
on:
# Runs when manually triggered from the GitHub UI, with options to specify the type of release.
workflow_dispatch:
inputs:
release_type:
description: Release type
required: true
type: choice
default: auto
options:
- auto
- custom
- patch
- minor
- major
custom_version:
description: The custom version to bump to (only for "custom" type)
required: false
type: string
default: ""
concurrency:
group: release
cancel-in-progress: false
permissions:
contents: read
jobs:
code_checks:
name: Code checks
uses: ./.github/workflows/_check_code.yaml
release_prepare:
name: Release prepare
needs: [code_checks]
runs-on: ubuntu-latest
outputs:
version_number: ${{ steps.release_prepare.outputs.version_number }}
tag_name: ${{ steps.release_prepare.outputs.tag_name }}
changelog: ${{ steps.release_prepare.outputs.changelog }}
release_notes: ${{ steps.release_prepare.outputs.release_notes }}
steps:
- uses: apify/workflows/git-cliff-release@main
name: Release prepare
id: release_prepare
with:
release_type: ${{ inputs.release_type }}
custom_version: ${{ inputs.custom_version }}
existing_changelog_path: CHANGELOG.md
changelog_update:
name: Changelog update
needs: [release_prepare]
permissions:
contents: write
uses: apify/workflows/.github/workflows/python_bump_and_update_changelog.yaml@main
with:
version_number: ${{ needs.release_prepare.outputs.version_number }}
changelog: ${{ needs.release_prepare.outputs.changelog }}
secrets: inherit
github_release:
name: GitHub release
needs: [release_prepare, changelog_update]
runs-on: ubuntu-latest
permissions:
contents: write
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: GitHub release
uses: softprops/action-gh-release@v2
with:
tag_name: ${{ needs.release_prepare.outputs.tag_name }}
name: ${{ needs.release_prepare.outputs.version_number }}
target_commitish: ${{ needs.changelog_update.outputs.changelog_commitish }}
body: ${{ needs.release_prepare.outputs.release_notes }}
pypi_publish:
name: PyPI publish
needs: [release_prepare, changelog_update]
runs-on: ubuntu-latest
permissions:
contents: write
id-token: write # Required for OIDC authentication.
environment:
name: pypi
url: https://pypi.org/project/crawlee
steps:
- name: Prepare distribution
uses: apify/workflows/prepare-pypi-distribution@main
with:
package_name: crawlee
is_prerelease: ""
version_number: ${{ needs.release_prepare.outputs.version_number }}
ref: ${{ needs.changelog_update.outputs.changelog_commitish }}
# Publishes the package to PyPI using the official PyPA GitHub action with OIDC authentication.
- name: Publish package to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
# TODO: add a job for publishing the package to Conda
# https://github.com/apify/crawlee-python/issues/104
doc_release:
name: Doc release
needs: [changelog_update, pypi_publish]
permissions:
contents: write
pages: write
id-token: write
uses: ./.github/workflows/_release_docs.yaml
with:
# Use the ref from the changelog update to include the updated changelog.
ref: ${{ needs.changelog_update.outputs.changelog_commitish }}
secrets: inherit
================================================
FILE: .github/workflows/on_issue.yaml
================================================
name: CI (issue)
on:
# Runs when a new issue is opened.
issues:
types:
- opened
permissions:
contents: read
jobs:
label_issues:
name: Add labels
runs-on: ubuntu-latest
permissions:
issues: write
steps:
# Add the "t-tooling" label to all new issues
- uses: actions/github-script@v8
with:
script: |
github.rest.issues.addLabels({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
labels: ["t-tooling"]
})
================================================
FILE: .github/workflows/on_master.yaml
================================================
name: CI (master)
on:
push:
branches:
- master
tags-ignore:
- "**" # Ignore all tags to avoid duplicate executions triggered by tag pushes.
concurrency:
group: release
cancel-in-progress: false
permissions:
contents: read
jobs:
doc_checks:
name: Doc checks
uses: ./.github/workflows/_check_docs.yaml
doc_release:
# Skip this for non-"docs" commits.
if: startsWith(github.event.head_commit.message, 'docs')
name: Doc release
needs: [doc_checks]
permissions:
contents: write
pages: write
id-token: write
uses: ./.github/workflows/_release_docs.yaml
with:
# Use the same ref as the one that triggered the workflow.
ref: ${{ github.ref }}
secrets: inherit
code_checks:
name: Code checks
uses: ./.github/workflows/_check_code.yaml
tests:
# Skip this for "docs" commits.
if: "!startsWith(github.event.head_commit.message, 'docs')"
name: Tests
uses: ./.github/workflows/_tests.yaml
secrets: inherit
release_prepare:
# Run this only for "feat", "fix", "perf", "refactor" and "style" commits.
if: >-
startsWith(github.event.head_commit.message, 'feat') ||
startsWith(github.event.head_commit.message, 'fix') ||
startsWith(github.event.head_commit.message, 'perf') ||
startsWith(github.event.head_commit.message, 'refactor') ||
startsWith(github.event.head_commit.message, 'style')
name: Release prepare
needs: [code_checks, tests]
runs-on: ubuntu-latest
outputs:
version_number: ${{ steps.release_prepare.outputs.version_number }}
tag_name: ${{ steps.release_prepare.outputs.tag_name }}
changelog: ${{ steps.release_prepare.outputs.changelog }}
steps:
- uses: apify/workflows/git-cliff-release@main
id: release_prepare
name: Release prepare
with:
release_type: prerelease
existing_changelog_path: CHANGELOG.md
changelog_update:
name: Changelog update
needs: [release_prepare]
permissions:
contents: write
uses: apify/workflows/.github/workflows/python_bump_and_update_changelog.yaml@main
with:
version_number: ${{ needs.release_prepare.outputs.version_number }}
changelog: ${{ needs.release_prepare.outputs.changelog }}
secrets: inherit
pypi_publish:
name: PyPI publish
needs: [release_prepare, changelog_update]
runs-on: ubuntu-latest
permissions:
contents: write
id-token: write # Required for OIDC authentication.
environment:
name: pypi
url: https://pypi.org/project/crawlee
steps:
- name: Prepare distribution
uses: apify/workflows/prepare-pypi-distribution@main
with:
package_name: crawlee
is_prerelease: "yes"
version_number: ${{ needs.release_prepare.outputs.version_number }}
ref: ${{ needs.changelog_update.outputs.changelog_commitish }}
- name: Publish package to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
doc_release_post_publish:
name: Doc release post publish
needs: [changelog_update, pypi_publish]
permissions:
contents: write
pages: write
id-token: write
uses: ./.github/workflows/_release_docs.yaml
with:
# Use the ref from the changelog update to include the updated changelog.
ref: ${{ needs.changelog_update.outputs.changelog_commitish }}
secrets: inherit
================================================
FILE: .github/workflows/on_pull_request.yaml
================================================
name: CI (PR)
on:
# Runs whenever a pull request is opened or updated.
pull_request:
permissions:
contents: read
pull-requests: read
jobs:
pr_title_check:
name: PR title check
runs-on: ubuntu-latest
steps:
- uses: amannn/action-semantic-pull-request@v6.1.1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
doc_checks:
name: Doc checks
uses: ./.github/workflows/_check_docs.yaml
code_checks:
name: Code checks
uses: ./.github/workflows/_check_code.yaml
tests:
name: Tests
uses: ./.github/workflows/_tests.yaml
secrets: inherit
================================================
FILE: .github/workflows/on_schedule_tests.yaml
================================================
name: Scheduled tests
on:
# Runs when manually triggered from the GitHub UI.
workflow_dispatch:
# Runs on a daily schedule at 06:00 UTC.
schedule:
- cron: '0 6 * * *'
concurrency:
group: scheduled-tests
cancel-in-progress: false
permissions:
contents: read
env:
NODE_VERSION: 22
PYTHON_VERSION: 3.14
TESTS_CONCURRENCY: 1
jobs:
end_to_end_tests:
name: End-to-end tests
strategy:
fail-fast: false
max-parallel: 12
matrix:
crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup"]
http-client: ["httpx", "curl_impersonate"]
package-manager: ["pip", "uv", "poetry"]
runs-on: "ubuntu-latest"
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Setup node
uses: actions/setup-node@v6
with:
node-version: ${{ env.NODE_VERSION }}
- name: Install dependencies
run: npm install -g apify-cli
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}
# Poetry is installed so that crawlee in poetry.lock can be patched with a custom wheel file for Poetry-based templates.
- name: Install poetry
run: pipx install poetry
- name: Set up uv package manager
uses: astral-sh/setup-uv@v7
with:
python-version: ${{ env.PYTHON_VERSION }}
# Sync the project; there is no need to install the browsers into the test runner environment.
- name: Install Python dependencies
run: uv run poe install-sync
- name: Run templates end-to-end tests
run: uv run poe e2e-templates-tests -m "${{ matrix.http-client }} and ${{ matrix.crawler-type }} and ${{ matrix.package-manager }}"
env:
APIFY_TEST_USER_API_TOKEN: ${{ secrets.APIFY_TEST_USER_API_TOKEN }}
================================================
FILE: .gitignore
================================================
# AI assistant files
.agent
.agents
.ai
.aider
.claude
.codeium
.continue
.copilot
.cursor
.gemini
.llm
.llms
.openai
.serena
.windsurf
.zed-ai
AGENTS.local.md
CLAUDE.local.md
GEMINI.local.md
# Cache
__pycache__
.pytest_cache
.ruff_cache
.ty_cache
.uv-cache
# Virtual envs
.direnv
.env
.envrc
.python-version
.venv
# Other Python tools
.ropeproject
# Mise
mise.toml
.mise.toml
# Egg and build artifacts
*.egg-info/
*.egg
dist/
build/
# Coverage reports
.coverage*
htmlcov
coverage-unit.xml
coverage-integration.xml
# IDE, editors
*~
.DS_Store
.idea
.nvim.lua
.vscode
.zed
Session.vim
# Docs
docs/changelog.md
# Website build artifacts, node dependencies
website/build
website/node_modules
website/.yarn
website/.docusaurus
website/api-typedoc-generated.json
website/apify-shared-docspec-dump.jsonl
website/docspec-dump.jsonl
website/module_shortcuts.json
website/typedoc-types*
# npm lockfile (we use yarn)
website/package-lock.json
# Default directory for memory storage
storage/
# Tmp dir
tmp/
================================================
FILE: .markdownlint.yaml
================================================
default: true
line-length:
line_length: 120
MD007:
indent: 4
MD004:
style: dash
no-inline-html: false
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: local
hooks:
- id: lint-check
name: Lint check
entry: uv run poe lint
language: system
pass_filenames: false
- id: type-check
name: Type check
entry: uv run poe type-check
language: system
pass_filenames: false
================================================
FILE: .rules.md
================================================
# Coding guidelines
This file provides guidance to programming agents when working with code in this repository.
## Development Commands
All commands use `uv` (package manager) and `poe` (task runner):
```bash
# Install all dependencies (dev + extras + pre-commit + playwright)
uv run poe install-dev
# Run full check suite (lint + type-check + unit tests)
uv run poe check-code
# Linting (ruff format check + ruff check)
uv run poe lint
# Auto-fix formatting
uv run poe format
# Type checking (ty)
uv run poe type-check
# Run all unit tests
uv run poe unit-tests
# Run a single test file
uv run pytest tests/unit/path/to/test_file.py
# Run a single test by name
uv run pytest tests/unit/path/to/test_file.py::test_name -v
# Run tests with coverage XML report
uv run poe unit-tests-cov
# Build package
uv run poe build
# Clean build artifacts
uv run poe clean
```
Note: `uv run poe unit-tests` first runs tests marked `@pytest.mark.run_alone` in isolation, then runs the rest with `-x` (fail-fast) and parallelism via `pytest-xdist`.
## Code Style
- **Linter/formatter**: Ruff with `select = ["ALL"]` and specific ignores
- **Line length**: 120 characters
- **Quotes**: Single quotes (double for docstrings)
- **Docstrings**: Google format (enforced by Ruff)
- **Type checker**: ty (Astral's type checker), target Python 3.10
- **Async mode**: pytest-asyncio in `auto` mode (no need for `@pytest.mark.asyncio`)
- **Commit format**: Conventional Commits (`feat:`, `fix:`, `docs:`, `refactor:`, `test:`, etc.)
## Architecture
### Crawler Hierarchy
```
BasicCrawler[TCrawlingContext, TStatisticsState]
├── AbstractHttpCrawler → HttpCrawler, BeautifulSoupCrawler, ParselCrawler
├── PlaywrightCrawler
└── AdaptivePlaywrightCrawler (extends PlaywrightCrawler)
```
- **BasicCrawler** (`src/crawlee/crawlers/_basic/`): Core request lifecycle, autoscaling pool, retries, session management, router dispatch. Generic over `TCrawlingContext`.
- **AbstractHttpCrawler** (`src/crawlee/crawlers/_abstract_http/`): Adds HTTP client integration, response parsing, pre-navigation hooks. Generic over parser result type.
- **PlaywrightCrawler** (`src/crawlee/crawlers/_playwright/`): Browser-based crawling with Playwright.
### Context Pipeline (Middleware Pattern)
Contexts are progressively enhanced through `ContextPipeline` middleware:
```
BasicCrawlingContext → HttpCrawlingContext → ParsedHttpCrawlingContext → BeautifulSoupCrawlingContext
```
Each middleware is an async generator that wraps the next handler, enabling setup/teardown around request processing.
### Storage Layer
Three-tier design:
- **High-level**: `Dataset`, `KeyValueStore`, `RequestQueue` in `src/crawlee/storages/`
- **Storage clients** (`src/crawlee/storage_clients/`): `FileSystemStorageClient` (default), `MemoryStorageClient`, `SqlStorageClient`, `RedisStorageClient`
- **Instance caching**: `StorageInstanceManager` is a global singleton that caches storage instances by ID/name
### Service Locator
`src/crawlee/_service_locator.py` is a global singleton managing `Configuration`, `EventManager`, `StorageClient`, and `StorageInstanceManager`. Prevents double-initialization with `ServiceConflictError`.
### HTTP Clients
Pluggable via `HttpClient` interface in `src/crawlee/http_clients/`:
- `ImpitHttpClient` (default), `HttpxHttpClient`, `CurlImpersonateHttpClient`
- Each provides `crawl()` (for crawler pipeline) and `send_request()` (for in-handler use)
### Request Model
`Request` (`src/crawlee/_request.py`) uses `unique_key` for deduplication. Lifecycle states: `UNPROCESSED → DONE`. Crawlee-specific metadata stored in `user_data['__crawlee']`.
### Router
```python
@crawler.router.default_handler
async def handler(context: BeautifulSoupCrawlingContext): ...
@crawler.router.handler(label='detail')
async def detail(context: BeautifulSoupCrawlingContext): ...
```
Requests are routed by their `label` field; unmatched requests go to the default handler.
### Key Directories
- `src/crawlee/crawlers/` - All crawler implementations
- `src/crawlee/storages/` - Dataset, KVS, RequestQueue
- `src/crawlee/storage_clients/` - Backend implementations
- `src/crawlee/http_clients/` - HTTP client implementations
- `src/crawlee/browsers/` - Playwright browser pool and plugins
- `src/crawlee/sessions/` - Session management with cookie persistence
- `src/crawlee/events/` - Event system (persist state, progress, aborting)
- `src/crawlee/_autoscaling/` - Autoscaled pool for concurrency control
- `src/crawlee/fingerprint_suite/` - Anti-bot fingerprint generation
- `src/crawlee/project_template/` - CLI scaffolding template (excluded from linting)
- `tests/unit/` - Unit tests
- `tests/e2e/` - End-to-end tests (require `apify-cli` + API token)
================================================
FILE: CHANGELOG.md
================================================
# Changelog
All notable changes to this project will be documented in this file.
## [1.6.0](https://github.com/apify/crawlee-python/releases/tag/v1.6.0) (2026-03-20)
### 🚀 Features
- Allow extracting and enqueuing non-href links ([#1781](https://github.com/apify/crawlee-python/pull/1781)) ([6db365d](https://github.com/apify/crawlee-python/commit/6db365d1625206d8d691256c9cd4b44a821238bb)) by [@kozlice](https://github.com/kozlice)
- Add `post_navigation_hooks` to crawlers ([#1795](https://github.com/apify/crawlee-python/pull/1795)) ([38ceda6](https://github.com/apify/crawlee-python/commit/38ceda635a18cb2f14efc7c8e8b67f3adb7e53fd)) by [@Mantisus](https://github.com/Mantisus)
- Add page lifecycle hooks to `BrowserPool` ([#1791](https://github.com/apify/crawlee-python/pull/1791)) ([6f2ac13](https://github.com/apify/crawlee-python/commit/6f2ac13fea4cfa8a65e6e41430d3e8d28cc3a787)) by [@Mantisus](https://github.com/Mantisus)
- Expose `BrowserType` and `CrawleePage` ([#1798](https://github.com/apify/crawlee-python/pull/1798)) ([b50b9f2](https://github.com/apify/crawlee-python/commit/b50b9f2a8396dcee2bd7eaf76c94d24912c2bc5f)) by [@Mantisus](https://github.com/Mantisus)
- Expose `use_state` in `BasicCrawler` ([#1799](https://github.com/apify/crawlee-python/pull/1799)) ([d121873](https://github.com/apify/crawlee-python/commit/d121873a7f5902b911dd04b4aa9eaf75a8449323)) by [@Mantisus](https://github.com/Mantisus)
### 🐛 Bug Fixes
- **redis:** Do not remove handled request data from request queue ([#1787](https://github.com/apify/crawlee-python/pull/1787)) ([3008c61](https://github.com/apify/crawlee-python/commit/3008c61dcbe07ccdf3c43f198b37582cc1356c9a)) by [@kozlice](https://github.com/kozlice)
- **redis:** Update actual `Request` state in request queue Redis storage client ([#1789](https://github.com/apify/crawlee-python/pull/1789)) ([787231c](https://github.com/apify/crawlee-python/commit/787231cebeb863ee2b4395964a79a37053dbec01)) by [@Mantisus](https://github.com/Mantisus)
## [1.5.0](https://github.com/apify/crawlee-python/releases/tag/v1.5.0) (2026-03-06)
### 🚀 Features
- Use specialized Playwright docker images in templates ([#1757](https://github.com/apify/crawlee-python/pull/1757)) ([747c0cf](https://github.com/apify/crawlee-python/commit/747c0cf4a82296a2e3ea5cac5ef4c9578ea62a0c)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1756](https://github.com/apify/crawlee-python/issues/1756)
- Add `discover_valid_sitemaps` utility ([#1777](https://github.com/apify/crawlee-python/pull/1777)) ([872447b](https://github.com/apify/crawlee-python/commit/872447b60bbdb3926068064a971492807b1bdfbb)) by [@Mantisus](https://github.com/Mantisus), closes [#1740](https://github.com/apify/crawlee-python/issues/1740)
### 🐛 Bug Fixes
- Prevent list modification during iteration in BrowserPool ([#1703](https://github.com/apify/crawlee-python/pull/1703)) ([70309d9](https://github.com/apify/crawlee-python/commit/70309d9bf568d268a26b3ba6392be2b6ff284c65)) by [@vdusek](https://github.com/vdusek)
- Fix `max_requests_per_crawl` excluding failed requests ([#1766](https://github.com/apify/crawlee-python/pull/1766)) ([d6bb0b4](https://github.com/apify/crawlee-python/commit/d6bb0b4a9dc5dd6668d076fbfa1b5e748deaee0d)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1765](https://github.com/apify/crawlee-python/issues/1765)
- **playwright:** Dispose of `APIResponse` body for `send_request` ([#1771](https://github.com/apify/crawlee-python/pull/1771)) ([29d301b](https://github.com/apify/crawlee-python/commit/29d301bf9d7795f2fbaddb99235a7157b880f60c)) by [@kozlice](https://github.com/kozlice)
- Return `None` from `add_request` when storage client fails to enqueue request ([#1775](https://github.com/apify/crawlee-python/pull/1775)) ([944753a](https://github.com/apify/crawlee-python/commit/944753a71956c30f3ce0896ffa24be7de5348933)) by [@Mantisus](https://github.com/Mantisus)
- Re-use pre-existing browser context in `PlaywrightBrowserController` ([#1778](https://github.com/apify/crawlee-python/pull/1778)) ([4487543](https://github.com/apify/crawlee-python/commit/44875433df83d433aa69ada458b91df3ad569f5e)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1776](https://github.com/apify/crawlee-python/issues/1776)
## [1.4.0](https://github.com/apify/crawlee-python/releases/tag/v1.4.0) (2026-02-17)
### 🚀 Features
- Dynamic memory snapshots ([#1715](https://github.com/apify/crawlee-python/pull/1715)) ([568a7b1](https://github.com/apify/crawlee-python/commit/568a7b186dedda19ad814ee8af3cd8e256cc4ad9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1704](https://github.com/apify/crawlee-python/issues/1704)
- Add `MySQL` and `MariaDB` support for `SqlStorageClient` ([#1749](https://github.com/apify/crawlee-python/pull/1749)) ([202b500](https://github.com/apify/crawlee-python/commit/202b5009ea5d35ea779eb5b8db1fc575f90ca7bb)) by [@Mantisus](https://github.com/Mantisus)
### 🐛 Bug Fixes
- Make log levels consistent in ServiceLocator ([#1746](https://github.com/apify/crawlee-python/pull/1746)) ([4163413](https://github.com/apify/crawlee-python/commit/4163413049485b035c38efd6a4a7d41502a44cfc)) by [@janbuchar](https://github.com/janbuchar)
- Fix `PlaywrightCrawler` unintentionally setting the global configuration ([#1747](https://github.com/apify/crawlee-python/pull/1747)) ([fa58438](https://github.com/apify/crawlee-python/commit/fa58438026eb72a6002c8d494725bf4e48b4407e)) by [@Pijukatel](https://github.com/Pijukatel)
- Fix `Snapshotter` handling of out of order samples ([#1735](https://github.com/apify/crawlee-python/pull/1735)) ([387c712](https://github.com/apify/crawlee-python/commit/387c712306055d901b1c0df4a9666967f039aefd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1734](https://github.com/apify/crawlee-python/issues/1734)
### ⚡ Performance
- Optimize metadata records processing in `SqlStorageClient` ([#1551](https://github.com/apify/crawlee-python/pull/1551)) ([df1347a](https://github.com/apify/crawlee-python/commit/df1347aacf05c05980000d15b36b65996119ea86)) by [@Mantisus](https://github.com/Mantisus), closes [#1533](https://github.com/apify/crawlee-python/issues/1533)
## [1.3.2](https://github.com/apify/crawlee-python/releases/tag/v1.3.2) (2026-02-09)
### 🐛 Bug Fixes
- Use `max()` instead of `min()` for `request_max_duration` statistic ([#1701](https://github.com/apify/crawlee-python/pull/1701)) ([85c4335](https://github.com/apify/crawlee-python/commit/85c43351a05ada1369b720061f6f1a7e158340b6)) by [@vdusek](https://github.com/vdusek)
- Prevent mutation of default URL patterns list in `block_requests` ([#1702](https://github.com/apify/crawlee-python/pull/1702)) ([fcf9adb](https://github.com/apify/crawlee-python/commit/fcf9adb6a0cfeaa87ca482372d4e066584eb28d6)) by [@vdusek](https://github.com/vdusek)
- Keep None values for `user_data` in `Request` ([#1707](https://github.com/apify/crawlee-python/pull/1707)) ([3c575bc](https://github.com/apify/crawlee-python/commit/3c575bc2b0f1c89c99d134ad3a3fa7455ccc6910)) by [@Mantisus](https://github.com/Mantisus), closes [#1706](https://github.com/apify/crawlee-python/issues/1706)
- Respect `max_open_pages_per_browser` limit for `PlaywrightBrowserController` on concurrent `new_page` calls ([#1712](https://github.com/apify/crawlee-python/pull/1712)) ([2e5534b](https://github.com/apify/crawlee-python/commit/2e5534b98913d5cbd6b721b2423d063772024417)) by [@Mantisus](https://github.com/Mantisus)
## [1.3.1](https://github.com/apify/crawlee-python/releases/tag/v1.3.1) (2026-01-30)
### 🐛 Bug Fixes
- Reset all counters in metadata with `purge` for `RequestQueue` ([#1686](https://github.com/apify/crawlee-python/pull/1686)) ([ee09260](https://github.com/apify/crawlee-python/commit/ee0926084589f1b6e15840b6185ec5433be3b72f)) by [@Mantisus](https://github.com/Mantisus), closes [#1682](https://github.com/apify/crawlee-python/issues/1682)
- Set default `http3=False` for `ImpitHttpClient` ([#1685](https://github.com/apify/crawlee-python/pull/1685)) ([3f390f6](https://github.com/apify/crawlee-python/commit/3f390f677540a3905038d7db6a6d1efad32fd045)) by [@Mantisus](https://github.com/Mantisus), closes [#1683](https://github.com/apify/crawlee-python/issues/1683)
- Prevent get_request from permanently blocking requests ([#1684](https://github.com/apify/crawlee-python/pull/1684)) ([da416f9](https://github.com/apify/crawlee-python/commit/da416f98fb453904d62e7d29d8f24611ffb3ba8d)) by [@Mirza-Samad-Ahmed-Baig](https://github.com/Mirza-Samad-Ahmed-Baig)
- Do not share state between different crawlers unless requested ([#1669](https://github.com/apify/crawlee-python/pull/1669)) ([64c246b](https://github.com/apify/crawlee-python/commit/64c246bedea14f86e607d23adc5bec644c578364)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1627](https://github.com/apify/crawlee-python/issues/1627)
## [1.3.0](https://github.com/apify/crawlee-python/releases/tag/v1.3.0) (2026-01-20)
### 🚀 Features
- Expose `AdaptivePlaywrightCrawlerStatisticState` for `AdaptivePlaywrightCrawler` ([#1635](https://github.com/apify/crawlee-python/pull/1635)) ([1bb4bcb](https://github.com/apify/crawlee-python/commit/1bb4bcb4ccbec347ad9c14f70e9e946d48e3c38e)) by [@Mantisus](https://github.com/Mantisus)
### 🐛 Bug Fixes
- Prevent race condition in concurrent storage creation ([#1626](https://github.com/apify/crawlee-python/pull/1626)) ([7f17a43](https://github.com/apify/crawlee-python/commit/7f17a4347d5884962767e757a92ec173688fed7b)) by [@Mantisus](https://github.com/Mantisus), closes [#1621](https://github.com/apify/crawlee-python/issues/1621)
- Create correct statistics for `AdaptivePlaywrightCrawler` on initialization with a custom parser ([#1637](https://github.com/apify/crawlee-python/pull/1637)) ([bff7260](https://github.com/apify/crawlee-python/commit/bff726055dd0d7e07a2c546b15cbee22abd85960)) by [@Mantisus](https://github.com/Mantisus), closes [#1630](https://github.com/apify/crawlee-python/issues/1630)
- Fix adding extra link for `EnqueueLinksFunction` with `limit` ([#1674](https://github.com/apify/crawlee-python/pull/1674)) ([71d7867](https://github.com/apify/crawlee-python/commit/71d7867b14f7f07cac06899f5da006091af4a954)) by [@Mantisus](https://github.com/Mantisus), closes [#1673](https://github.com/apify/crawlee-python/issues/1673)
## [1.2.1](https://github.com/apify/crawlee-python/releases/tag/v1.2.1) (2025-12-16)
### 🐛 Bug Fixes
- Fix short error summary ([#1605](https://github.com/apify/crawlee-python/pull/1605)) ([b751208](https://github.com/apify/crawlee-python/commit/b751208d9a56e9d923e4559baeba35e2eede0450)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1602](https://github.com/apify/crawlee-python/issues/1602)
- Freeze core `Request` fields ([#1603](https://github.com/apify/crawlee-python/pull/1603)) ([ae6d86b](https://github.com/apify/crawlee-python/commit/ae6d86b8c82900116032596201d94cd7875aaadc)) by [@Mantisus](https://github.com/Mantisus)
- Respect `enqueue_strategy` after redirects in `enqueue_links` ([#1607](https://github.com/apify/crawlee-python/pull/1607)) ([700df91](https://github.com/apify/crawlee-python/commit/700df91bc9be1299388030a3e48e4dbc6f5b85a0)) by [@Mantisus](https://github.com/Mantisus), closes [#1606](https://github.com/apify/crawlee-python/issues/1606)
- Protect `Request` from partial mutations on request handler failure ([#1585](https://github.com/apify/crawlee-python/pull/1585)) ([a69caf8](https://github.com/apify/crawlee-python/commit/a69caf87edecc755287c53c8cc0ca4725af5d411)) by [@Mantisus](https://github.com/Mantisus), closes [#1514](https://github.com/apify/crawlee-python/issues/1514)
## [1.2.0](https://github.com/apify/crawlee-python/releases/tag/v1.2.0) (2025-12-08)
### 🚀 Features
- Add additional kwargs to Crawler's export_data ([#1597](https://github.com/apify/crawlee-python/pull/1597)) ([5977f37](https://github.com/apify/crawlee-python/commit/5977f376b93a7c0d4dd53f0d331a4b04fedba2c6)) by [@vdusek](https://github.com/vdusek), closes [#526](https://github.com/apify/crawlee-python/issues/526)
- Add `goto_options` for `PlaywrightCrawler` ([#1599](https://github.com/apify/crawlee-python/pull/1599)) ([0b82f3b](https://github.com/apify/crawlee-python/commit/0b82f3b6fb175223ea2aa5b348afcd5fdb767972)) by [@Mantisus](https://github.com/Mantisus), closes [#1576](https://github.com/apify/crawlee-python/issues/1576)
### 🐛 Bug Fixes
- Only apply requestHandlerTimeout to request handler ([#1474](https://github.com/apify/crawlee-python/pull/1474)) ([0dfb6c2](https://github.com/apify/crawlee-python/commit/0dfb6c2a13b6650736245fa39b3fbff397644df7)) by [@janbuchar](https://github.com/janbuchar)
- Handle the case when `error_handler` returns `Request` ([#1595](https://github.com/apify/crawlee-python/pull/1595)) ([8a961a2](https://github.com/apify/crawlee-python/commit/8a961a2b07d0d33a7302dbb13c17f3d90999d390)) by [@Mantisus](https://github.com/Mantisus)
- Align `Request.state` transitions with `Request` lifecycle ([#1601](https://github.com/apify/crawlee-python/pull/1601)) ([383225f](https://github.com/apify/crawlee-python/commit/383225f9f055d95ffb1302b8cf96f42ec264f1fc)) by [@Mantisus](https://github.com/Mantisus)
## [1.1.1](https://github.com/apify/crawlee-python/releases/tag/v1.1.1) (2025-12-02)
### 🐛 Bug Fixes
- Unify separators in `unique_key` construction ([#1569](https://github.com/apify/crawlee-python/pull/1569)) ([af46a37](https://github.com/apify/crawlee-python/commit/af46a3733b059a8052489296e172f005def953f7)) by [@vdusek](https://github.com/vdusek), closes [#1512](https://github.com/apify/crawlee-python/issues/1512)
- Fix `same-domain` strategy ignoring public suffix ([#1572](https://github.com/apify/crawlee-python/pull/1572)) ([3d018b2](https://github.com/apify/crawlee-python/commit/3d018b21a28a4bee493829783057188d6106a69b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1571](https://github.com/apify/crawlee-python/issues/1571)
- Make context helpers work in `FailedRequestHandler` and `ErrorHandler` ([#1570](https://github.com/apify/crawlee-python/pull/1570)) ([b830019](https://github.com/apify/crawlee-python/commit/b830019350830ac33075316061659e2854f7f4a5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1532](https://github.com/apify/crawlee-python/issues/1532)
- Fix non-ASCII character corruption in `FileSystemStorageClient` on systems without UTF-8 default encoding ([#1580](https://github.com/apify/crawlee-python/pull/1580)) ([f179f86](https://github.com/apify/crawlee-python/commit/f179f8671b0b6af9264450e4fef7e49d1cecd2bd)) by [@Mantisus](https://github.com/Mantisus), closes [#1579](https://github.com/apify/crawlee-python/issues/1579)
- Respect `<base>` when enqueuing ([#1590](https://github.com/apify/crawlee-python/pull/1590)) ([de517a1](https://github.com/apify/crawlee-python/commit/de517a1629cc29b20568143eb64018f216d4ba33)) by [@Mantisus](https://github.com/Mantisus), closes [#1589](https://github.com/apify/crawlee-python/issues/1589)
## [1.1.0](https://github.com/apify/crawlee-python/releases/tag/v1.1.0) (2025-11-18)
### 🚀 Features
- Add `chrome` `BrowserType` for `PlaywrightCrawler` to use the Chrome browser ([#1487](https://github.com/apify/crawlee-python/pull/1487)) ([b06937b](https://github.com/apify/crawlee-python/commit/b06937bbc3afe3c936b554bfc503365c1b2c526b)) by [@Mantisus](https://github.com/Mantisus), closes [#1071](https://github.com/apify/crawlee-python/issues/1071)
- Add `RedisStorageClient` based on Redis v8.0+ ([#1406](https://github.com/apify/crawlee-python/pull/1406)) ([d08d13d](https://github.com/apify/crawlee-python/commit/d08d13d39203c24ab61fe254b0956d6744db3b5f)) by [@Mantisus](https://github.com/Mantisus)
- Add support for Python 3.14 ([#1553](https://github.com/apify/crawlee-python/pull/1553)) ([89e9130](https://github.com/apify/crawlee-python/commit/89e9130cabee0fbc974b29c26483b7fa0edf627c)) by [@Mantisus](https://github.com/Mantisus)
- Add `transform_request_function` parameter for `SitemapRequestLoader` ([#1525](https://github.com/apify/crawlee-python/pull/1525)) ([dc90127](https://github.com/apify/crawlee-python/commit/dc901271849b239ba2a947e8ebff8e1815e8c4fb)) by [@Mantisus](https://github.com/Mantisus)
### 🐛 Bug Fixes
- Improve indexing of the `request_queue_records` table for `SqlRequestQueueClient` ([#1527](https://github.com/apify/crawlee-python/pull/1527)) ([6509534](https://github.com/apify/crawlee-python/commit/65095346a9d8b703b10c91e0510154c3c48a4176)) by [@Mantisus](https://github.com/Mantisus), closes [#1526](https://github.com/apify/crawlee-python/issues/1526)
- Improve error handling for `RobotsTxtFile.load` ([#1524](https://github.com/apify/crawlee-python/pull/1524)) ([596a311](https://github.com/apify/crawlee-python/commit/596a31184914a254b3e7a81fd2f48ea8eda7db49)) by [@Mantisus](https://github.com/Mantisus)
- Fix `crawler_runtime` not being updated during the run, only at the end ([#1540](https://github.com/apify/crawlee-python/pull/1540)) ([0d6c3f6](https://github.com/apify/crawlee-python/commit/0d6c3f6d3337ddb6cab4873747c28cf95605d550)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1541](https://github.com/apify/crawlee-python/issues/1541)
- Ensure persist state event emission when exiting `EventManager` context ([#1562](https://github.com/apify/crawlee-python/pull/1562)) ([6a44f17](https://github.com/apify/crawlee-python/commit/6a44f172600cbcacebab899082d6efc9105c4e03)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1560](https://github.com/apify/crawlee-python/issues/1560)
## [1.0.4](https://github.com/apify/crawlee-python/releases/tag/v1.0.4) (2025-10-24)
### 🐛 Bug Fixes
- Respect `enqueue_strategy` in `enqueue_links` ([#1505](https://github.com/apify/crawlee-python/pull/1505)) ([6ee04bc](https://github.com/apify/crawlee-python/commit/6ee04bc08c50a70f2e956a79d4ce5072a726c3a8)) by [@Mantisus](https://github.com/Mantisus), closes [#1504](https://github.com/apify/crawlee-python/issues/1504)
- Exclude incorrect links before checking `robots.txt` ([#1502](https://github.com/apify/crawlee-python/pull/1502)) ([3273da5](https://github.com/apify/crawlee-python/commit/3273da5fee62ec9254666b376f382474c3532a56)) by [@Mantisus](https://github.com/Mantisus), closes [#1499](https://github.com/apify/crawlee-python/issues/1499)
- Resolve compatibility issue between `SqlStorageClient` and `AdaptivePlaywrightCrawler` ([#1496](https://github.com/apify/crawlee-python/pull/1496)) ([ce172c4](https://github.com/apify/crawlee-python/commit/ce172c425a8643a1d4c919db4f5e5a6e47e91deb)) by [@Mantisus](https://github.com/Mantisus), closes [#1495](https://github.com/apify/crawlee-python/issues/1495)
- Fix `BasicCrawler` statistics persistence ([#1490](https://github.com/apify/crawlee-python/pull/1490)) ([1eb1c19](https://github.com/apify/crawlee-python/commit/1eb1c19aa6f9dda4a0e3f7eda23f77a554f95076)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1501](https://github.com/apify/crawlee-python/issues/1501)
- Save context state in result for `AdaptivePlaywrightCrawler` after isolated processing in `SubCrawler` ([#1488](https://github.com/apify/crawlee-python/pull/1488)) ([62b7c70](https://github.com/apify/crawlee-python/commit/62b7c70b54085fc65a660062028014f4502beba9)) by [@Mantisus](https://github.com/Mantisus), closes [#1483](https://github.com/apify/crawlee-python/issues/1483)
## [1.0.3](https://github.com/apify/crawlee-python/releases/tag/v1.0.3) (2025-10-17)
### 🐛 Bug Fixes
- Add support for Pydantic v2.12 ([#1471](https://github.com/apify/crawlee-python/pull/1471)) ([35c1108](https://github.com/apify/crawlee-python/commit/35c110878c2f445a2866be2522ea8703e9b371dd)) by [@Mantisus](https://github.com/Mantisus), closes [#1464](https://github.com/apify/crawlee-python/issues/1464)
- Fix database version warning message ([#1485](https://github.com/apify/crawlee-python/pull/1485)) ([18a545e](https://github.com/apify/crawlee-python/commit/18a545ee8add92e844acd0068f9cb8580a82e1c9)) by [@Mantisus](https://github.com/Mantisus)
- Fix `reclaim_request` in `SqlRequestQueueClient` to correctly update the request state ([#1486](https://github.com/apify/crawlee-python/pull/1486)) ([1502469](https://github.com/apify/crawlee-python/commit/150246957f8f7f1ceb77bb77e3a02a903c50cae1)) by [@Mantisus](https://github.com/Mantisus), closes [#1484](https://github.com/apify/crawlee-python/issues/1484)
- Fix `KeyValueStore.auto_saved_value` failing in some scenarios ([#1438](https://github.com/apify/crawlee-python/pull/1438)) ([b35dee7](https://github.com/apify/crawlee-python/commit/b35dee78180e57161b826641d45a61b8d8f6ef51)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1354](https://github.com/apify/crawlee-python/issues/1354)
## [1.0.2](https://github.com/apify/crawlee-python/releases/tag/v1.0.2) (2025-10-08)
### 🐛 Bug Fixes
- Use Self type in the open() method of storage clients ([#1462](https://github.com/apify/crawlee-python/pull/1462)) ([4ec6f6c](https://github.com/apify/crawlee-python/commit/4ec6f6c08f81632197f602ff99151338b3eba6e7)) by [@janbuchar](https://github.com/janbuchar)
- Add storages name validation ([#1457](https://github.com/apify/crawlee-python/pull/1457)) ([84de11a](https://github.com/apify/crawlee-python/commit/84de11a3a603503076f5b7df487c9abab68a9015)) by [@Mantisus](https://github.com/Mantisus), closes [#1434](https://github.com/apify/crawlee-python/issues/1434)
- Pin pydantic version to <2.12.0 to avoid compatibility issues ([#1467](https://github.com/apify/crawlee-python/pull/1467)) ([f11b86f](https://github.com/apify/crawlee-python/commit/f11b86f7ed57f98e83dc1b52f15f2017a919bf59)) by [@vdusek](https://github.com/vdusek)
## [1.0.1](https://github.com/apify/crawlee-python/releases/tag/v1.0.1) (2025-10-06)
### 🐛 Bug Fixes
- Fix memory leak in `PlaywrightCrawler` on browser context creation ([#1446](https://github.com/apify/crawlee-python/pull/1446)) ([bb181e5](https://github.com/apify/crawlee-python/commit/bb181e58d8070fba38e62d6e57fe981a00e5f035)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1443](https://github.com/apify/crawlee-python/issues/1443)
- Update templates to handle optional httpx client ([#1440](https://github.com/apify/crawlee-python/pull/1440)) ([c087efd](https://github.com/apify/crawlee-python/commit/c087efd39baedf46ca3e5cae1ddc1acd6396e6c1)) by [@Pijukatel](https://github.com/Pijukatel)
## [1.0.0](https://github.com/apify/crawlee-python/releases/tag/v1.0.0) (2025-09-29)
- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v1) for more details.
- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v1) to ensure a smooth update.
### 🚀 Features
- Add utility for loading and parsing sitemaps and add `SitemapRequestLoader` ([#1169](https://github.com/apify/crawlee-python/pull/1169)) ([66599f8](https://github.com/apify/crawlee-python/commit/66599f8d085f3a8622e130019b6fdce2325737de)) by [@Mantisus](https://github.com/Mantisus), closes [#1161](https://github.com/apify/crawlee-python/issues/1161)
- Add periodic status logging and `status_message_callback` parameter for customization ([#1265](https://github.com/apify/crawlee-python/pull/1265)) ([b992fb2](https://github.com/apify/crawlee-python/commit/b992fb2a457dedd20fc3014d7a4a8afe14602342)) by [@Mantisus](https://github.com/Mantisus), closes [#96](https://github.com/apify/crawlee-python/issues/96)
- Add crawlee-cli option to skip project installation ([#1294](https://github.com/apify/crawlee-python/pull/1294)) ([4d5aef0](https://github.com/apify/crawlee-python/commit/4d5aef05613d10c1442fe449d1cf0f63392c98e3)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1122](https://github.com/apify/crawlee-python/issues/1122)
- Improve `Crawlee` CLI help text ([#1297](https://github.com/apify/crawlee-python/pull/1297)) ([afbe10f](https://github.com/apify/crawlee-python/commit/afbe10f15d93353f5bc551bf9f193414179d0dd7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1295](https://github.com/apify/crawlee-python/issues/1295)
- Add basic `OpenTelemetry` instrumentation ([#1255](https://github.com/apify/crawlee-python/pull/1255)) ([a92d8b3](https://github.com/apify/crawlee-python/commit/a92d8b3f843ee795bba7e14710bb1faa1fdbf292)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1254](https://github.com/apify/crawlee-python/issues/1254)
- Add `ImpitHttpClient` HTTP client using the `impit` library ([#1151](https://github.com/apify/crawlee-python/pull/1151)) ([0d0d268](https://github.com/apify/crawlee-python/commit/0d0d2681a4379c0e7ba54c49c86dabfef641610f)) by [@Mantisus](https://github.com/Mantisus)
- Prevent overloading system memory when running locally ([#1270](https://github.com/apify/crawlee-python/pull/1270)) ([30de3bd](https://github.com/apify/crawlee-python/commit/30de3bd7722cbc34db9fc582b4bda7dc2dfa90ff)) by [@janbuchar](https://github.com/janbuchar), closes [#1232](https://github.com/apify/crawlee-python/issues/1232)
- Expose `PlaywrightPersistentBrowser` class ([#1314](https://github.com/apify/crawlee-python/pull/1314)) ([b5fa955](https://github.com/apify/crawlee-python/commit/b5fa95508d7c099ff3a342577f338439283a975f)) by [@Mantisus](https://github.com/Mantisus)
- Add `impit` option for Crawlee CLI ([#1312](https://github.com/apify/crawlee-python/pull/1312)) ([508d7ce](https://github.com/apify/crawlee-python/commit/508d7ce4d998f37ab2adcf9c057c3c635a69f863)) by [@Mantisus](https://github.com/Mantisus)
- Persist RequestList state ([#1274](https://github.com/apify/crawlee-python/pull/1274)) ([cc68014](https://github.com/apify/crawlee-python/commit/cc680147ba3cc8b35b9da70274e53e6f5dd92434)) by [@janbuchar](https://github.com/janbuchar), closes [#99](https://github.com/apify/crawlee-python/issues/99)
- Persist `DefaultRenderingTypePredictor` state ([#1340](https://github.com/apify/crawlee-python/pull/1340)) ([fad4c25](https://github.com/apify/crawlee-python/commit/fad4c25fc712915c4a45b24e3290b6f5dbd8a683)) by [@Mantisus](https://github.com/Mantisus), closes [#1272](https://github.com/apify/crawlee-python/issues/1272)
- Persist the `SitemapRequestLoader` state ([#1347](https://github.com/apify/crawlee-python/pull/1347)) ([27ef9ad](https://github.com/apify/crawlee-python/commit/27ef9ad194552ea9f1321d91a7a52054be9a8a51)) by [@Mantisus](https://github.com/Mantisus), closes [#1269](https://github.com/apify/crawlee-python/issues/1269)
- Add support for NDU storages ([#1401](https://github.com/apify/crawlee-python/pull/1401)) ([5dbd212](https://github.com/apify/crawlee-python/commit/5dbd212663e7abc37535713f4c6e3a5bbf30a12e)) by [@vdusek](https://github.com/vdusek), closes [#1175](https://github.com/apify/crawlee-python/issues/1175)
- Add RQ id, name, alias args to `add_requests` and `enqueue_links` methods ([#1413](https://github.com/apify/crawlee-python/pull/1413)) ([1cae2bc](https://github.com/apify/crawlee-python/commit/1cae2bca0b1508fcb3cb419dc239caf33e20a7ef)) by [@Mantisus](https://github.com/Mantisus), closes [#1402](https://github.com/apify/crawlee-python/issues/1402)
- Add `SqlStorageClient` based on `sqlalchemy` v2+ ([#1339](https://github.com/apify/crawlee-python/pull/1339)) ([07c75a0](https://github.com/apify/crawlee-python/commit/07c75a078b443b58bfaaeb72eb2aa1439458dc47)) by [@Mantisus](https://github.com/Mantisus), closes [#307](https://github.com/apify/crawlee-python/issues/307)
### 🐛 Bug Fixes
- Fix memory estimation not working on MacOS ([#1330](https://github.com/apify/crawlee-python/pull/1330)) ([ab020eb](https://github.com/apify/crawlee-python/commit/ab020eb821a75723225b652d64babd84c368183f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1329](https://github.com/apify/crawlee-python/issues/1329)
- Fix retry count to not count the original request ([#1328](https://github.com/apify/crawlee-python/pull/1328)) ([74fa1d9](https://github.com/apify/crawlee-python/commit/74fa1d936cb3c29cf62d87862a96b4266694af2f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1326](https://github.com/apify/crawlee-python/issues/1326)
- [**breaking**] Remove unused "stats" field from RequestQueueMetadata ([#1331](https://github.com/apify/crawlee-python/pull/1331)) ([0a63bef](https://github.com/apify/crawlee-python/commit/0a63bef514b0bdcd3d6f208b386f706d0fe561e6)) by [@vdusek](https://github.com/vdusek)
- Ignore unknown parameters passed in cookies ([#1336](https://github.com/apify/crawlee-python/pull/1336)) ([50d3ef7](https://github.com/apify/crawlee-python/commit/50d3ef7540551383d26d40f3404b435bde35b47d)) by [@Mantisus](https://github.com/Mantisus), closes [#1333](https://github.com/apify/crawlee-python/issues/1333)
- Fix `timeout` for `stream` method in `ImpitHttpClient` ([#1352](https://github.com/apify/crawlee-python/pull/1352)) ([54b693b](https://github.com/apify/crawlee-python/commit/54b693b838f135a596e1e9493b565bc558b19a3a)) by [@Mantisus](https://github.com/Mantisus)
- Include reason in the session rotation warning logs ([#1363](https://github.com/apify/crawlee-python/pull/1363)) ([d6d7a45](https://github.com/apify/crawlee-python/commit/d6d7a45dd64a906419d9552c45062d726cbb1a0f)) by [@vdusek](https://github.com/vdusek), closes [#1318](https://github.com/apify/crawlee-python/issues/1318)
- Improve crawler statistics logging ([#1364](https://github.com/apify/crawlee-python/pull/1364)) ([1eb6da5](https://github.com/apify/crawlee-python/commit/1eb6da5dd85870124593dcad877284ccaed9c0ce)) by [@vdusek](https://github.com/vdusek), closes [#1317](https://github.com/apify/crawlee-python/issues/1317)
- Do not add a request that is already in progress to `MemoryRequestQueueClient` ([#1384](https://github.com/apify/crawlee-python/pull/1384)) ([3af326c](https://github.com/apify/crawlee-python/commit/3af326c9dfa8fffd56a42ca42981374613739e39)) by [@Mantisus](https://github.com/Mantisus), closes [#1383](https://github.com/apify/crawlee-python/issues/1383)
- Save `RequestQueueState` for `FileSystemRequestQueueClient` in default KVS ([#1411](https://github.com/apify/crawlee-python/pull/1411)) ([6ee60a0](https://github.com/apify/crawlee-python/commit/6ee60a08ac1f9414e1b792f4935cc3799cb5089a)) by [@Mantisus](https://github.com/Mantisus), closes [#1410](https://github.com/apify/crawlee-python/issues/1410)
- Set default desired concurrency for non-browser crawlers to 10 ([#1419](https://github.com/apify/crawlee-python/pull/1419)) ([1cc9401](https://github.com/apify/crawlee-python/commit/1cc940197600d2539bda967880d7f9d241eb8c3e)) by [@vdusek](https://github.com/vdusek)
### 🚜 Refactor
- [**breaking**] Introduce new storage client system ([#1194](https://github.com/apify/crawlee-python/pull/1194)) ([de1c03f](https://github.com/apify/crawlee-python/commit/de1c03f70dbd4ae1773fd49c632b3cfcfab82c26)) by [@vdusek](https://github.com/vdusek), closes [#92](https://github.com/apify/crawlee-python/issues/92), [#147](https://github.com/apify/crawlee-python/issues/147), [#783](https://github.com/apify/crawlee-python/issues/783), [#1247](https://github.com/apify/crawlee-python/issues/1247)
- [**breaking**] Split `BrowserType` literal into two different literals based on context ([#1070](https://github.com/apify/crawlee-python/pull/1070)) ([72b5698](https://github.com/apify/crawlee-python/commit/72b5698fa0647ea02b08da5651736cc37c4c0f6a)) by [@Pijukatel](https://github.com/Pijukatel)
- [**breaking**] Change method `HttpResponse.read` from sync to async ([#1296](https://github.com/apify/crawlee-python/pull/1296)) ([83fa8a4](https://github.com/apify/crawlee-python/commit/83fa8a416b6d2d4e27c678b9bf99bd1b8799f57b)) by [@Mantisus](https://github.com/Mantisus)
- [**breaking**] Replace `HttpxHttpClient` with `ImpitHttpClient` as default HTTP client ([#1307](https://github.com/apify/crawlee-python/pull/1307)) ([c803a97](https://github.com/apify/crawlee-python/commit/c803a976776a76846866d533e3a3ee8144e248c4)) by [@Mantisus](https://github.com/Mantisus), closes [#1079](https://github.com/apify/crawlee-python/issues/1079)
- [**breaking**] Change Dataset unwind parameter to accept list of strings ([#1357](https://github.com/apify/crawlee-python/pull/1357)) ([862a203](https://github.com/apify/crawlee-python/commit/862a20398f00fe91802fe7a1ccd58b05aee053a1)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Remove `Request.id` field ([#1366](https://github.com/apify/crawlee-python/pull/1366)) ([32f3580](https://github.com/apify/crawlee-python/commit/32f3580e9775a871924ab1233085d0c549c4cd52)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1358](https://github.com/apify/crawlee-python/issues/1358)
- [**breaking**] Refactor storage creation and caching, configuration and services ([#1386](https://github.com/apify/crawlee-python/pull/1386)) ([04649bd](https://github.com/apify/crawlee-python/commit/04649bde60d46b2bc18ae4f6e3fd9667d02a9cef)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1379](https://github.com/apify/crawlee-python/issues/1379)
## [0.6.12](https://github.com/apify/crawlee-python/releases/tag/v0.6.12) (2025-07-30)
### 🚀 Features
- Add `retire_browser_after_page_count` parameter for `BrowserPool` ([#1266](https://github.com/apify/crawlee-python/pull/1266)) ([603aa2b](https://github.com/apify/crawlee-python/commit/603aa2b192ef4bc42d88244bd009fffdb0614c06)) by [@Mantisus](https://github.com/Mantisus)
### 🐛 Bug Fixes
- Use `perf_counter_ns` for request duration tracking ([#1260](https://github.com/apify/crawlee-python/pull/1260)) ([9e92f6b](https://github.com/apify/crawlee-python/commit/9e92f6b54400ce5004fbab770e2e4ac42f73148f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1256](https://github.com/apify/crawlee-python/issues/1256)
- Fix memory estimation not working on macOS ([#1330](https://github.com/apify/crawlee-python/pull/1330)) ([8558954](https://github.com/apify/crawlee-python/commit/8558954feeb7d5e91378186974a29851fedae9c8)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1329](https://github.com/apify/crawlee-python/issues/1329)
- Fix retry count to not count the original request ([#1328](https://github.com/apify/crawlee-python/pull/1328)) ([1aff3aa](https://github.com/apify/crawlee-python/commit/1aff3aaf0cdbe452a3731192449a445e5b2d7a63)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1326](https://github.com/apify/crawlee-python/issues/1326)
- Ignore unknown parameters passed in cookies ([#1336](https://github.com/apify/crawlee-python/pull/1336)) ([0f2610c](https://github.com/apify/crawlee-python/commit/0f2610c0ee1154dc004de60fc57fe7c9f478166a)) by [@Mantisus](https://github.com/Mantisus), closes [#1333](https://github.com/apify/crawlee-python/issues/1333)
## [0.6.11](https://github.com/apify/crawlee-python/releases/tag/v0.6.11) (2025-06-23)
### 🚀 Features
- Add `stream` method for `HttpClient` ([#1241](https://github.com/apify/crawlee-python/pull/1241)) ([95c68b0](https://github.com/apify/crawlee-python/commit/95c68b0b2d0bf9e093c1d0ee1002625172f7a868)) by [@Mantisus](https://github.com/Mantisus)
### 🐛 Bug Fixes
- Fix `ClientSnapshot` overload calculation ([#1228](https://github.com/apify/crawlee-python/pull/1228)) ([a4fc1b6](https://github.com/apify/crawlee-python/commit/a4fc1b6e83143650666108c289c084ea0463b80c)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1207](https://github.com/apify/crawlee-python/issues/1207)
- Use `PSS` instead of `RSS` to estimate children process memory usage on Linux ([#1210](https://github.com/apify/crawlee-python/pull/1210)) ([436032f](https://github.com/apify/crawlee-python/commit/436032f2de5f7d7fa1016033f1bb224159a8e6bf)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1206](https://github.com/apify/crawlee-python/issues/1206)
- Do not raise an error when checking 'same-domain' if there is no hostname in the URL ([#1251](https://github.com/apify/crawlee-python/pull/1251)) ([a6c3aab](https://github.com/apify/crawlee-python/commit/a6c3aabf5f8341f215275077b6768a56118bc656)) by [@Mantisus](https://github.com/Mantisus)
## [0.6.10](https://github.com/apify/crawlee-python/releases/tag/v0.6.10) (2025-06-02)
### 🐛 Bug Fixes
- Allow config change on `PlaywrightCrawler` ([#1186](https://github.com/apify/crawlee-python/pull/1186)) ([f17bf31](https://github.com/apify/crawlee-python/commit/f17bf31456b702631aa7e0c26d4f07fd5eb7d1bd)) by [@mylank](https://github.com/mylank), closes [#1185](https://github.com/apify/crawlee-python/issues/1185)
- Add `payload` to `SendRequestFunction` to support `POST` request ([#1202](https://github.com/apify/crawlee-python/pull/1202)) ([e7449f2](https://github.com/apify/crawlee-python/commit/e7449f206c580cb8383a66e4c9ff5f67c5ce8409)) by [@Mantisus](https://github.com/Mantisus)
- Fix match check for specified enqueue strategy for requests with redirect ([#1199](https://github.com/apify/crawlee-python/pull/1199)) ([d84c30c](https://github.com/apify/crawlee-python/commit/d84c30cbd7c94d6525d3b6e8e86b379050454c0e)) by [@Mantisus](https://github.com/Mantisus), closes [#1198](https://github.com/apify/crawlee-python/issues/1198)
- Set `WindowsSelectorEventLoopPolicy` only for curl-impersonate template without `playwright` ([#1209](https://github.com/apify/crawlee-python/pull/1209)) ([f3b839f](https://github.com/apify/crawlee-python/commit/f3b839ffc2ccc1b889b6d5928f35f57b725e27f1)) by [@Mantisus](https://github.com/Mantisus), closes [#1204](https://github.com/apify/crawlee-python/issues/1204)
- Add support for non-GET requests for `PlaywrightCrawler` ([#1208](https://github.com/apify/crawlee-python/pull/1208)) ([dbb9f44](https://github.com/apify/crawlee-python/commit/dbb9f44c71af15e1f86766fa0ba68281dd85fd9e)) by [@Mantisus](https://github.com/Mantisus), closes [#1201](https://github.com/apify/crawlee-python/issues/1201)
- Respect `EnqueueLinksKwargs` for `extract_links` function ([#1213](https://github.com/apify/crawlee-python/pull/1213)) ([c9907d6](https://github.com/apify/crawlee-python/commit/c9907d6ff4c3a4a719b279cea77694c00a5a963d)) by [@Mantisus](https://github.com/Mantisus), closes [#1212](https://github.com/apify/crawlee-python/issues/1212)
## [0.6.9](https://github.com/apify/crawlee-python/releases/tag/v0.6.9) (2025-05-02)
### 🚀 Features
- Add an internal `HttpClient` to be used in `send_request` for `PlaywrightCrawler` using `APIRequestContext` bound to the browser context ([#1134](https://github.com/apify/crawlee-python/pull/1134)) ([e794f49](https://github.com/apify/crawlee-python/commit/e794f4985d3a018ee76d634fe2b2c735fb450272)) by [@Mantisus](https://github.com/Mantisus), closes [#928](https://github.com/apify/crawlee-python/issues/928)
- Make timeout error log cleaner ([#1170](https://github.com/apify/crawlee-python/pull/1170)) ([78ea9d2](https://github.com/apify/crawlee-python/commit/78ea9d23e0b2d73286043b68393e462f636625c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1158](https://github.com/apify/crawlee-python/issues/1158)
- Add `on_skipped_request` decorator, to process links skipped according to `robots.txt` rules ([#1166](https://github.com/apify/crawlee-python/pull/1166)) ([bd16f14](https://github.com/apify/crawlee-python/commit/bd16f14a834eebf485aea6b6a83f2b18bf16b504)) by [@Mantisus](https://github.com/Mantisus), closes [#1160](https://github.com/apify/crawlee-python/issues/1160)
### 🐛 Bug Fixes
- Fix handling of errors without `args` in `_get_error_message` for `ErrorTracker` ([#1181](https://github.com/apify/crawlee-python/pull/1181)) ([21944d9](https://github.com/apify/crawlee-python/commit/21944d908b8404d2ad6c182104e7a8c27be12a6e)) by [@Mantisus](https://github.com/Mantisus), closes [#1179](https://github.com/apify/crawlee-python/issues/1179)
- Temporarily add `certifi<=2025.1.31` dependency ([#1183](https://github.com/apify/crawlee-python/pull/1183)) ([25ff961](https://github.com/apify/crawlee-python/commit/25ff961990f9abc9d0673ba6573dfcf46dd6e53f)) by [@Pijukatel](https://github.com/Pijukatel)
## [0.6.8](https://github.com/apify/crawlee-python/releases/tag/v0.6.8) (2025-04-25)
### 🚀 Features
- Handle unprocessed requests in `add_requests_batched` ([#1159](https://github.com/apify/crawlee-python/pull/1159)) ([7851175](https://github.com/apify/crawlee-python/commit/7851175304d63e455223b25853021cfbe15d68bd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#456](https://github.com/apify/crawlee-python/issues/456)
- Add `respect_robots_txt_file` option ([#1162](https://github.com/apify/crawlee-python/pull/1162)) ([c23f365](https://github.com/apify/crawlee-python/commit/c23f365bfd263b4357edf82c14a7c6ff8dee45e4)) by [@Mantisus](https://github.com/Mantisus)
### 🐛 Bug Fixes
- Update `UnprocessedRequest` to match actual data ([#1155](https://github.com/apify/crawlee-python/pull/1155)) ([a15a1f3](https://github.com/apify/crawlee-python/commit/a15a1f3528c7cbcf78d3bda5a236bcee1d492764)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1150](https://github.com/apify/crawlee-python/issues/1150)
- Fix the order in which cookies are saved to the `SessionCookies` and the handler is executed for `PlaywrightCrawler` ([#1163](https://github.com/apify/crawlee-python/pull/1163)) ([82ff69a](https://github.com/apify/crawlee-python/commit/82ff69acd8e409f56be56dd061aae0f854ec25b4)) by [@Mantisus](https://github.com/Mantisus)
- Call `failed_request_handler` for `SessionError` when session rotation count exceeds maximum ([#1147](https://github.com/apify/crawlee-python/pull/1147)) ([b3637b6](https://github.com/apify/crawlee-python/commit/b3637b68ec7eae9de7f1b923fa2f68885da64b90)) by [@Mantisus](https://github.com/Mantisus)
## [0.6.7](https://github.com/apify/crawlee-python/releases/tag/v0.6.7) (2025-04-17)
### 🚀 Features
- Add `ErrorSnapshotter` to `ErrorTracker` ([#1125](https://github.com/apify/crawlee-python/pull/1125)) ([9666092](https://github.com/apify/crawlee-python/commit/9666092c6a59ac4d34409038d5476e5b6fb58a26)) by [@Pijukatel](https://github.com/Pijukatel), closes [#151](https://github.com/apify/crawlee-python/issues/151)
### 🐛 Bug Fixes
- Improve validation errors in Crawlee CLI ([#1140](https://github.com/apify/crawlee-python/pull/1140)) ([f2d33df](https://github.com/apify/crawlee-python/commit/f2d33dff178a3d3079eb3807feb9645a25cc7a93)) by [@vdusek](https://github.com/vdusek), closes [#1138](https://github.com/apify/crawlee-python/issues/1138)
- Disable logger propagation to prevent duplicate logs ([#1156](https://github.com/apify/crawlee-python/pull/1156)) ([0b3648d](https://github.com/apify/crawlee-python/commit/0b3648d5d399f0af23520f7fb8ee635d38b512c4)) by [@vdusek](https://github.com/vdusek)
## [0.6.6](https://github.com/apify/crawlee-python/releases/tag/v0.6.6) (2025-04-03)
### 🚀 Features
- Add `statistics_log_format` parameter to `BasicCrawler` ([#1061](https://github.com/apify/crawlee-python/pull/1061)) ([635ae4a](https://github.com/apify/crawlee-python/commit/635ae4a56c65e434783ca721f4164203f465abf0)) by [@Mantisus](https://github.com/Mantisus), closes [#700](https://github.com/apify/crawlee-python/issues/700)
- Add Session binding capability via `session_id` in `Request` ([#1086](https://github.com/apify/crawlee-python/pull/1086)) ([cda7b31](https://github.com/apify/crawlee-python/commit/cda7b314ffda3104e4fd28a5e85c9e238d8866a4)) by [@Mantisus](https://github.com/Mantisus), closes [#1076](https://github.com/apify/crawlee-python/issues/1076)
- Add `requests` argument to `EnqueueLinksFunction` ([#1024](https://github.com/apify/crawlee-python/pull/1024)) ([fc8444c](https://github.com/apify/crawlee-python/commit/fc8444c245c7607d3e378a4835d7d3355c4059be)) by [@Pijukatel](https://github.com/Pijukatel)
### 🐛 Bug Fixes
- Add port for `same-origin` strategy check ([#1096](https://github.com/apify/crawlee-python/pull/1096)) ([9e24598](https://github.com/apify/crawlee-python/commit/9e245987d0aab0ba9c763689f12958b5a332db46)) by [@Mantisus](https://github.com/Mantisus)
- Fix handling of loading empty `metadata` file for queue ([#1042](https://github.com/apify/crawlee-python/pull/1042)) ([b00876e](https://github.com/apify/crawlee-python/commit/b00876e8dcb30a12d3737bd31237da9daada46bb)) by [@Mantisus](https://github.com/Mantisus), closes [#1029](https://github.com/apify/crawlee-python/issues/1029)
- Update favicon ([#1114](https://github.com/apify/crawlee-python/pull/1114)) ([eba900f](https://github.com/apify/crawlee-python/commit/eba900fc1e8d918c6fc464657c53004a3e0fe668)) by [@baldasseva](https://github.com/baldasseva)
- **website:** Use correct image source ([#1115](https://github.com/apify/crawlee-python/pull/1115)) ([ee7806f](https://github.com/apify/crawlee-python/commit/ee7806fc2f9b7b590d9668cc9f86009a898a3da6)) by [@baldasseva](https://github.com/baldasseva)
## [0.6.5](https://github.com/apify/crawlee-python/releases/tag/v0.6.5) (2025-03-13)
### 🐛 Bug Fixes
- Update to `browserforge` workaround ([#1075](https://github.com/apify/crawlee-python/pull/1075)) ([2378cf8](https://github.com/apify/crawlee-python/commit/2378cf84ab1ed06473049a9ddfca2ba6f166306d)) by [@Pijukatel](https://github.com/Pijukatel)
## [0.6.4](https://github.com/apify/crawlee-python/releases/tag/v0.6.4) (2025-03-12)
### 🐛 Bug Fixes
- Add a thread check before setting `add_signal_handler` ([#1068](https://github.com/apify/crawlee-python/pull/1068)) ([6983bda](https://github.com/apify/crawlee-python/commit/6983bda2dbc202b3ecbf7db62b11deee007b4b5f)) by [@Mantisus](https://github.com/Mantisus)
- Temporary workaround for `browserforge` import time code execution ([#1073](https://github.com/apify/crawlee-python/pull/1073)) ([17d914f](https://github.com/apify/crawlee-python/commit/17d914f78242078f88c07d686a567d1091255eb1)) by [@Pijukatel](https://github.com/Pijukatel)
## [0.6.3](https://github.com/apify/crawlee-python/releases/tag/v0.6.3) (2025-03-07)
### 🚀 Features
- Add project template with `uv` package manager ([#1057](https://github.com/apify/crawlee-python/pull/1057)) ([9ec06e5](https://github.com/apify/crawlee-python/commit/9ec06e58032aa11af46ac9cd1ea7bb002a18eb13)) by [@Mantisus](https://github.com/Mantisus), closes [#1053](https://github.com/apify/crawlee-python/issues/1053)
- Use fingerprint generator in `PlaywrightCrawler` by default ([#1060](https://github.com/apify/crawlee-python/pull/1060)) ([09cec53](https://github.com/apify/crawlee-python/commit/09cec532911043623eeb475aa8552c70bd94f8b7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1054](https://github.com/apify/crawlee-python/issues/1054)
### 🐛 Bug Fixes
- Update project templates for Poetry v2.x compatibility ([#1049](https://github.com/apify/crawlee-python/pull/1049)) ([96dc2f9](https://github.com/apify/crawlee-python/commit/96dc2f9b53b0a2d0f1d0c73d10e5244114e849ff)) by [@Mantisus](https://github.com/Mantisus), closes [#954](https://github.com/apify/crawlee-python/issues/954)
- Remove tmp folder for PlaywrightCrawler in non-headless mode ([#1046](https://github.com/apify/crawlee-python/pull/1046)) ([3a7f444](https://github.com/apify/crawlee-python/commit/3a7f444fb7ee9a0ab1867c8c9586b15aab1e7df2)) by [@Mantisus](https://github.com/Mantisus)
## [0.6.2](https://github.com/apify/crawlee-python/releases/tag/v0.6.2) (2025-03-05)
### 🚀 Features
- Extend ErrorTracker with error grouping ([#1014](https://github.com/apify/crawlee-python/pull/1014)) ([561de5c](https://github.com/apify/crawlee-python/commit/561de5c6b76af386cad5ac804a22fb7af227e460)) by [@Pijukatel](https://github.com/Pijukatel)
## [0.6.1](https://github.com/apify/crawlee-python/releases/tag/v0.6.1) (2025-03-03)
### 🐛 Bug Fixes
- Add `browserforge` to mandatory dependencies ([#1044](https://github.com/apify/crawlee-python/pull/1044)) ([ddfbde8](https://github.com/apify/crawlee-python/commit/ddfbde89dd3e3cbef0f3954936f4a41c3d6df909)) by [@Pijukatel](https://github.com/Pijukatel)
## [0.6.0](https://github.com/apify/crawlee-python/releases/tag/v0.6.0) (2025-03-03)
- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v06) for more details.
- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v06) to ensure a smooth update.
### 🚀 Features
- Integrate browserforge fingerprints ([#829](https://github.com/apify/crawlee-python/pull/829)) ([2b156b4](https://github.com/apify/crawlee-python/commit/2b156b4ba688f9111195422e6058dff30eb1f782)) by [@Pijukatel](https://github.com/Pijukatel), closes [#549](https://github.com/apify/crawlee-python/issues/549)
- Add AdaptivePlaywrightCrawler ([#872](https://github.com/apify/crawlee-python/pull/872)) ([5ba70b6](https://github.com/apify/crawlee-python/commit/5ba70b6e846a908a55db461ab0c85e3946f2bc7c)) by [@Pijukatel](https://github.com/Pijukatel)
- Implement `_snapshot_client` for `Snapshotter` ([#957](https://github.com/apify/crawlee-python/pull/957)) ([ba4d384](https://github.com/apify/crawlee-python/commit/ba4d384228d030c20c580ed01fae0e78af3a9543)) by [@Mantisus](https://github.com/Mantisus), closes [#60](https://github.com/apify/crawlee-python/issues/60)
- Add adaptive context helpers ([#964](https://github.com/apify/crawlee-python/pull/964)) ([e248f17](https://github.com/apify/crawlee-python/commit/e248f17fad7b6d1fc5e23a0a1e961db66068a411)) by [@Pijukatel](https://github.com/Pijukatel), closes [#249](https://github.com/apify/crawlee-python/issues/249)
- [**breaking**] Enable additional status codes arguments to PlaywrightCrawler ([#959](https://github.com/apify/crawlee-python/pull/959)) ([87cf446](https://github.com/apify/crawlee-python/commit/87cf446a7cbaa900e28abd93d4c8a2e0d1747059)) by [@Pijukatel](https://github.com/Pijukatel), closes [#953](https://github.com/apify/crawlee-python/issues/953)
- Replace `HeaderGenerator` implementation by `browserforge` implementation ([#960](https://github.com/apify/crawlee-python/pull/960)) ([c2f8c93](https://github.com/apify/crawlee-python/commit/c2f8c93a4ad57c4ede354545bf925bf3707899c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#937](https://github.com/apify/crawlee-python/issues/937)
### 🐛 Bug Fixes
- Fix playwright template and dockerfile ([#972](https://github.com/apify/crawlee-python/pull/972)) ([c33b34d](https://github.com/apify/crawlee-python/commit/c33b34dd6e253b1261c700857bb5c4bbec6d5c14)) by [@janbuchar](https://github.com/janbuchar), closes [#969](https://github.com/apify/crawlee-python/issues/969)
- Fix installing dependencies via pip in project template ([#977](https://github.com/apify/crawlee-python/pull/977)) ([1e3b8eb](https://github.com/apify/crawlee-python/commit/1e3b8eb1cdb57bf2f7256e8ae5f0706b0afc3ba9)) by [@janbuchar](https://github.com/janbuchar), closes [#975](https://github.com/apify/crawlee-python/issues/975)
- Fix default migration storage ([#1018](https://github.com/apify/crawlee-python/pull/1018)) ([6a0c4d9](https://github.com/apify/crawlee-python/commit/6a0c4d94593f7e94f24eee8a97fc7bc83c4d02e1)) by [@Pijukatel](https://github.com/Pijukatel), closes [#991](https://github.com/apify/crawlee-python/issues/991)
- Fix logger name for http based loggers ([#1023](https://github.com/apify/crawlee-python/pull/1023)) ([bfb3944](https://github.com/apify/crawlee-python/commit/bfb394446351c8f3b9879a9905607f7c929f2542)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1021](https://github.com/apify/crawlee-python/issues/1021)
- Remove allow_redirects override in CurlImpersonateHttpClient ([#1017](https://github.com/apify/crawlee-python/pull/1017)) ([01d855a](https://github.com/apify/crawlee-python/commit/01d855a43389a6b4b16ec74767624fa7eb13151f)) by [@2tunnels](https://github.com/2tunnels), closes [#1016](https://github.com/apify/crawlee-python/issues/1016)
- Remove follow_redirects override in HttpxHttpClient ([#1015](https://github.com/apify/crawlee-python/pull/1015)) ([88afda3](https://github.com/apify/crawlee-python/commit/88afda33e77be84bc91ad1239740b8e661bef2a2)) by [@2tunnels](https://github.com/2tunnels), closes [#1013](https://github.com/apify/crawlee-python/issues/1013)
- Fix flaky test_common_headers_and_user_agent ([#1030](https://github.com/apify/crawlee-python/pull/1030)) ([58aa70e](https://github.com/apify/crawlee-python/commit/58aa70e9600d313b823a1376ab9b36fb416c1c4a)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1027](https://github.com/apify/crawlee-python/issues/1027)
### 🚜 Refactor
- [**breaking**] Remove unused config properties ([#978](https://github.com/apify/crawlee-python/pull/978)) ([4b7fe29](https://github.com/apify/crawlee-python/commit/4b7fe2930540a5fbd753135e3ce29dc80f80c543)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Remove Base prefix from abstract class names ([#980](https://github.com/apify/crawlee-python/pull/980)) ([8ccb5d4](https://github.com/apify/crawlee-python/commit/8ccb5d41a1dae9b02088b433266ac89bd089561a)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Change default `incognito context` to `persistent context` for `Playwright` ([#985](https://github.com/apify/crawlee-python/pull/985)) ([f01520d](https://github.com/apify/crawlee-python/commit/f01520d22b31af9f0f13ca162cc47e6aa9744c6d)) by [@Mantisus](https://github.com/Mantisus), closes [#721](https://github.com/apify/crawlee-python/issues/721), [#963](https://github.com/apify/crawlee-python/issues/963)
- [**breaking**] Change `Session` cookies from `dict` to `SessionCookies` with `CookieJar` ([#984](https://github.com/apify/crawlee-python/pull/984)) ([6523b3a](https://github.com/apify/crawlee-python/commit/6523b3ade0eed53b0363ddce250c557024339b5e)) by [@Mantisus](https://github.com/Mantisus), closes [#710](https://github.com/apify/crawlee-python/issues/710), [#933](https://github.com/apify/crawlee-python/issues/933)
- [**breaking**] Replace enum with literal for `EnqueueStrategy` ([#1019](https://github.com/apify/crawlee-python/pull/1019)) ([d2481ef](https://github.com/apify/crawlee-python/commit/d2481ef71d3539979c5b1129387e72b4126fe366)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Update status code handling ([#1028](https://github.com/apify/crawlee-python/pull/1028)) ([6b59471](https://github.com/apify/crawlee-python/commit/6b5947125e63abdfff481b0669398fc9a7293e55)) by [@Mantisus](https://github.com/Mantisus), closes [#830](https://github.com/apify/crawlee-python/issues/830), [#998](https://github.com/apify/crawlee-python/issues/998)
- [**breaking**] Move `cli` dependencies to optional dependencies ([#1011](https://github.com/apify/crawlee-python/pull/1011)) ([4382959](https://github.com/apify/crawlee-python/commit/43829590c6b4efd1dc9b833373f82a842a0a1a8e)) by [@Mantisus](https://github.com/Mantisus), closes [#703](https://github.com/apify/crawlee-python/issues/703), [#1010](https://github.com/apify/crawlee-python/issues/1010)
## [0.5.4](https://github.com/apify/crawlee-python/releases/tag/v0.5.4) (2025-02-05)
### 🚀 Features
- Add `use_incognito_pages` support for `browser_launch_options` in `PlaywrightCrawler` ([#941](https://github.com/apify/crawlee-python/pull/941)) ([eae3a33](https://github.com/apify/crawlee-python/commit/eae3a33a1842ebbdac5f9c51866a4be4bcf1ae2c)) by [@Mantisus](https://github.com/Mantisus)
### 🐛 Bug Fixes
- Fix session management with retire ([#947](https://github.com/apify/crawlee-python/pull/947)) ([caee03f](https://github.com/apify/crawlee-python/commit/caee03fe3a43cc1d7a8d3f9e19b42df1bdb1c0aa)) by [@Mantisus](https://github.com/Mantisus)
- Fix templates - poetry-plugin-export version and camoufox template name ([#952](https://github.com/apify/crawlee-python/pull/952)) ([7addea6](https://github.com/apify/crawlee-python/commit/7addea6605359cceba208e16ec9131724bdb3e9b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#951](https://github.com/apify/crawlee-python/issues/951)
- Fix conversion of relative links to absolute in `enqueue_links` for responses with redirects ([#956](https://github.com/apify/crawlee-python/pull/956)) ([694102e](https://github.com/apify/crawlee-python/commit/694102e163bb9021a4830d2545d153f6f8f3de90)) by [@Mantisus](https://github.com/Mantisus), closes [#955](https://github.com/apify/crawlee-python/issues/955)
- Fix `CurlImpersonateHttpClient` cookies handler ([#946](https://github.com/apify/crawlee-python/pull/946)) ([ed415c4](https://github.com/apify/crawlee-python/commit/ed415c433da2a40b0ee62534f0730d0737e991b8)) by [@Mantisus](https://github.com/Mantisus)
## [0.5.3](https://github.com/apify/crawlee-python/releases/tag/v0.5.3) (2025-01-31)
### 🚀 Features
- Add keep_alive flag to `crawler.__init__` ([#921](https://github.com/apify/crawlee-python/pull/921)) ([7a82d0c](https://github.com/apify/crawlee-python/commit/7a82d0cbdbe6c8739d4bf6a9b014e31f07e5a520)) by [@Pijukatel](https://github.com/Pijukatel), closes [#891](https://github.com/apify/crawlee-python/issues/891)
- Add `block_requests` helper for `PlaywrightCrawler` ([#919](https://github.com/apify/crawlee-python/pull/919)) ([1030459](https://github.com/apify/crawlee-python/commit/103045994908f80cffee5ccfff91a040e0042f48)) by [@Mantisus](https://github.com/Mantisus), closes [#848](https://github.com/apify/crawlee-python/issues/848)
- Return request handlers from decorator methods to allow further decoration ([#934](https://github.com/apify/crawlee-python/pull/934)) ([9ec0aae](https://github.com/apify/crawlee-python/commit/9ec0aae54e2a340d29c893567ae80bf8bd4510a9)) by [@mylank](https://github.com/mylank)
- Add `transform_request_function` for `enqueue_links` ([#923](https://github.com/apify/crawlee-python/pull/923)) ([6b15957](https://github.com/apify/crawlee-python/commit/6b159578f612251e6d2253a72b6521430f4f9b09)) by [@Mantisus](https://github.com/Mantisus), closes [#894](https://github.com/apify/crawlee-python/issues/894)
- Add `time_remaining_secs` property to `MIGRATING` event data ([#940](https://github.com/apify/crawlee-python/pull/940)) ([b44501b](https://github.com/apify/crawlee-python/commit/b44501bcadbd12673a8f47aa92f12da8e404f60b)) by [@fnesveda](https://github.com/fnesveda)
- Add LogisticalRegressionPredictor - rendering type predictor for adaptive crawling ([#930](https://github.com/apify/crawlee-python/pull/930)) ([8440499](https://github.com/apify/crawlee-python/commit/8440499468db115a4c478e9bcdb692554d1655c5)) by [@Pijukatel](https://github.com/Pijukatel)
### 🐛 Bug Fixes
- Fix crawler not retrying user handler if there was timeout in the handler ([#909](https://github.com/apify/crawlee-python/pull/909)) ([f4090ef](https://github.com/apify/crawlee-python/commit/f4090ef0ea0281d53dab16a77ceea2ef6ac43d76)) by [@Pijukatel](https://github.com/Pijukatel), closes [#907](https://github.com/apify/crawlee-python/issues/907)
- Optimize memory consumption for `HttpxHttpClient`, fix proxy handling ([#905](https://github.com/apify/crawlee-python/pull/905)) ([d7ad480](https://github.com/apify/crawlee-python/commit/d7ad480834263ae0480049cb0a8db4dfc3946d8d)) by [@Mantisus](https://github.com/Mantisus), closes [#895](https://github.com/apify/crawlee-python/issues/895)
- Fix `BrowserPool` and `PlaywrightBrowserPlugin` closure ([#932](https://github.com/apify/crawlee-python/pull/932)) ([997543d](https://github.com/apify/crawlee-python/commit/997543d2fa5afba49929f4407ee95d7a4933a50d)) by [@Mantisus](https://github.com/Mantisus)
## [0.5.2](https://github.com/apify/crawlee-python/releases/tag/v0.5.2) (2025-01-17)
### 🐛 Bug Fixes
- Avoid `use_state` race conditions. Remove key argument to `use_state` ([#868](https://github.com/apify/crawlee-python/pull/868)) ([000b976](https://github.com/apify/crawlee-python/commit/000b9761211502d86a893a31e3ca21998a6e3b99)) by [@Pijukatel](https://github.com/Pijukatel), closes [#856](https://github.com/apify/crawlee-python/issues/856)
- Restore proxy functionality for PlaywrightCrawler broken in v0.5 ([#889](https://github.com/apify/crawlee-python/pull/889)) ([908c944](https://github.com/apify/crawlee-python/commit/908c944ff9b1fc8ed7eb35f0078a1de71e34d5c5)) by [@Mantisus](https://github.com/Mantisus), closes [#887](https://github.com/apify/crawlee-python/issues/887)
- Fix the usage of Configuration ([#899](https://github.com/apify/crawlee-python/pull/899)) ([0f1cf6f](https://github.com/apify/crawlee-python/commit/0f1cf6f0b52c92ca4e465a2a01f8111cd9ab42ec)) by [@vdusek](https://github.com/vdusek), closes [#670](https://github.com/apify/crawlee-python/issues/670)
## [0.5.1](https://github.com/apify/crawlee-python/releases/tag/v0.5.1) (2025-01-07)
### 🐛 Bug Fixes
- Make result of RequestList.is_empty independent of fetch_next_request calls ([#876](https://github.com/apify/crawlee-python/pull/876)) ([d50249e](https://github.com/apify/crawlee-python/commit/d50249ecbfe2a04f508fcdc3261e050349bd0da2)) by [@janbuchar](https://github.com/janbuchar)
## [0.5.0](https://github.com/apify/crawlee-python/releases/tag/v0.5.0) (2025-01-02)
- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v05) for more details.
- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v05) to ensure a smooth update.
### 🚀 Features
- Add possibility to use None as no proxy in tiered proxies ([#760](https://github.com/apify/crawlee-python/pull/760)) ([0fbd017](https://github.com/apify/crawlee-python/commit/0fbd01723b9fe2e3410e0f358cab2f22848b08d0)) by [@Pijukatel](https://github.com/Pijukatel), closes [#687](https://github.com/apify/crawlee-python/issues/687)
- Add `use_state` context method ([#682](https://github.com/apify/crawlee-python/pull/682)) ([868b41e](https://github.com/apify/crawlee-python/commit/868b41ebd4c8003fa60ab07887577d0fb85b6ecc)) by [@Mantisus](https://github.com/Mantisus), closes [#191](https://github.com/apify/crawlee-python/issues/191)
- Add pre-navigation hooks router to AbstractHttpCrawler ([#791](https://github.com/apify/crawlee-python/pull/791)) ([0f23205](https://github.com/apify/crawlee-python/commit/0f23205923065074c522b3de9d47218a204dfa78)) by [@Pijukatel](https://github.com/Pijukatel), closes [#635](https://github.com/apify/crawlee-python/issues/635)
- Add example of how to integrate Camoufox into PlaywrightCrawler ([#789](https://github.com/apify/crawlee-python/pull/789)) ([246cfc4](https://github.com/apify/crawlee-python/commit/246cfc4ebc8bce1d15e1dddd62d652bd65869328)) by [@Pijukatel](https://github.com/Pijukatel), closes [#684](https://github.com/apify/crawlee-python/issues/684)
- Expose event types, improve on/emit signature, allow parameterless listeners ([#800](https://github.com/apify/crawlee-python/pull/800)) ([c102c4c](https://github.com/apify/crawlee-python/commit/c102c4c894a00b09adfd5f4911563c81cf3e98b4)) by [@janbuchar](https://github.com/janbuchar), closes [#561](https://github.com/apify/crawlee-python/issues/561)
- Add stop method to BasicCrawler ([#807](https://github.com/apify/crawlee-python/pull/807)) ([6d01af4](https://github.com/apify/crawlee-python/commit/6d01af4231d02b4349a8719f5ed18d812843fde5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#651](https://github.com/apify/crawlee-python/issues/651)
- Add `html_to_text` helper function ([#792](https://github.com/apify/crawlee-python/pull/792)) ([2b9d970](https://github.com/apify/crawlee-python/commit/2b9d97009dd653870681bb3cadbb46b214ff1a73)) by [@Pijukatel](https://github.com/Pijukatel), closes [#659](https://github.com/apify/crawlee-python/issues/659)
- [**breaking**] Implement `RequestManagerTandem`, remove `add_request` from `RequestList`, accept any iterable in `RequestList` constructor ([#777](https://github.com/apify/crawlee-python/pull/777)) ([4172652](https://github.com/apify/crawlee-python/commit/4172652079e5e91190c1cc5e2138fd41a7c84a6b)) by [@janbuchar](https://github.com/janbuchar)
### 🐛 Bug Fixes
- Fix circular import in `KeyValueStore` ([#805](https://github.com/apify/crawlee-python/pull/805)) ([8bdf49d](https://github.com/apify/crawlee-python/commit/8bdf49d1cb2a94b66f69fd1b77063a4113517fae)) by [@Mantisus](https://github.com/Mantisus), closes [#804](https://github.com/apify/crawlee-python/issues/804)
- [**breaking**] Refactor service usage to rely on `service_locator` ([#691](https://github.com/apify/crawlee-python/pull/691)) ([1d31c6c](https://github.com/apify/crawlee-python/commit/1d31c6c7e7a9ec7cee5b2de900568d9f77db65ba)) by [@vdusek](https://github.com/vdusek), closes [#369](https://github.com/apify/crawlee-python/issues/369), [#539](https://github.com/apify/crawlee-python/issues/539), [#699](https://github.com/apify/crawlee-python/issues/699)
- Pass `verify` in httpx client ([#802](https://github.com/apify/crawlee-python/pull/802)) ([074d083](https://github.com/apify/crawlee-python/commit/074d0836b55e52f13726e7cd1c21602623fda4fc)) by [@Mantisus](https://github.com/Mantisus), closes [#798](https://github.com/apify/crawlee-python/issues/798)
- Fix `page_options` for `PlaywrightBrowserPlugin` ([#796](https://github.com/apify/crawlee-python/pull/796)) ([bd3bdd4](https://github.com/apify/crawlee-python/commit/bd3bdd4046c2ddea62feb77322033cad50f382dd)) by [@Mantisus](https://github.com/Mantisus), closes [#755](https://github.com/apify/crawlee-python/issues/755)
- Fix event migrating handler in `RequestQueue` ([#825](https://github.com/apify/crawlee-python/pull/825)) ([fd6663f](https://github.com/apify/crawlee-python/commit/fd6663f903bc7eecd1000da89e06197b43dfb962)) by [@Mantisus](https://github.com/Mantisus), closes [#815](https://github.com/apify/crawlee-python/issues/815)
- Respect user configuration for work with status codes ([#812](https://github.com/apify/crawlee-python/pull/812)) ([8daf4bd](https://github.com/apify/crawlee-python/commit/8daf4bd49c1b09a0924f827daedebf7600ac609b)) by [@Mantisus](https://github.com/Mantisus), closes [#708](https://github.com/apify/crawlee-python/issues/708), [#756](https://github.com/apify/crawlee-python/issues/756)
- `abort-on-error` for successive runs ([#834](https://github.com/apify/crawlee-python/pull/834)) ([0cea673](https://github.com/apify/crawlee-python/commit/0cea67387bf366800b447de784af580159b199ee)) by [@Mantisus](https://github.com/Mantisus)
- Relax ServiceLocator restrictions ([#837](https://github.com/apify/crawlee-python/pull/837)) ([aa3667f](https://github.com/apify/crawlee-python/commit/aa3667f344d78945df3eca77431e1409f43f8bb5)) by [@janbuchar](https://github.com/janbuchar), closes [#806](https://github.com/apify/crawlee-python/issues/806)
- Fix typo in exports ([#841](https://github.com/apify/crawlee-python/pull/841)) ([8fa6ac9](https://github.com/apify/crawlee-python/commit/8fa6ac994fe4f3f6430cb796a0c6a732c93c672b)) by [@janbuchar](https://github.com/janbuchar)
### 🚜 Refactor
- [**breaking**] Refactor HttpCrawler, BeautifulSoupCrawler, ParselCrawler inheritance ([#746](https://github.com/apify/crawlee-python/pull/746)) ([9d3c269](https://github.com/apify/crawlee-python/commit/9d3c2697c91ce93028ca86a91d85d465d36c1ad7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#350](https://github.com/apify/crawlee-python/issues/350)
- [**breaking**] Remove `json_` and `order_no` from `Request` ([#788](https://github.com/apify/crawlee-python/pull/788)) ([5381d13](https://github.com/apify/crawlee-python/commit/5381d13aa51a757fc1906f400788555df090a1af)) by [@Mantisus](https://github.com/Mantisus), closes [#94](https://github.com/apify/crawlee-python/issues/94)
- [**breaking**] Rename PwPreNavContext to PwPreNavCrawlingContext ([#827](https://github.com/apify/crawlee-python/pull/827)) ([84b61a3](https://github.com/apify/crawlee-python/commit/84b61a3d25bee42faed4e81cd156663f251b3d3d)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Rename PlaywrightCrawler kwargs: browser_options, page_options ([#831](https://github.com/apify/crawlee-python/pull/831)) ([ffc6048](https://github.com/apify/crawlee-python/commit/ffc6048e9dc5c5e862271fa50c48bb0fb6f0a18f)) by [@Pijukatel](https://github.com/Pijukatel)
- [**breaking**] Update the crawlers & storage clients structure ([#828](https://github.com/apify/crawlee-python/pull/828)) ([0ba04d1](https://github.com/apify/crawlee-python/commit/0ba04d1633881043928a408678932c46fb90e21f)) by [@vdusek](https://github.com/vdusek), closes [#764](https://github.com/apify/crawlee-python/issues/764)
## [0.4.5](https://github.com/apify/crawlee-python/releases/tag/v0.4.5) (2024-12-06)
### 🚀 Features
- Improve project bootstrapping ([#538](https://github.com/apify/crawlee-python/pull/538)) ([367899c](https://github.com/apify/crawlee-python/commit/367899cbad5021674f6e41c4dd7eb2266fe043aa)) by [@janbuchar](https://github.com/janbuchar), closes [#317](https://github.com/apify/crawlee-python/issues/317), [#414](https://github.com/apify/crawlee-python/issues/414), [#495](https://github.com/apify/crawlee-python/issues/495), [#511](https://github.com/apify/crawlee-python/issues/511)
### 🐛 Bug Fixes
- Add upper bound of HTTPX version ([#775](https://github.com/apify/crawlee-python/pull/775)) ([b59e34d](https://github.com/apify/crawlee-python/commit/b59e34d6301e26825d88608152ffb337ef602a9f)) by [@vdusek](https://github.com/vdusek)
- Fix incorrect use of desired concurrency ratio ([#780](https://github.com/apify/crawlee-python/pull/780)) ([d1f8bfb](https://github.com/apify/crawlee-python/commit/d1f8bfb68ce2ef13b550ce415a3689858112a4c7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#759](https://github.com/apify/crawlee-python/issues/759)
- Remove pydantic constraint <2.10.0 and update timedelta validator, serializer type hints ([#757](https://github.com/apify/crawlee-python/pull/757)) ([c0050c0](https://github.com/apify/crawlee-python/commit/c0050c0ee76e5deb28f174ecf276b0e6abf68b9d)) by [@Pijukatel](https://github.com/Pijukatel)
## [0.4.4](https://github.com/apify/crawlee-python/releases/tag/v0.4.4) (2024-11-29)
### 🚀 Features
- Expose browser_options and page_options to PlaywrightCrawler ([#730](https://github.com/apify/crawlee-python/pull/730)) ([dbe85b9](https://github.com/apify/crawlee-python/commit/dbe85b90e59def281cfc6617a0eb869a4adf2fc0)) by [@vdusek](https://github.com/vdusek), closes [#719](https://github.com/apify/crawlee-python/issues/719)
- Add `abort_on_error` property ([#731](https://github.com/apify/crawlee-python/pull/731)) ([6dae03a](https://github.com/apify/crawlee-python/commit/6dae03a68a2d23c68c78d8d44611d43e40eb9404)) by [@Mantisus](https://github.com/Mantisus), closes [#704](https://github.com/apify/crawlee-python/issues/704)
### 🐛 Bug Fixes
- Fix init of context managers and context handling in `BasicCrawler` ([#714](https://github.com/apify/crawlee-python/pull/714)) ([486fe6d](https://github.com/apify/crawlee-python/commit/486fe6d6cd56cb560ab51a32ec0286d9e32267cb)) by [@vdusek](https://github.com/vdusek)
## [0.4.3](https://github.com/apify/crawlee-python/releases/tag/v0.4.3) (2024-11-21)
### 🐛 Bug Fixes
- Pydantic 2.10.0 issues ([#716](https://github.com/apify/crawlee-python/pull/716)) ([8d8b3fc](https://github.com/apify/crawlee-python/commit/8d8b3fcff8be10edf5351f5324c7ba112c1d2ba0)) by [@Pijukatel](https://github.com/Pijukatel)
## [0.4.2](https://github.com/apify/crawlee-python/releases/tag/v0.4.2) (2024-11-20)
### 🐛 Bug Fixes
- Respect custom HTTP headers in `PlaywrightCrawler` ([#685](https://github.com/apify/crawlee-python/pull/685)) ([a84125f](https://github.com/apify/crawlee-python/commit/a84125f031347426de44b8f015c87882c8f96f72)) by [@Mantisus](https://github.com/Mantisus)
- Fix serialization payload in Request. Fix Docs for Post Request ([#683](https://github.com/apify/crawlee-python/pull/683)) ([e8b4d2d](https://github.com/apify/crawlee-python/commit/e8b4d2d4989fd9967403b828c914cb7ae2ef9b8b)) by [@Mantisus](https://github.com/Mantisus), closes [#668](https://github.com/apify/crawlee-python/issues/668)
- Accept string payload in the Request constructor ([#697](https://github.com/apify/crawlee-python/pull/697)) ([19f5add](https://github.com/apify/crawlee-python/commit/19f5addc0223d68389eea47864830c709335ab6e)) by [@vdusek](https://github.com/vdusek)
- Fix snapshots handling ([#692](https://github.com/apify/crawlee-python/pull/692)) ([4016c0d](https://github.com/apify/crawlee-python/commit/4016c0d8121a8950ab1df22188eac838a011c39f)) by [@Pijukatel](https://github.com/Pijukatel)
## [0.4.1](https://github.com/apify/crawlee-python/releases/tag/v0.4.1) (2024-11-11)
### 🚀 Features
- Add `max_crawl_depth` option to `BasicCrawler` ([#637](https://github.com/apify/crawlee-python/pull/637)) ([77deaa9](https://github.com/apify/crawlee-python/commit/77deaa964e2c1e74af1c5117a13d8d8257f0e27e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#460](https://github.com/apify/crawlee-python/issues/460)
- Add BeautifulSoupParser type alias ([#674](https://github.com/apify/crawlee-python/pull/674)) ([b2cf88f](https://github.com/apify/crawlee-python/commit/b2cf88ffea8d75808c9210850a03fcc70b0b9e3d)) by [@Pijukatel](https://github.com/Pijukatel)
### 🐛 Bug Fixes
- Fix total_size usage in memory size monitoring ([#661](https://github.com/apify/crawlee-python/pull/661)) ([c2a3239](https://github.com/apify/crawlee-python/commit/c2a32397eecd5cc7f412c2af7269b004a8b2eaf2)) by [@janbuchar](https://github.com/janbuchar)
- Add HttpHeaders to module exports ([#664](https://github.com/apify/crawlee-python/pull/664)) ([f0c5ca7](https://github.com/apify/crawlee-python/commit/f0c5ca717d9f9e304d375da2c23552c26ca870da)) by [@vdusek](https://github.com/vdusek), closes [#663](https://github.com/apify/crawlee-python/issues/663)
- Fix unhandled ValueError in request handler result processing ([#666](https://github.com/apify/crawlee-python/pull/666)) ([0a99d7f](https://github.com/apify/crawlee-python/commit/0a99d7f693245eb9a065016fb6f2d268f6956805)) by [@janbuchar](https://github.com/janbuchar)
- Fix BaseDatasetClient.iter_items type hints ([#680](https://github.com/apify/crawlee-python/pull/680)) ([a968b1b](https://github.com/apify/crawlee-python/commit/a968b1be6fceb56676b0198a044c8fceac7c92a6)) by [@Pijukatel](https://github.com/Pijukatel)
## [0.4.0](https://github.com/apify/crawlee-python/releases/tag/v0.4.0) (2024-11-01)
- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v04) to ensure a smooth update.
### 🚀 Features
- [**breaking**] Add headers in unique key computation ([#609](https://github.com/apify/crawlee-python/pull/609)) ([6c4746f](https://github.com/apify/crawlee-python/commit/6c4746fa8ff86952a812b32a1d70dc910e76b43e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#548](https://github.com/apify/crawlee-python/issues/548)
- Add `pre_navigation_hooks` to `PlaywrightCrawler` ([#631](https://github.com/apify/crawlee-python/pull/631)) ([5dd5b60](https://github.com/apify/crawlee-python/commit/5dd5b60e2a44d5bd3748b613790e1bee3232d6f3)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#427](https://github.com/apify/crawlee-python/issues/427)
- Add `always_enqueue` option to bypass URL deduplication ([#621](https://github.com/apify/crawlee-python/pull/621)) ([4e59fa4](https://github.com/apify/crawlee-python/commit/4e59fa46daaec05e52262cf62c26f28ddcd772af)) by [@Rutam21](https://github.com/Rutam21), closes [#547](https://github.com/apify/crawlee-python/issues/547)
- Split and add extra configuration to export_data method ([#580](https://github.com/apify/crawlee-python/pull/580)) ([6751635](https://github.com/apify/crawlee-python/commit/6751635e1785a4a27f60092c82f5dd0c40193d52)) by [@deshansh](https://github.com/deshansh), closes [#526](https://github.com/apify/crawlee-python/issues/526)
### 🐛 Bug Fixes
- Use strip in headers normalization ([#614](https://github.com/apify/crawlee-python/pull/614)) ([a15b21e](https://github.com/apify/crawlee-python/commit/a15b21e51deaf2b67738f95bc2b15c1c16d1775f)) by [@vdusek](https://github.com/vdusek)
- [**breaking**] Merge payload and data fields of Request ([#542](https://github.com/apify/crawlee-python/pull/542)) ([d06fcef](https://github.com/apify/crawlee-python/commit/d06fcef3fee44616ded5f587b9c7313b82a57cc7)) by [@vdusek](https://github.com/vdusek), closes [#560](https://github.com/apify/crawlee-python/issues/560)
- Default ProxyInfo port if httpx.URL port is None ([#619](https://github.com/apify/crawlee-python/pull/619)) ([8107a6f](https://github.com/apify/crawlee-python/commit/8107a6f97e8f16a330e7d02d3fc6ea34c5f78d77)) by [@steffansafey](https://github.com/steffansafey), closes [#618](https://github.com/apify/crawlee-python/issues/618)
### ⚙️ Miscellaneous Tasks
- [**breaking**] Remove Request.query_params field ([#639](https://github.com/apify/crawlee-python/pull/639)) ([6ec0ec4](https://github.com/apify/crawlee-python/commit/6ec0ec4fa0cef9b8bf893e70d99f068675c9c54c)) by [@vdusek](https://github.com/vdusek), closes [#615](https://github.com/apify/crawlee-python/issues/615)
## [0.3.9](https://github.com/apify/crawlee-python/releases/tag/v0.3.9) (2024-10-23)
### 🚀 Features
- Key-value store context helpers ([#584](https://github.com/apify/crawlee-python/pull/584)) ([fc15622](https://github.com/apify/crawlee-python/commit/fc156222c3747fc4cc7bd7666a21769845c7d0d5)) by [@janbuchar](https://github.com/janbuchar)
- Added get_public_url method to KeyValueStore ([#572](https://github.com/apify/crawlee-python/pull/572)) ([3a4ba8f](https://github.com/apify/crawlee-python/commit/3a4ba8f459903b6288aec40de2c3ca862e36abec)) by [@akshay11298](https://github.com/akshay11298), closes [#514](https://github.com/apify/crawlee-python/issues/514)
### 🐛 Bug Fixes
- Workaround for JSON value typing problems ([#581](https://github.com/apify/crawlee-python/pull/581)) ([403496a](https://github.com/apify/crawlee-python/commit/403496a53c12810351139a6e073238143ecc5930)) by [@janbuchar](https://github.com/janbuchar), closes [#563](https://github.com/apify/crawlee-python/issues/563)
## [0.3.8](https://github.com/apify/crawlee-python/releases/tag/v0.3.8) (2024-10-02)
### 🚀 Features
- Mask Playwright's "headless" headers ([#545](https://github.com/apify/crawlee-python/pull/545)) ([d1445e4](https://github.com/apify/crawlee-python/commit/d1445e4858fd804bb4a2e35efa1d2f5254d8df6b)) by [@vdusek](https://github.com/vdusek), closes [#401](https://github.com/apify/crawlee-python/issues/401)
- Add new model for `HttpHeaders` ([#544](https://github.com/apify/crawlee-python/pull/544)) ([854f2c1](https://github.com/apify/crawlee-python/commit/854f2c1e2e09cf398e04b1e153534282add1247e)) by [@vdusek](https://github.com/vdusek)
### 🐛 Bug Fixes
- Call `error_handler` for `SessionError` ([#557](https://github.com/apify/crawlee-python/pull/557)) ([e75ac4b](https://github.com/apify/crawlee-python/commit/e75ac4b70cd48a4ca9f8245cea3c5f3c188b8824)) by [@vdusek](https://github.com/vdusek), closes [#546](https://github.com/apify/crawlee-python/issues/546)
- Extend from `StrEnum` in `RequestState` to fix serialization ([#556](https://github.com/apify/crawlee-python/pull/556)) ([6bf35ba](https://github.com/apify/crawlee-python/commit/6bf35ba4a6913819706ebd1d2c1156a4c62f944e)) by [@vdusek](https://github.com/vdusek), closes [#551](https://github.com/apify/crawlee-python/issues/551)
- Add equality check to UserData model ([#562](https://github.com/apify/crawlee-python/pull/562)) ([899a25c](https://github.com/apify/crawlee-python/commit/899a25ca63f570b3c4d8d56c85a838b371fd3924)) by [@janbuchar](https://github.com/janbuchar)
## [0.3.7](https://github.com/apify/crawlee-python/releases/tag/v0.3.7) (2024-09-25)
### 🐛 Bug Fixes
- Improve `Request.user_data` serialization ([#540](https://github.com/apify/crawlee-python/pull/540)) ([de29c0e](https://github.com/apify/crawlee-python/commit/de29c0e6b737a9d2544c5382472618dde76eb2a5)) by [@janbuchar](https://github.com/janbuchar), closes [#524](https://github.com/apify/crawlee-python/issues/524)
- Adopt new version of curl-cffi ([#543](https://github.com/apify/crawlee-python/pull/543)) ([f6fcf48](https://github.com/apify/crawlee-python/commit/f6fcf48d99bfcb4b8e75c5c9c38dc8c265164a10)) by [@vdusek](https://github.com/vdusek)
## [0.3.6](https://github.com/apify/crawlee-python/releases/tag/v0.3.6) (2024-09-19)
### 🚀 Features
- Add HTTP/2 support for HTTPX client ([#513](https://github.com/apify/crawlee-python/pull/513)) ([0eb0a33](https://github.com/apify/crawlee-python/commit/0eb0a33411096011198e52c393f35730f1a0b6ac)) by [@vdusek](https://github.com/vdusek), closes [#512](https://github.com/apify/crawlee-python/issues/512)
- Expose extended unique key when creating a new Request ([#515](https://github.com/apify/crawlee-python/pull/515)) ([1807f41](https://github.com/apify/crawlee-python/commit/1807f419e47a815dd706d09acb0f3b3af8cfc691)) by [@vdusek](https://github.com/vdusek)
- Add header generator and integrate it into HTTPX client ([#530](https://github.com/apify/crawlee-python/pull/530)) ([b63f9f9](https://github.com/apify/crawlee-python/commit/b63f9f98c6613e095546ef544eab271d433e3379)) by [@vdusek](https://github.com/vdusek), closes [#402](https://github.com/apify/crawlee-python/issues/402)
### 🐛 Bug Fixes
- Use explicitly UTF-8 encoding in local storage ([#533](https://github.com/apify/crawlee-python/pull/533)) ([a3a0ab2](https://github.com/apify/crawlee-python/commit/a3a0ab2f6809b7a06319a77dfbf289df78638dea)) by [@vdusek](https://github.com/vdusek), closes [#532](https://github.com/apify/crawlee-python/issues/532)
## [0.3.5](https://github.com/apify/crawlee-python/releases/tag/v0.3.5) (2024-09-10)
### 🚀 Features
- Memory usage limit configuration via environment variables ([#502](https://github.com/apify/crawlee-python/pull/502)) ([c62e554](https://github.com/apify/crawlee-python/commit/c62e5545de6a1836f0514ebd3dd695e4fd856844)) by [@janbuchar](https://github.com/janbuchar)
### 🐛 Bug Fixes
- HTTP clients detect 4xx as errors by default ([#498](https://github.com/apify/crawlee-python/pull/498)) ([1895dca](https://github.com/apify/crawlee-python/commit/1895dca538f415feca37b4a030525c7c0d32f114)) by [@vdusek](https://github.com/vdusek), closes [#496](https://github.com/apify/crawlee-python/issues/496)
- Correctly handle log level configuration ([#508](https://github.com/apify/crawlee-python/pull/508)) ([7ea8fe6](https://github.com/apify/crawlee-python/commit/7ea8fe69f4a6146a1e417bebff60c08a85e2ca27)) by [@janbuchar](https://github.com/janbuchar)
## [0.3.4](https://github.com/apify/crawlee-python/releases/tag/v0.3.4) (2024-09-05)
### 🐛 Bug Fixes
- Expose basic crawling context ([#501](https://github.com/apify/crawlee-python/pull/501)) ([b484535](https://github.com/apify/crawlee-python/commit/b484535dbacc5d206a026f55a1d3e58edd375e91)) by [@vdusek](https://github.com/vdusek)
## [0.3.3](https://github.com/apify/crawlee-python/releases/tag/v0.3.3) (2024-09-05)
### 🐛 Bug Fixes
- Deduplicate requests by unique key before submitting them to the queue ([#499](https://github.com/apify/crawlee-python/pull/499)) ([6a3e0e7](https://github.com/apify/crawlee-python/commit/6a3e0e78490851c43cefb0497ce34ca52a31a25c)) by [@janbuchar](https://github.com/janbuchar)
## [0.3.2](https://github.com/apify/crawlee-python/releases/tag/v0.3.2) (2024-09-02)
### 🐛 Bug Fixes
- Double incrementation of `item_count` ([#443](https://github.com/apify/crawlee-python/pull/443)) ([cd9adf1](https://github.com/apify/crawlee-python/commit/cd9adf15731e8c4a39cb142b6d1a62909cafdc51)) by [@cadlagtrader](https://github.com/cadlagtrader), closes [#442](https://github.com/apify/crawlee-python/issues/442)
- Field alias in `BatchRequestsOperationResponse` ([#485](https://github.com/apify/crawlee-python/pull/485)) ([126a862](https://github.com/apify/crawlee-python/commit/126a8629cb5b989a0f9fe22156fb09731a34acd2)) by [@janbuchar](https://github.com/janbuchar)
- JSON handling with Parsel ([#490](https://github.com/apify/crawlee-python/pull/490)) ([ebf5755](https://github.com/apify/crawlee-python/commit/ebf575539ffb631ae131a1b801cec8f21dd0cf4c)) by [@janbuchar](https://github.com/janbuchar), closes [#488](https://github.com/apify/crawlee-python/issues/488)
## [0.3.1](https://github.com/apify/crawlee-python/releases/tag/v0.3.1) (2024-08-30)
### 🚀 Features
- Curl http client selects chrome impersonation by default ([#473](https://github.com/apify/crawlee-python/pull/473)) ([82dc939](https://github.com/apify/crawlee-python/commit/82dc93957b1a380ea975564dea5c6ba4639be548)) by [@vdusek](https://github.com/vdusek)
## [0.3.0](https://github.com/apify/crawlee-python/releases/tag/v0.3.0) (2024-08-27)
- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v03) to ensure a smooth update.
### 🚀 Features
- Implement ParselCrawler that adds support for Parsel ([#348](https://github.com/apify/crawlee-python/pull/348)) ([a3832e5](https://github.com/apify/crawlee-python/commit/a3832e527f022f32cce4a80055da3b7967b74522)) by [@asymness](https://github.com/asymness), closes [#335](https://github.com/apify/crawlee-python/issues/335)
- Add support for filling a web form ([#453](https://github.com/apify/crawlee-python/pull/453)) ([5a125b4](https://github.com/apify/crawlee-python/commit/5a125b464b2619000b92dacad4c3a7faa1869f29)) by [@vdusek](https://github.com/vdusek), closes [#305](https://github.com/apify/crawlee-python/issues/305)
### 🐛 Bug Fixes
- Remove indentation from statistics logging and print the data in tables ([#322](https://github.com/apify/crawlee-python/pull/322)) ([359b515](https://github.com/apify/crawlee-python/commit/359b515d647f064886f91441c2c01d3099e21035)) by [@TymeeK](https://github.com/TymeeK), closes [#306](https://github.com/apify/crawlee-python/issues/306)
- Remove redundant log, fix format ([#408](https://github.com/apify/crawlee-python/pull/408)) ([8d27e39](https://github.com/apify/crawlee-python/commit/8d27e3928c605d6eceb51a948453a15024fa2aa2)) by [@janbuchar](https://github.com/janbuchar)
- Dequeue items from RequestQueue in the correct order ([#411](https://github.com/apify/crawlee-python/pull/411)) ([96fc33e](https://github.com/apify/crawlee-python/commit/96fc33e2cc4631cae3c50dad9eace6407103a2a9)) by [@janbuchar](https://github.com/janbuchar)
- Relative URLS supports & If not a URL, pass #417 ([#431](https://github.com/apify/crawlee-python/pull/431)) ([ccd8145](https://github.com/apify/crawlee-python/commit/ccd81454166ece68391cdffedb8efe9e663361d9)) by [@black7375](https://github.com/black7375), closes [#417](https://github.com/apify/crawlee-python/issues/417)
- Typo in ProlongRequestLockResponse ([#458](https://github.com/apify/crawlee-python/pull/458)) ([30ccc3a](https://github.com/apify/crawlee-python/commit/30ccc3a4763bc3706a3bbeaedc95f9648f5ba09a)) by [@janbuchar](https://github.com/janbuchar)
- Add missing __all__ to top-level __init__.py file ([#463](https://github.com/apify/crawlee-python/pull/463)) ([353a1ce](https://github.com/apify/crawlee-python/commit/353a1ce28cd38c97ffb36dc1e6b0e86d3aef1a48)) by [@janbuchar](https://github.com/janbuchar)
### 🚜 Refactor
- [**breaking**] RequestQueue and service management rehaul ([#429](https://github.com/apify/crawlee-python/pull/429)) ([b155a9f](https://github.com/apify/crawlee-python/commit/b155a9f602a163e891777bef5608072fb5d0156f)) by [@janbuchar](https://github.com/janbuchar), closes [#83](https://github.com/apify/crawlee-python/issues/83), [#174](https://github.com/apify/crawlee-python/issues/174), [#203](https://github.com/apify/crawlee-python/issues/203), [#423](https://github.com/apify/crawlee-python/issues/423)
- [**breaking**] Declare private and public interface ([#456](https://github.com/apify/crawlee-python/pull/456)) ([d6738df](https://github.com/apify/crawlee-python/commit/d6738df30586934e8d1aba50b9cd437a0ea40400)) by [@vdusek](https://github.com/vdusek)
## [0.2.1](https://github.com/apify/crawlee-python/releases/tag/v0.2.1) (2024-08-05)
### 🐛 Bug Fixes
- Do not import curl impersonate in http clients init ([#396](https://github.com/apify/crawlee-python/pull/396)) ([3bb8009](https://github.com/apify/crawlee-python/commit/3bb80093e61c1615f869ecd5ab80b061e0e5db36)) by [@vdusek](https://github.com/vdusek)
## [0.2.0](https://github.com/apify/crawlee-python/releases/tag/v0.2.0) (2024-08-05)
### 🚀 Features
- Add new curl impersonate HTTP client ([#387](https://github.com/apify/crawlee-python/pull/387)) ([9c06260](https://github.com/apify/crawlee-python/commit/9c06260c0ee958522caa9322001a3186e9e43af4)) by [@vdusek](https://github.com/vdusek), closes [#292](https://github.com/apify/crawlee-python/issues/292)
- **playwright:** `infinite_scroll` helper ([#393](https://github.com/apify/crawlee-python/pull/393)) ([34f74bd](https://github.com/apify/crawlee-python/commit/34f74bdcffb42a6c876a856e1c89923d9b3e60bd)) by [@janbuchar](https://github.com/janbuchar)
## [0.1.2](https://github.com/apify/crawlee-python/releases/tag/v0.1.2) (2024-07-30)
### 🚀 Features
- Add URL validation ([#343](https://github.com/apify/crawlee-python/pull/343)) ([1514538](https://github.com/apify/crawlee-python/commit/15145388009c85ab54dc72ea8f2d07efd78f80fd)) by [@vdusek](https://github.com/vdusek), closes [#300](https://github.com/apify/crawlee-python/issues/300)
### 🐛 Bug Fixes
- Minor log fix ([#341](https://github.com/apify/crawlee-python/pull/341)) ([0688bf1](https://github.com/apify/crawlee-python/commit/0688bf1860534ab6b2a85dc850bf3d56507ab154)) by [@souravjain540](https://github.com/souravjain540)
- Also use error_handler for context pipeline errors ([#331](https://github.com/apify/crawlee-python/pull/331)) ([7a66445](https://github.com/apify/crawlee-python/commit/7a664456b45c7e429b4c90aaf1c09d5796b93e3d)) by [@janbuchar](https://github.com/janbuchar), closes [#296](https://github.com/apify/crawlee-python/issues/296)
- Strip whitespace from href in enqueue_links ([#346](https://github.com/apify/crawlee-python/pull/346)) ([8a3174a](https://github.com/apify/crawlee-python/commit/8a3174aed24f9eb4f9ac415a79a58685a081cde2)) by [@janbuchar](https://github.com/janbuchar), closes [#337](https://github.com/apify/crawlee-python/issues/337)
- Warn instead of crashing when an empty dataset is being exported ([#342](https://github.com/apify/crawlee-python/pull/342)) ([22b95d1](https://github.com/apify/crawlee-python/commit/22b95d1948d4acd23a010898fa6af2f491e7f514)) by [@janbuchar](https://github.com/janbuchar), closes [#334](https://github.com/apify/crawlee-python/issues/334)
- Avoid Github rate limiting in project bootstrapping test ([#364](https://github.com/apify/crawlee-python/pull/364)) ([992f07f](https://github.com/apify/crawlee-python/commit/992f07f266f7b8433d99e9a179f277995f81eb17)) by [@janbuchar](https://github.com/janbuchar)
- Pass crawler configuration to storages ([#375](https://github.com/apify/crawlee-python/pull/375)) ([b2d3a52](https://github.com/apify/crawlee-python/commit/b2d3a52712abe21f4a4a5db4e20c80afe72c27de)) by [@janbuchar](https://github.com/janbuchar)
- Purge request queue on repeated crawler runs ([#377](https://github.com/apify/crawlee-python/pull/377)) ([7ad3d69](https://github.com/apify/crawlee-python/commit/7ad3d6908e153c590bff72478af7ee3239a249bc)) by [@janbuchar](https://github.com/janbuchar), closes [#152](https://github.com/apify/crawlee-python/issues/152)
## [0.1.1](https://github.com/apify/crawlee-python/releases/tag/v0.1.1) (2024-07-19)
### 🚀 Features
- Expose crawler log ([#316](https://github.com/apify/crawlee-python/pull/316)) ([ae475fa](https://github.com/apify/crawlee-python/commit/ae475fa450c4fe053620d7b7eb475f3d58804674)) by [@vdusek](https://github.com/vdusek), closes [#303](https://github.com/apify/crawlee-python/issues/303)
- Integrate proxies into `PlaywrightCrawler` ([#325](https://github.com/apify/crawlee-python/pull/325)) ([2e072b6](https://github.com/apify/crawlee-python/commit/2e072b6ad7d5d82d96a7b489cafb87e7bfaf6e83)) by [@vdusek](https://github.com/vdusek)
- Blocking detection for playwright crawler ([#328](https://github.com/apify/crawlee-python/pull/328)) ([49ff6e2](https://github.com/apify/crawlee-python/commit/49ff6e25c12a97550eee718d64bb4130f9990189)) by [@vdusek](https://github.com/vdusek), closes [#239](https://github.com/apify/crawlee-python/issues/239)
### 🐛 Bug Fixes
- Pylance reportPrivateImportUsage errors ([#313](https://github.com/apify/crawlee-python/pull/313)) ([09d7203](https://github.com/apify/crawlee-python/commit/09d72034d5db8c47f461111ec093761935a3e2ef)) by [@vdusek](https://github.com/vdusek), closes [#283](https://github.com/apify/crawlee-python/issues/283)
- Set httpx logging to warning ([#314](https://github.com/apify/crawlee-python/pull/314)) ([1585def](https://github.com/apify/crawlee-python/commit/1585defffb2c0c844fab39bbc0e0b793d6169cbf)) by [@vdusek](https://github.com/vdusek), closes [#302](https://github.com/apify/crawlee-python/issues/302)
- Byte size serialization in MemoryInfo ([#245](https://github.com/apify/crawlee-python/pull/245)) ([a030174](https://github.com/apify/crawlee-python/commit/a0301746c2df076d281708344fb906e1c42e0790)) by [@janbuchar](https://github.com/janbuchar)
- Project bootstrapping in existing folder ([#318](https://github.com/apify/crawlee-python/pull/318)) ([c630818](https://github.com/apify/crawlee-python/commit/c630818538e0c37217ab73f6c6da05505ed8b364)) by [@janbuchar](https://github.com/janbuchar), closes [#301](https://github.com/apify/crawlee-python/issues/301)
## [0.1.0](https://github.com/apify/crawlee-python/releases/tag/v0.1.0) (2024-07-08)
### 🚀 Features
- Project templates ([#237](https://github.com/apify/crawlee-python/pull/237)) ([c23c12c](https://github.com/apify/crawlee-python/commit/c23c12c66688f825f74deb39702f07cc6c6bbc46)) by [@janbuchar](https://github.com/janbuchar), closes [#215](https://github.com/apify/crawlee-python/issues/215)
### 🐛 Bug Fixes
- CLI UX improvements ([#271](https://github.com/apify/crawlee-python/pull/271)) ([123d515](https://github.com/apify/crawlee-python/commit/123d515b224c663577bfe0fab387d0aa11e5e4d4)) by [@janbuchar](https://github.com/janbuchar), closes [#267](https://github.com/apify/crawlee-python/issues/267)
- Error handling in CLI and templates documentation ([#273](https://github.com/apify/crawlee-python/pull/273)) ([61083c3](https://github.com/apify/crawlee-python/commit/61083c33434d431a118538f15bfa9a68c312ab03)) by [@vdusek](https://github.com/vdusek), closes [#268](https://github.com/apify/crawlee-python/issues/268)
## [0.0.7](https://github.com/apify/crawlee-python/releases/tag/v0.0.7) (2024-06-27)
### 🐛 Bug Fixes
- Do not wait for consistency in request queue ([#235](https://github.com/apify/crawlee-python/pull/235)) ([03ff138](https://github.com/apify/crawlee-python/commit/03ff138aadaf8e915abc7fafb854fe12947b9696)) by [@vdusek](https://github.com/vdusek)
- Selector handling in BeautifulSoupCrawler enqueue_links ([#231](https://github.com/apify/crawlee-python/pull/231)) ([896501e](https://github.com/apify/crawlee-python/commit/896501edb44f801409fec95cb3e5f2bcfcb4188d)) by [@janbuchar](https://github.com/janbuchar), closes [#230](https://github.com/apify/crawlee-python/issues/230)
- Handle blocked request ([#234](https://github.com/apify/crawlee-python/pull/234)) ([f8ef79f](https://github.com/apify/crawlee-python/commit/f8ef79ffcb7410713182af716d37dbbaad66fdbc)) by [@Mantisus](https://github.com/Mantisus)
- Improve AutoscaledPool state management ([#241](https://github.com/apify/crawlee-python/pull/241)) ([fdea3d1](https://github.com/apify/crawlee-python/commit/fdea3d16b13afe70039d864de861486c760aa0ba)) by [@janbuchar](https://github.com/janbuchar), closes [#236](https://github.com/apify/crawlee-python/issues/236)
## [0.0.6](https://github.com/apify/crawlee-python/releases/tag/v0.0.6) (2024-06-25)
### 🚀 Features
- Maintain a global configuration instance ([#207](https://github.com/apify/crawlee-python/pull/207)) ([e003aa6](https://github.com/apify/crawlee-python/commit/e003aa63d859bec8199d0c890b5c9604f163ccd3)) by [@janbuchar](https://github.com/janbuchar)
- Add max requests per crawl to `BasicCrawler` ([#198](https://github.com/apify/crawlee-python/pull/198)) ([b5b3053](https://github.com/apify/crawlee-python/commit/b5b3053f43381601274e4034d07b4bf41720c7c2)) by [@vdusek](https://github.com/vdusek)
- Add support for decompressing *br* response content ([#226](https://github.com/apify/crawlee-python/pull/226)) ([a3547b9](https://github.com/apify/crawlee-python/commit/a3547b9c882dc5333a4fcd1223687ef85e79138d)) by [@Mantisus](https://github.com/Mantisus)
- BasicCrawler.export_data helper ([#222](https://github.com/apify/crawlee-python/pull/222)) ([237ec78](https://github.com/apify/crawlee-python/commit/237ec789b7dccc17cc57ef47ec56bcf73c6ca006)) by [@janbuchar](https://github.com/janbuchar), closes [#211](https://github.com/apify/crawlee-python/issues/211)
- Automatic logging setup ([#229](https://github.com/apify/crawlee-python/pull/229)) ([a67b72f](https://github.com/apify/crawlee-python/commit/a67b72faacd75674071bae496d59e1c60636350c)) by [@janbuchar](https://github.com/janbuchar), closes [#214](https://github.com/apify/crawlee-python/issues/214)
### 🐛 Bug Fixes
- Handling of relative URLs in add_requests ([#213](https://github.com/apify/crawlee-python/pull/213)) ([8aa8c57](https://github.com/apify/crawlee-python/commit/8aa8c57f44149caa0e01950a5d773726f261699a)) by [@janbuchar](https://github.com/janbuchar), closes [#202](https://github.com/apify/crawlee-python/issues/202), [#204](https://github.com/apify/crawlee-python/issues/204)
- Graceful exit in BasicCrawler.run ([#224](https://github.com/apify/crawlee-python/pull/224)) ([337286e](https://github.com/apify/crawlee-python/commit/337286e1b721cf61f57bc0ff3ead08df1f4f5448)) by [@janbuchar](https://github.com/janbuchar), closes [#212](https://github.com/apify/crawlee-python/issues/212)
## [0.0.5](https://github.com/apify/crawlee-python/releases/tag/v0.0.5) (2024-06-21)
### 🚀 Features
- Browser rotation and better browser abstraction ([#177](https://github.com/apify/crawlee-python/pull/177)) ([a42ae6f](https://github.com/apify/crawlee-python/commit/a42ae6f53c5e24678f04011c3684290b68684016)) by [@vdusek](https://github.com/vdusek), closes [#131](https://github.com/apify/crawlee-python/issues/131)
- Add emit persist state event to event manager ([#181](https://github.com/apify/crawlee-python/pull/181)) ([97f6c68](https://github.com/apify/crawlee-python/commit/97f6c68275b65f76c62b6d16d94354fc7f00d336)) by [@vdusek](https://github.com/vdusek)
- Batched request addition in RequestQueue ([#186](https://github.com/apify/crawlee-python/pull/186)) ([f48c806](https://github.com/apify/crawlee-python/commit/f48c8068fe16ce3dd4c46fc248733346c0621411)) by [@vdusek](https://github.com/vdusek)
- Add storage helpers to crawler & context ([#192](https://github.com/apify/crawlee-python/pull/192)) ([f8f4066](https://github.com/apify/crawlee-python/commit/f8f4066d8b32d6e7dc0d999a5aa8db75f99b43b8)) by [@vdusek](https://github.com/vdusek), closes [#98](https://github.com/apify/crawlee-python/issues/98), [#100](https://github.com/apify/crawlee-python/issues/100), [#172](https://github.com/apify/crawlee-python/issues/172)
- Handle all supported configuration options ([#199](https://github.com/apify/crawlee-python/pull/199)) ([23c901c](https://github.com/apify/crawlee-python/commit/23c901cd68cf14b4041ee03568622ee32822e94b)) by [@janbuchar](https://github.com/janbuchar), closes [#84](https://github.com/apify/crawlee-python/issues/84)
- Add Playwright's enqueue links helper ([#196](https://github.com/apify/crawlee-python/pull/196)) ([849d73c](https://github.com/apify/crawlee-python/commit/849d73cc7d137171b98f9f2ab85374e8beec0dad)) by [@vdusek](https://github.com/vdusek)
### 🐛 Bug Fixes
- Tmp path in tests is working ([#164](https://github.com/apify/crawlee-python/pull/164)) ([382b6f4](https://github.com/apify/crawlee-python/commit/382b6f48174bdac3931cc379eaf770ab06f826dc)) by [@vdusek](https://github.com/vdusek), closes [#159](https://github.com/apify/crawlee-python/issues/159)
- Add explicit error messages for missing package extras during import ([#165](https://github.com/apify/crawlee-python/pull/165)) ([200ebfa](https://github.com/apify/crawlee-python/commit/200ebfa63d6e20e17c8ca29544ef7229ed0df308)) by [@vdusek](https://github.com/vdusek), closes [#155](https://github.com/apify/crawlee-python/issues/155)
- Make timedelta_ms accept string-encoded numbers ([#190](https://github.com/apify/crawlee-python/pull/190)) ([d8426ff](https://github.com/apify/crawlee-python/commit/d8426ff41e36f701af459ad17552fee39637674d)) by [@janbuchar](https://github.com/janbuchar)
- **deps:** Update dependency psutil to v6 ([#193](https://github.com/apify/crawlee-python/pull/193)) ([eb91f51](https://github.com/apify/crawlee-python/commit/eb91f51e19da406e3f9293e5336c1f85fc7885a4)) by [@renovate[bot]](https://github.com/renovate[bot])
- Improve compatibility between ProxyConfiguration and its SDK counterpart ([#201](https://github.com/apify/crawlee-python/pull/201)) ([1a76124](https://github.com/apify/crawlee-python/commit/1a76124080d561e0153a4dda0bdb0d9863c3aab6)) by [@janbuchar](https://github.com/janbuchar)
- Correct return type of storage get_info methods ([#200](https://github.com/apify/crawlee-python/pull/200)) ([332673c](https://github.com/apify/crawlee-python/commit/332673c4fb519b80846df7fb8cd8bb521538a8a4)) by [@janbuchar](https://github.com/janbuchar)
- Type error in statistics persist state ([#206](https://github.com/apify/crawlee-python/pull/206)) ([96ceef6](https://github.com/apify/crawlee-python/commit/96ceef697769cd57bd1a50b6615cf1e70549bd2d)) by [@vdusek](https://github.com/vdusek), closes [#194](https://github.com/apify/crawlee-python/issues/194)
## [0.0.4](https://github.com/apify/crawlee-python/releases/tag/v0.0.4) (2024-05-30)
### 🚀 Features
- Capture statistics about the crawler run ([#142](https://github.com/apify/crawlee-python/pull/142)) ([eeebe9b](https://github.com/apify/crawlee-python/commit/eeebe9b1e24338d68a0a55228bbfc717f4d9d295)) by [@janbuchar](https://github.com/janbuchar), closes [#97](https://github.com/apify/crawlee-python/issues/97)
- Proxy configuration ([#156](https://github.com/apify/crawlee-python/pull/156)) ([5c3753a](https://github.com/apify/crawlee-python/commit/5c3753a5527b1d01f7260b9e4c566e43f956a5e8)) by [@janbuchar](https://github.com/janbuchar), closes [#136](https://github.com/apify/crawlee-python/issues/136)
- Add first version of browser pool and playwright crawler ([#161](https://github.com/apify/crawlee-python/pull/161)) ([2d2a050](https://github.com/apify/crawlee-python/commit/2d2a0505b1c2b1529a8835163ca97d1ec2a6e44a)) by [@vdusek](https://github.com/vdusek)
## [0.0.3](https://github.com/apify/crawlee-python/releases/tag/v0.0.3) (2024-05-13)
### 🚀 Features
- AutoscaledPool implementation ([#55](https://github.com/apify/crawlee-python/pull/55)) ([621ada2](https://github.com/apify/crawlee-python/commit/621ada2bd1ba4e2346fb948dc02686e2b37e3856)) by [@janbuchar](https://github.com/janbuchar), closes [#19](https://github.com/apify/crawlee-python/issues/19)
- Add Snapshotter ([#20](https://github.com/apify/crawlee-python/pull/20)) ([492ee38](https://github.com/apify/crawlee-python/commit/492ee38c893b8f54e9583dd492576c5106e29881)) by [@vdusek](https://github.com/vdusek)
- Implement BasicCrawler ([#56](https://github.com/apify/crawlee-python/pull/56)) ([6da971f](https://github.com/apify/crawlee-python/commit/6da971fcddbf8b6795346c88e295dada28e7b1d3)) by [@janbuchar](https://github.com/janbuchar), closes [#30](https://github.com/apify/crawlee-python/issues/30)
- BeautifulSoupCrawler ([#107](https://github.com/apify/crawlee-python/pull/107)) ([4974dfa](https://github.com/apify/crawlee-python/commit/4974dfa20c7911ee073438fd388e60ba4b2c07db)) by [@janbuchar](https://github.com/janbuchar), closes [#31](https://github.com/apify/crawlee-python/issues/31)
- Add_requests and enqueue_links context helpers ([#120](https://github.com/apify/crawlee-python/pull/120)) ([dc850a5](https://github.com/apify/crawlee-python/commit/dc850a5778b105ff09e19eaecbb0a12d94798a62)) by [@janbuchar](https://github.com/janbuchar), closes [#5](https://github.com/apify/crawlee-python/issues/5)
- Use SessionPool in BasicCrawler ([#128](https://github.com/apify/crawlee-python/pull/128)) ([9fc4648](https://github.com/apify/crawlee-python/commit/9fc464837e596b3b5a7cd818b6d617550e249352)) by [@janbuchar](https://github.com/janbuchar), closes [#110](https://github.com/apify/crawlee-python/issues/110)
- Add base storage client and resource subclients ([#138](https://github.com/apify/crawlee-python/pull/138)) ([44d6597](https://github.com/apify/crawlee-python/commit/44d65974e4837576918069d7e63f8b804964971a)) by [@vdusek](https://github.com/vdusek)
### 🐛 Bug Fixes
- **deps:** Update dependency docutils to ^0.21.0 ([#101](https://github.com/apify/crawlee-python/pull/101)) ([534b613](https://github.com/apify/crawlee-python/commit/534b613f7cdfe7adf38b548ee48537db3167d1ec)) by [@renovate[bot]](https://github.com/renovate[bot])
- **deps:** Update dependency eval-type-backport to ^0.2.0 ([#124](https://github.com/apify/crawlee-python/pull/124)) ([c9e69a8](https://github.com/apify/crawlee-python/commit/c9e69a8534f4d82d9a6314947d76a86bcb744607)) by [@renovate[bot]](https://github.com/renovate[bot])
- Fire local SystemInfo events every second ([#144](https://github.com/apify/crawlee-python/pull/144)) ([f1359fa](https://github.com/apify/crawlee-python/commit/f1359fa7eea23f8153ad711287c073e45d498401)) by [@vdusek](https://github.com/vdusek)
- Storage manager & purging the defaults ([#150](https://github.com/apify/crawlee-python/pull/150)) ([851042f](https://github.com/apify/crawlee-python/commit/851042f25ad07e25651768e476f098ef0ed21914)) by [@vdusek](https://github.com/vdusek)
<!-- generated by git-cliff -->
================================================
FILE: CONTRIBUTING.md
================================================
# Development
Here you'll find a contributing guide to get started with development.
## Environment
For local development, it is required to have Python 3.10 (or a later version) installed.
We use [uv](https://docs.astral.sh/uv/) for project management. Install it and set up your IDE accordingly.
We use [Poe the Poet](https://poethepoet.natn.io/) as a task runner, similar to npm scripts in `package.json`.
All tasks are defined in `pyproject.toml` under `[tool.poe.tasks]` and can be run with `uv run poe <task>`.
### Available tasks
| Task | Description |
| ---- | ----------- |
| `install-dev` | Install development dependencies |
| `check-code` | Run lint, type-check, and unit-tests |
| `lint` | Run linter |
| `format` | Fix lint issues and format code |
| `type-check` | Run type checker |
| `unit-tests` | Run unit tests |
| `unit-tests-cov` | Run unit tests with coverage |
| `e2e-templates-tests` | Run end-to-end template tests |
| `build-docs` | Build documentation website |
| `run-docs` | Run documentation website locally |
| `build` | Build package |
| `clean` | Remove build artifacts and clean caches |
## Dependencies
To install this package and its development dependencies, run:
```sh
uv run poe install-dev
```
## Code checking
To execute all code checking tools together, run:
```sh
uv run poe check-code
```
### Linting
We utilize [ruff](https://docs.astral.sh/ruff/) for linting, which analyzes code for potential issues and enforces consistent style. Refer to `pyproject.toml` for configuration details.
To run linting:
```sh
uv run poe lint
```
### Formatting
Our automated code formatting also leverages [ruff](https://docs.astral.sh/ruff/), ensuring uniform style and addressing fixable linting issues. Configuration specifics are outlined in `pyproject.toml`.
To run formatting:
```sh
uv run poe format
```
### Type checking
Type checking is handled by [ty](https://docs.astral.sh/ty/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`.
To run type checking:
```sh
uv run poe type-check
```
### Unit tests
We use [pytest](https://docs.pytest.org/) as a testing framework with many plugins. Check `pyproject.toml` for configuration details and installed plugins.
To run unit tests:
```sh
uv run poe unit-tests
```
To run unit tests with coverage report:
```sh
uv run poe unit-tests-cov
```
## End-to-end tests
Prerequisites:
- [apify-cli](https://docs.apify.com/cli/docs/installation) installed and available in `PATH`
- Set `APIFY_TEST_USER_API_TOKEN` to your [Apify API token](https://docs.apify.com/platform/integrations/api#api-token)
To run end-to-end tests:
```sh
uv run poe e2e-templates-tests
```
## Documentation
We follow the [Google docstring format](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) for code documentation. All user-facing classes and functions must be documented. Documentation standards are enforced using [Ruff](https://docs.astral.sh/ruff/).
Our API documentation is generated from these docstrings using [pydoc-markdown](https://pypi.org/project/pydoc-markdown/) with custom post-processing. Additional content is provided through markdown files in the `docs/` directory. The final documentation is rendered using [Docusaurus](https://docusaurus.io/) and published to GitHub Pages.
To run the documentation locally, ensure you have `Node.js` 20+ installed, then run:
```sh
uv run poe run-docs
```
## Commits
We use [Conventional Commits](https://www.conventionalcommits.org/) format for commit messages. This convention is used to automatically determine version bumps during the release process.
### Available commit types
| Type | Description |
| ---- | ----------- |
| `feat` | A new feature |
| `fix` | A bug fix |
| `docs` | Documentation only changes |
| `style` | Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc) |
| `refactor` | A code change that neither fixes a bug nor adds a feature |
| `perf` | A code change that improves performance |
| `test` | Adding missing tests or correcting existing tests |
| `build` | Changes that affect the build system or external dependencies (example scopes: gulp, broccoli, npm) |
| `ci` | Changes to our CI configuration files and scripts (example scopes: Travis, Circle, BrowserStack, SauceLabs) |
| `chore` | Other changes that don't modify src or test files |
| `revert` | Reverts a previous commit |
## Release process
Publishing new versions to [PyPI](https://pypi.org/project/crawlee) is automated through GitHub Actions.
- **Beta releases**: On each commit to the master branch, a new beta release is automatically published. The version number is determined based on the latest release and conventional commits. The beta version suffix is incremented by 1 from the last beta release on PyPI.
- **Stable releases**: A stable version release may be created by triggering the `release` GitHub Actions workflow. The version number is determined based on the latest release and conventional commits (`auto` release type), or it may be overridden using the `custom` release type.
### Publishing to PyPI manually
1. **Do not do this unless absolutely necessary.** In all conceivable scenarios, you should use the `release` workflow instead.
2. **Make sure you know what you're doing.**
3. Update the version number:
- Modify the `version` field under `project` in `pyproject.toml`.
```toml
[project]
name = "crawlee"
version = "x.y.z"
```
4. Build the package:
```sh
uv run poe build
```
5. Upload to PyPI:
```sh
uv publish --token YOUR_API_TOKEN
```
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2023 Apify Technologies s.r.o.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
<h1 align="center">
<a href="https://crawlee.dev">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/apify/crawlee-python/master/website/static/img/crawlee-dark.svg?sanitize=true">
<img alt="Crawlee" src="https://raw.githubusercontent.com/apify/crawlee-python/master/website/static/img/crawlee-light.svg?sanitize=true" width="500">
</picture>
</a>
<br>
<small>A web scraping and browser automation library</small>
</h1>
<p align="center">
<a href="https://trendshift.io/repositories/11169" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11169" alt="apify%2Fcrawlee-python | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
<p align="center">
<a href="https://badge.fury.io/py/crawlee" rel="nofollow"><img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI package version"></a>
<a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI package downloads"></a>
<a href="https://codecov.io/gh/apify/crawlee-python"><img src="https://codecov.io/gh/apify/crawlee-python/graph/badge.svg?token=cCju61iPQG" alt="Codecov report"></a>
<a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI Python version"></a>
<a href="https://discord.gg/jyEM2PRvMU" rel="nofollow"><img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on Discord"></a>
</p>
Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**
Your crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it.
> 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev/python/)** 👈
We also have a TypeScript implementation of Crawlee, which you can explore and utilize for your projects. For more information, visit our GitHub repository: [Crawlee for JS/TS on GitHub](https://github.com/apify/crawlee).
## Installation
We recommend visiting the [Introduction tutorial](https://crawlee.dev/python/docs/introduction) in Crawlee documentation for more information.
Crawlee is available as [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal.
To install Crawlee with all features, run the following command:
```sh
python -m pip install 'crawlee[all]'
```
Then, install the [Playwright](https://playwright.dev/) dependencies:
```sh
playwright install
```
Verify that Crawlee is successfully installed:
```sh
python -c 'import crawlee; print(crawlee.__version__)'
```
For detailed installation instructions see the [Setting up](https://crawlee.dev/python/docs/introduction/setting-up) documentation page.
### With Crawlee CLI
The quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared templates. First, ensure you have [uv](https://pypi.org/project/uv/) installed:
```sh
uv --help
```
If [uv](https://pypi.org/project/uv/) is not installed, follow the official [installation guide](https://docs.astral.sh/uv/getting-started/installation/).
Then, run the CLI and choose from the available templates:
```sh
uvx 'crawlee[cli]' create my-crawler
```
If you already have `crawlee` installed, you can spin it up by running:
```sh
crawlee create my-crawler
```
## Examples
Here are some practical examples to help you get started with different types of crawlers in Crawlee. Each example demonstrates how to set up and run a crawler for specific use cases, whether you need to handle simple HTML pages or interact with JavaScript-heavy sites. A crawler run will create a `storage/` directory in your current working directory.
### BeautifulSoupCrawler
The [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) downloads web pages using an HTTP library and provides HTML-parsed content to the user. By default it uses [`HttpxHttpClient`](https://crawlee.dev/python/api/class/HttpxHttpClient) for HTTP communication and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) for parsing HTML. It is ideal for projects that require efficient extraction of data from HTML content. This crawler has very good performance since it does not use a browser. However, if you need to execute client-side JavaScript to get your content, this is not going to be enough and you will need to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler). Also, if you want to use this crawler, make sure you install `crawlee` with the `beautifulsoup` extra.
```python
import asyncio
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
async def main() -> None:
crawler = BeautifulSoupCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
max_requests_per_crawl=10,
)
# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')
# Extract data from the page.
data = {
'url': context.request.url,
'title': context.soup.title.string if context.soup.title else None,
}
# Push the extracted data to the default dataset.
await context.push_data(data)
# Enqueue all links found on the page.
await context.enqueue_links()
# Run the crawler with the initial list of URLs.
await crawler.run(['https://crawlee.dev'])
if __name__ == '__main__':
asyncio.run(main())
```
### PlaywrightCrawler
The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) uses a headless browser to download web pages and provides an API for data extraction. It is built on [Playwright](https://playwright.dev/), an automation library designed for managing headless browsers. It excels at retrieving web pages that rely on client-side JavaScript for content generation, or tasks requiring interaction with JavaScript-driven content. For scenarios where JavaScript execution is unnecessary or higher performance is required, consider using the [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler). Also if you want to use this crawler, make sure you install `crawlee` with `playwright` extra.
```python
import asyncio
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
async def main() -> None:
crawler = PlaywrightCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
max_requests_per_crawl=10,
)
# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')
# Extract data from the page.
data = {
'url': context.request.url,
'title': await context.page.title(),
}
# Push the extracted data to the default dataset.
await context.push_data(data)
# Enqueue all links found on the page.
await context.enqueue_links()
# Run the crawler with the initial list of requests.
await crawler.run(['https://crawlee.dev'])
if __name__ == '__main__':
asyncio.run(main())
```
### More examples
Explore our [Examples](https://crawlee.dev/python/docs/examples) page in the Crawlee documentation for a wide range of additional use cases and demonstrations.
## Features
Why is Crawlee the preferred choice for web scraping and crawling?
### Why use Crawlee instead of just a random HTTP library with an HTML parser?
- Unified interface for **HTTP & headless browser** crawling.
- Automatic **parallel crawling** based on available system resources.
- Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking).
- Automatic **retries** on errors or when you’re getting blocked.
- Integrated **proxy rotation** and session management.
- Configurable **request routing** - direct URLs to the appropriate handlers.
- Persistent **queue for URLs** to crawl.
- Pluggable **storage** of both tabular data and files.
- Robust **error handling**.
### Why use Crawlee rather than Scrapy?
- **Asyncio-based** – Leveraging the standard [Asyncio](https://docs.python.org/3/library/asyncio.html) library, Crawlee delivers better performance and seamless compatibility with other modern asynchronous libraries.
- **Type hints** – Newer project built with modern Python, and complete type hint coverage for a better developer experience.
- **Simple integration** – Crawlee crawlers are regular Python scripts, requiring no additional launcher executor. This flexibility allows you to integrate a crawler directly into other applications.
- **State persistence** – Supports state persistence during interruptions, saving time and costs by avoiding the need to restart scraping pipelines from scratch after an issue.
- **Organized data storages** – Allows saving of multiple types of results in a single scraping run. Offers several storing options (see [datasets](https://crawlee.dev/python/api/class/Dataset) & [key-value stores](https://crawlee.dev/python/api/class/KeyValueStore)).
## Running on the Apify platform
Crawlee is open-source and runs anywhere, but since it's developed by [Apify](https://apify.com), it's easy to set up on the Apify platform and run in the cloud. Visit the [Apify SDK website](https://docs.apify.com/sdk/python/) to learn more about deploying Crawlee to the Apify platform.
## Support
If you find any bug or issue with Crawlee, please [submit an issue on GitHub](https://github.com/apify/crawlee-python/issues). For questions, you can ask on [Stack Overflow](https://stackoverflow.com/questions/tagged/apify), in GitHub Discussions or you can join our [Discord server](https://discord.com/invite/jyEM2PRvMU).
## Contributing
Your code contributions are welcome, and you'll be praised for eternity! If you have any ideas for improvements, either submit an issue or create a pull request. For contribution guidelines and the code of conduct, see [CONTRIBUTING.md](https://github.com/apify/crawlee-python/blob/master/CONTRIBUTING.md).
## License
This project is licensed under the Apache License 2.0 - see the [LICENSE](https://github.com/apify/crawlee-python/blob/master/LICENSE) file for details.
================================================
FILE: codecov.yaml
================================================
coverage:
status:
project:
default:
target: auto
threshold: 0.10% # tolerate up to 0.10% decrease
informational: true # CI check reports status but never fails
patch:
default:
target: 50% # error only if patch coverage drops below 50%
informational: true # CI check reports status but never fails
================================================
FILE: docs/deployment/apify_platform.mdx
================================================
---
id: apify-platform
title: Apify platform
description: Apify platform - large-scale and high-performance web scraping
---
import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';
import LogWithConfigExample from '!!raw-loader!./code_examples/apify/log_with_config_example.py';
import CrawlerAsActorExample from '!!raw-loader!./code_examples/apify/crawler_as_actor_example.py';
import ProxyExample from '!!raw-loader!./code_examples/apify/proxy_example.py';
import ProxyAdvancedExample from '!!raw-loader!./code_examples/apify/proxy_advanced_example.py';
Apify is a [platform](https://apify.com) built to serve large-scale and high-performance web scraping and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), convenient request and result storages, [proxies](../guides/proxy-management), scheduling, webhooks and [more](https://docs.apify.com/), accessible through a [web interface](https://console.apify.com) or an [API](https://docs.apify.com/api).
While we think that the Apify platform is super cool, and it's definitely worth signing up for a [free account](https://console.apify.com/sign-up), **Crawlee is and will always be open source**, runnable locally or on any cloud infrastructure.
:::note
We do not test Crawlee in other cloud environments such as Lambda or on specific architectures such as Raspberry Pi. We strive to make it work, but there are no guarantees.
:::
## Requirements
To run your Crawlee code on Apify platform, you need an Apify account. If you don't have one yet, you can sign up [here](https://console.apify.com/sign-up).
Additionally, you must have the [Apify CLI](https://docs.apify.com/cli/) installed on your computer. For installation instructions, refer to the [Installation guide](https://docs.apify.com/cli/docs/installation).
Finally, ensure that the [Apify SDK](https://docs.apify.com/sdk/python/) is installed in your project. You can install it using `pip`:
```bash
pip install apify
```
## Logging into Apify platform from Crawlee
To access your [Apify account](https://console.apify.com/sign-up) from Crawlee, you must provide credentials - your [API token](https://console.apify.com/account?tab=integrations). You can do that either by utilizing [Apify CLI](https://docs.apify.com/cli/) or with environment variables.
Once you provide credentials to your Apify CLI installation, you will be able to use all the Apify platform features, such as calling Actors, saving to cloud storages, using Apify proxies, setting up webhooks and so on.
### Log in with CLI
Apify CLI allows you to log in to your Apify account on your computer. If you then run your crawler using the CLI, your credentials will automatically be added.
```bash
npm install -g apify-cli
apify login -t YOUR_API_TOKEN
```
### Log in with environment variables
Alternatively, you can always provide credentials to your Actor by setting the [`APIFY_TOKEN`](#apify_token) environment variable to your API token.
> There's also the [`APIFY_PROXY_PASSWORD`](#apify_proxy_password)
> environment variable. Actor automatically infers that from your token, but it can be useful
> when you need to access proxies from a different account than your token represents.
### Log in with Configuration
Another option is to use the [`Configuration`](https://docs.apify.com/sdk/python/reference/class/Configuration) instance and set your API token there.
<CodeBlock className="language-python">
{LogWithConfigExample}
</CodeBlock>
## What is an Actor
When you deploy your script to the Apify platform, it becomes an [Actor](https://apify.com/actors). An Actor is a serverless microservice that accepts an input and produces an output. It can run for a few seconds, hours or even infinitely. An Actor can perform anything from a simple action such as filling out a web form or sending an email, to complex operations such as crawling an entire website and removing duplicates from a large dataset.
Actors can be shared in the [Apify Store](https://apify.com/store) so that other people can use them. But don't worry, if you share your Actor in the store and somebody uses it, it runs under their account, not yours.
**Related links**
- [Store of existing Actors](https://apify.com/store)
- [Documentation](https://docs.apify.com/actors)
- [View Actors in Apify Console](https://console.apify.com/actors)
- [API reference](https://apify.com/docs/api/v2#/reference/actors)
## Running an Actor locally
First let's create a boilerplate of the new Actor. You could use Apify CLI and just run:
```bash
apify create my-hello-world
```
The CLI will prompt you to select a project boilerplate template - let's pick "Crawlee + BeautifulSoup". The tool will create a directory called `my-hello-world` with Python project files. You can run the Actor as follows:
```bash
cd my-hello-world
apify run
```
## Running Crawlee code as an Actor
For running Crawlee code as an Actor on [Apify platform](https://apify.com/actors) you need to wrap the body of the main function of your crawler with `async with Actor`.
:::info NOTE
Adding `async with Actor` is the only important thing needed to run it on Apify platform as an Actor. It is needed to initialize your Actor (e.g. to set the correct storage implementation) and to correctly handle exiting the process.
:::
Let's look at the `BeautifulSoupCrawler` example from the [Quick start](../quick-start) guide:
<CodeBlock className="language-python">
{CrawlerAsActorExample}
</CodeBlock>
Note that you could also run your Actor (that is using Crawlee) locally with Apify CLI. You could start it via the following command in your project folder:
```bash
apify run
```
## Deploying an Actor to Apify platform
Now (assuming you are already logged in to your Apify account) you can easily deploy your code to the Apify platform by running:
```bash
apify push
```
Your script will be uploaded to and built on the Apify platform so that it can be run there. For more information, view the
[Apify Actor](https://docs.apify.com/cli) documentation.
## Usage on Apify platform
You can also develop your Actor in an online code editor directly on the platform (you'll need an Apify Account). Let's go to the [Actors](https://console.apify.com/actors) page in the app, click *Create new* and then go to the *Source* tab and start writing the code or paste one of the examples from the [Examples](../examples) section.
## Storages
There are several things worth mentioning here.
### Helper functions for default Key-Value Store and Dataset
To simplify access to the _default_ storages, instead of using the helper functions of respective storage classes, you could use:
- [`Actor.set_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#set_value), [`Actor.get_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_value), [`Actor.get_input()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_input) for [`Key-Value Store`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore)
- [`Actor.push_data()`](https://docs.apify.com/sdk/python/reference/class/Actor#push_data) for [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)
### Using platform storage in a local Actor
When you plan to use the platform storage while developing and running your Actor locally, you should use [`Actor.open_key_value_store()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_key_value_store), [`Actor.open_dataset()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_dataset) and [`Actor.open_request_queue()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_request_queue) to open the respective storage.
Each of these methods accepts the `force_cloud` keyword argument. If set to `True`, cloud storage will be used instead of the folder on the local disk.
:::note
If you don't plan to force usage of the platform storages when running the Actor locally, there is no need to use the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) class for it. The Crawlee variants <ApiLink to="class/KeyValueStore#open">`KeyValueStore.open()`</ApiLink>, <ApiLink to="class/Dataset#open">`Dataset.open()`</ApiLink> and <ApiLink to="class/RequestQueue#open">`RequestQueue.open()`</ApiLink> will work the same.
:::
{/*
### Getting public url of an item in the platform storage
If you need to share a link to some file stored in a [Key-Value](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) Store on Apify platform, you can use [`get_public_url()`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore#get_public_url) method. It accepts only one parameter: `key` - the key of the item you want to share.
<CodeBlock language="python">
{GetPublicUrlSource}
</CodeBlock>
*/}
### Exporting dataset data
When the <ApiLink to="class/Dataset">`Dataset`</ApiLink> is stored on the [Apify platform](https://apify.com/actors), you can export its data to the following formats: HTML, JSON, CSV, Excel, XML and RSS. The datasets are displayed on the Actor run details page and in the [Storage](https://console.apify.com/storage) section in the Apify Console. The actual data is exported using the [Get dataset items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint. This way you can easily share the crawling results.
**Related links**
- [Apify platform storage documentation](https://docs.apify.com/storage)
- [View storage in Apify Console](https://console.apify.com/storage)
- [Key-value stores API reference](https://apify.com/docs/api/v2#/reference/key-value-stores)
- [Datasets API reference](https://docs.apify.com/api/v2#/reference/datasets)
- [Request queues API reference](https://docs.apify.com/api/v2#/reference/request-queues)
## Environment variables
The following describes select environment variables set by the Apify platform. For a complete list, see the [Environment variables](https://docs.apify.com/platform/actors/development/programming-interface/environment-variables) section in the Apify platform documentation.
:::note
It's important to note that `CRAWLEE_` environment variables don't need to be replaced with equivalent `APIFY_` ones. Likewise, Crawlee understands `APIFY_` environment variables.
:::
### `APIFY_TOKEN`
The API token for your Apify account. It is used to access the Apify API, e.g. to access cloud storage
or to run an Actor on the Apify platform. You can find your API token on the
[Account Settings / Integrations](https://console.apify.com/account?tab=integrations) page.
### Combinations of `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`
By combining the env vars in various ways, you can greatly influence the Actor's behavior.
| Env Vars | API | Storages |
| --------------------------------------- | --- | ---------------- |
| none OR `CRAWLEE_STORAGE_DIR` | no | local |
| `APIFY_TOKEN` | yes | Apify platform |
| `APIFY_TOKEN` AND `CRAWLEE_STORAGE_DIR` | yes | local + platform |
When using both `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`, you can use all the Apify platform
features and your data will be stored locally by default. If you want to access platform storages,
you can use the `force_cloud=True` option in their respective functions.
### `APIFY_PROXY_PASSWORD`
Optional password to [Apify Proxy](https://docs.apify.com/proxy) for IP address rotation.
Assuming you have already created an Apify account, you can find the password on the [Proxy page](https://console.apify.com/proxy)
in the Apify Console. The password is automatically inferred using the `APIFY_TOKEN` env var,
so in most cases, you don't need to touch it. You should use it when, for some reason,
you need access to Apify Proxy, but not access to Apify API, or when you need access to
proxy from a different account than your token represents.
## Proxy management
In addition to your own proxy servers and proxy servers acquired from
third-party providers used together with Crawlee, you can also rely on [Apify Proxy](https://apify.com/proxy)
for your scraping needs.
### Apify proxy
If you are already subscribed to Apify Proxy, you can start using it immediately in only a few lines of code (for local usage you should first be [logged in](#logging-into-apify-platform-from-crawlee) to your Apify account).
<CodeBlock className="language-python">
{ProxyExample}
</CodeBlock>
Note that unlike using your own proxies in Crawlee, you shouldn't use the constructor to create <ApiLink to="class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> instances. For using the Apify Proxy you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function instead.
### Advanced Apify proxy configuration
With Apify Proxy, you can select specific proxy groups to use, or countries to connect from.
This allows you to get better proxy performance after some initial research.
<CodeBlock className="language-python">
{ProxyAdvancedExample}
</CodeBlock>
Now your crawlers will use only Residential proxies from the US. Note that you must first get access
to a proxy group before you are able to use it. You can check proxy groups available to you
in the [proxy dashboard](https://console.apify.com/proxy).
### Apify proxy vs. own proxies
The [`ProxyConfiguration`](https://docs.apify.com/sdk/python/reference/class/ProxyConfiguration) class covers both Apify Proxy and custom proxy URLs so that you can easily switch between proxy providers. However, some features of the class are available only to Apify Proxy users, mainly because Apify Proxy is what one would call a super-proxy. It's not a single proxy server, but an API endpoint that allows connection through millions of different IP addresses. So the class essentially has two modes: Apify Proxy or Own (third party) proxy.
The difference is easy to remember.
- If you're using your own proxies - you should create a <ApiLink to="class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> instance directly.
- If you are planning to use Apify Proxy - you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function. The `new_url_function` parameter enables the use of your custom proxy URLs, whereas all the other options are there to configure Apify Proxy.
**Related links**
- [Apify Proxy docs](https://docs.apify.com/proxy)
================================================
FILE: docs/deployment/aws_lambda.mdx
================================================
---
id: aws-lambda
title: Deploy on AWS Lambda
description: Prepare your crawler to run on AWS Lambda.
---
import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';
import BeautifulSoupCrawlerLambda from '!!raw-loader!./code_examples/aws/beautifulsoup_crawler_lambda.py';
import PlaywrightCrawlerLambda from '!!raw-loader!./code_examples/aws/playwright_crawler_lambda.py';
import PlaywrightCrawlerDockerfile from '!!raw-loader!./code_examples/aws/playwright_dockerfile';
[AWS Lambda](https://docs.aws.amazon.com/lambda/latest/dg/welcome.html) is a serverless compute service that lets you run code without provisioning or managing servers. This guide covers deploying <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> and <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.
The code examples are based on the [BeautifulSoupCrawler example](../examples/beautifulsoup-crawler).
## BeautifulSoupCrawler on AWS Lambda
For simple crawlers that don't require browser rendering, you can deploy using a ZIP archive.
### Updating the code
When instantiating a crawler, use <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink>. By default, Crawlee uses file-based storage, but the Lambda filesystem is read-only (except for `/tmp`). Using `MemoryStorageClient` tells Crawlee to use in-memory storage instead.
Wrap the crawler logic in a `lambda_handler` function. This is the entry point that AWS will execute.
:::important
Make sure to always instantiate a new crawler for every Lambda invocation. AWS keeps the environment running for some time after the first execution (to reduce cold-start times), so subsequent calls may access an already-used crawler instance.
**TL;DR: Keep your Lambda stateless.**
:::
Finally, return the scraped data from the Lambda when the crawler run ends.
<CodeBlock language="python" title="lambda_function.py">
{BeautifulSoupCrawlerLambda}
</CodeBlock>
### Preparing the environment
Lambda requires all dependencies to be included in the deployment package. Create a virtual environment and install dependencies:
```bash
python3.14 -m venv .venv
source .venv/bin/activate
pip install 'crawlee[beautifulsoup]' 'boto3' 'aws-lambda-powertools'
```
[`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) is the AWS SDK for Python. Including it in your dependencies is recommended to avoid version misalignment issues with the Lambda runtime.
### Creating the ZIP archive
Create a ZIP archive from your project, including dependencies from the virtual environment:
```bash
cd .venv/lib/python3.14/site-packages
zip -r ../../../../package.zip .
cd ../../../../
zip package.zip lambda_function.py
```
:::note Large dependencies?
AWS limits deployment packages to 50 MB (zipped) for direct upload and to 250 MB for the unzipped package contents.
A better way to manage dependencies is by using Lambda Layers. With Layers, you can share files between multiple Lambda functions and keep the actual code as slim as possible.
To create a Lambda Layer:
1. Create a `python/` folder and copy dependencies from `site-packages` into it
2. Create a zip archive: `zip -r layer.zip python/`
3. Create a new Lambda Layer from the archive (you may need to upload it to S3 first)
4. Attach the Layer to your Lambda function
:::
### Creating the Lambda function
Create the Lambda function in the AWS Lambda Console:
1. Navigate to `Lambda` in [AWS Management Console](https://aws.amazon.com/console/).
2. Click **Create function**.
3. Select **Author from scratch**.
4. Enter a **Function name**, for example `BeautifulSoupTest`.
5. Choose a **Python runtime** that matches the version used in your virtual environment (for example, Python 3.14).
6. Click **Create function** to finish.
Once created, upload `package.zip` as the code source in the AWS Lambda Console using the "Upload from" button.
In Lambda Runtime Settings, set the handler. Since the file is named `lambda_function.py` and the function is `lambda_handler`, you can use the default value `lambda_function.lambda_handler`.
:::tip Configuration
In the Configuration tab, you can adjust:
- **Memory**: Memory size can greatly affect execution speed. A minimum of 256-512 MB is recommended.
- **Timeout**: Set according to the size of the website you are scraping (1 minute for the example code).
- **Ephemeral storage**: Size of the `/tmp` directory.
See the [official documentation](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) to learn how performance and cost scale with memory.
:::
After the Lambda deploys, you can test it by clicking the "Test" button. The event contents don't matter for a basic test, but you can parameterize your crawler by parsing the event object that AWS passes as the first argument to the handler.
## PlaywrightCrawler on AWS Lambda
For crawlers that require browser rendering, you need to deploy using Docker container images because Playwright and browser binaries exceed Lambda's ZIP deployment size limits.
### Updating the code
As with <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, use <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> and wrap the logic in a `lambda_handler` function. Additionally, configure `browser_launch_options` with flags optimized for serverless environments. These flags disable sandboxing and GPU features that aren't available in Lambda's containerized runtime.
<CodeBlock language="python" title="main.py">
{PlaywrightCrawlerLambda}
</CodeBlock>
### Installing and configuring AWS CLI
Install AWS CLI following the [official documentation](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) according to your operating system.
Authenticate by running:
```bash
aws login
```
### Preparing the project
Initialize the project by running `uvx 'crawlee[cli]' create`.
Or use a single command if you don't need interactive mode:
```bash
uvx 'crawlee[cli]' create aws_playwright --crawler-type playwright --http-client impit --package-manager uv --no-apify --start-url 'https://crawlee.dev' --install
```
Add the following dependencies:
```bash
uv add awslambdaric aws-lambda-powertools boto3
```
[`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) is the AWS SDK for Python. Use it if your function integrates with any other AWS services.
The project is created with a Dockerfile that needs to be modified for AWS Lambda by adding `ENTRYPOINT` and updating `CMD`:
<CodeBlock language="dockerfile" title="Dockerfile">
{PlaywrightCrawlerDockerfile}
</CodeBlock>
### Building and pushing the Docker image
Create a repository `lambda/aws-playwright` in [Amazon Elastic Container Registry](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html) in the same region where your Lambda functions will run. To learn more, refer to the [official documentation](https://docs.aws.amazon.com/AmazonECR/latest/userguide/getting-started-cli.html).
Navigate to the created repository and click the "View push commands" button. This will open a window with console commands for uploading the Docker image to your repository. Execute them.
Example:
```bash
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin {user-specific-data}
docker build --platform linux/amd64 --provenance=false -t lambda/aws-playwright .
docker tag lambda/aws-playwright:latest {user-specific-data}/lambda/aws-playwright:latest
docker push {user-specific-data}/lambda/aws-playwright:latest
```
### Creating the Lambda function
1. Navigate to `Lambda` in [AWS Management Console](https://aws.amazon.com/console/).
2. Click **Create function**.
3. Select **Container image**.
4. Browse and select your ECR image.
5. Click **Create function** to finish.
:::tip Configuration
In the Configuration tab, you can adjust resources. Playwright crawlers require more resources than BeautifulSoup crawlers:
- **Memory**: Minimum 1024 MB recommended. Browser operations are memory-intensive, so 2048 MB or more may be needed for complex pages.
- **Timeout**: Set according to crawl size. Browser startup adds overhead, so allow at least 5 minutes even for simple crawls.
- **Ephemeral storage**: Default 512 MB is usually sufficient unless downloading large files.
See the [official documentation](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) to learn how performance and cost scale with memory.
:::
After the Lambda deploys, click the "Test" button to invoke it. The event contents don't matter for a basic test, but you can parameterize your crawler by parsing the event object that AWS passes as the first argument to the handler.
================================================
FILE: docs/deployment/code_examples/apify/crawler_as_actor_example.py
================================================
import asyncio
from apify import Actor
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
async def main() -> None:
    """Run a BeautifulSoup crawler against crawlee.dev inside an Apify Actor."""
    start_urls = ['https://crawlee.dev']

    # Everything that touches the crawler runs within the Actor context manager.
    async with Actor:
        crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            """Store the page URL and title, then enqueue the links found on the page."""
            page_url = context.request.url
            context.log.info(f'Processing {page_url} ...')

            # Pages without a <title> element yield `None` for the title.
            title_tag = context.soup.title
            page_title = title_tag.string if title_tag else None

            await context.push_data({'url': page_url, 'title': page_title})
            await context.enqueue_links()

        await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())
================================================
FILE: docs/deployment/code_examples/apify/get_public_url.py
================================================
import asyncio
from apify import Actor
async def main() -> None:
    """Store a record in the Actor's key-value store and log its public URL."""
    async with Actor:
        store = await Actor.open_key_value_store()
        await store.set_value('your-file', {'foo': 'bar'})

        # `get_public_url` is a coroutine function, so it has to be awaited;
        # otherwise the log line would show a coroutine object instead of the URL.
        url = await store.get_public_url('your-file')
        Actor.log.info(f'KVS public URL: {url}')
        # https://api.apify.com/v2/key-value-stores/<your-store-id>/records/your-file


if __name__ == '__main__':
    asyncio.run(main())
================================================
FILE: docs/deployment/code_examples/apify/log_with_config_example.py
================================================
import asyncio
from apify import Actor, Configuration
async def main() -> None:
    """Start an Actor with an explicit configuration and log a greeting."""
    # The API token is listed at https://console.apify.com/settings/integrations.
    # Pass it through the "token" parameter, or set the "APIFY_TOKEN"
    # environment variable instead.
    actor_config = Configuration(token='apify_api_YOUR_TOKEN')

    async with Actor(actor_config):
        Actor.log.info('Hello from Apify platform!')


if __name__ == '__main__':
    asyncio.run(main())
================================================
FILE: docs/deployment/code_examples/apify/proxy_advanced_example.py
================================================
import asyncio
from apify import Actor
async def main() -> None:
    """Create an Apify Proxy configuration restricted to US residential proxies."""
    proxy_options = {
        'password': 'apify_proxy_YOUR_PASSWORD',
        # Proxy group to route the requests through.
        'groups': ['RESIDENTIAL'],
        # Country code of the proxies to use.
        'country_code': 'US',
    }

    async with Actor:
        proxy_configuration = await Actor.create_proxy_configuration(**proxy_options)
        # ...


if __name__ == '__main__':
    asyncio.run(main())
================================================
FILE: docs/deployment/code_examples/apify/proxy_example.py
================================================
import asyncio
from apify import Actor
async def main() -> None:
    """Create an Apify Proxy configuration and log one proxy URL obtained from it."""
    async with Actor:
        # The proxy password is shown at https://console.apify.com/proxy/http-settings.
        # Supply it through the "password" parameter or through the
        # "APIFY_PROXY_PASSWORD" environment variable.
        proxy_configuration = await Actor.create_proxy_configuration(
            password='apify_proxy_YOUR_PASSWORD',
        )

        if proxy_configuration:
            proxy_url = await proxy_configuration.new_url()
            Actor.log.info(f'Proxy URL: {proxy_url}')
        else:
            # Nothing else to do without a usable configuration.
            Actor.log.warning('Failed to create proxy configuration.')


if __name__ == '__main__':
    asyncio.run(main())
================================================
FILE: docs/deployment/code_examples/aws/beautifulsoup_crawler_lambda.py
================================================
import asyncio
import json
from datetime import timedelta
from typing import Any
from aws_lambda_powertools.utilities.typing import LambdaContext
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset, RequestQueue
async def main() -> str:
    """Crawl https://crawlee.dev using in-memory storages and return the results.

    Returns:
        The items collected by the crawler, serialized as a JSON string.
    """
    # highlight-start
    # Disable writing storage data to the file system
    storage_client = MemoryStorageClient()
    # highlight-end

    # Initialize storages
    dataset = await Dataset.open(storage_client=storage_client)
    request_queue = await RequestQueue.open(storage_client=storage_client)

    crawler = BeautifulSoupCrawler(
        storage_client=storage_client,
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Extract the title and headings of the page and enqueue its links."""
        context.log.info(f'Processing {context.request.url} ...')

        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'h1s': [h1.text for h1 in context.soup.find_all('h1')],
            'h2s': [h2.text for h2 in context.soup.find_all('h2')],
            'h3s': [h3.text for h3 in context.soup.find_all('h3')],
        }

        await context.push_data(data)
        await context.enqueue_links()

    try:
        await crawler.run(['https://crawlee.dev'])

        # Extract data saved in `Dataset`
        data = await crawler.get_data()
    finally:
        # Clean up storages after the crawl. Doing it in `finally` makes sure the
        # storages opened above are dropped even when the crawl raises.
        await dataset.drop()
        await request_queue.drop()

    # Serialize the list of scraped items to JSON string
    return json.dumps(data.items)
def lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> dict[str, Any]:
    """AWS Lambda entry point: run the crawler and wrap its output in a response.

    Neither the event nor the context is used by this example.
    """
    # Drive the async crawler to completion and use its JSON string as the body.
    body = asyncio.run(main())
    return {'statusCode': 200, 'body': body}
================================================
FILE: docs/deployment/code_examples/aws/playwright_crawler_lambda.py
================================================
import asyncio
import json
from datetime import timedelta
from typing import Any
from aws_lambda_powertools.utilities.typing import LambdaContext
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset, RequestQueue
async def main() -> str:
    """Crawl https://crawlee.dev with Playwright using in-memory storages.

    Returns:
        The items collected by the crawler, serialized as a JSON string.
    """
    # highlight-start
    # Disable writing storage data to the file system
    storage_client = MemoryStorageClient()
    # highlight-end

    # Initialize storages
    dataset = await Dataset.open(storage_client=storage_client)
    request_queue = await RequestQueue.open(storage_client=storage_client)

    crawler = PlaywrightCrawler(
        storage_client=storage_client,
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=10,
        # highlight-start
        # Configure Playwright to run in AWS Lambda environment
        browser_launch_options={
            'args': [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--single-process',
            ]
        },
        # highlight-end
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Extract the title and headings of the page and enqueue its links."""
        context.log.info(f'Processing {context.request.url} ...')

        data = {
            'url': context.request.url,
            'title': await context.page.title(),
            'h1s': await context.page.locator('h1').all_text_contents(),
            'h2s': await context.page.locator('h2').all_text_contents(),
            'h3s': await context.page.locator('h3').all_text_contents(),
        }

        await context.push_data(data)
        await context.enqueue_links()

    try:
        await crawler.run(['https://crawlee.dev'])

        # Extract data saved in `Dataset`
        data = await crawler.get_data()
    finally:
        # Clean up storages after the crawl. Doing it in `finally` makes sure the
        # storages opened above are dropped even when the crawl raises.
        await dataset.drop()
        await request_queue.drop()

    # Serialize the list of scraped items to JSON string
    return json.dumps(data.items)
def lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> dict[str, Any]:
    """Handle a Lambda invocation by running the crawler once.

    The event and context arguments are accepted but not read.
    """
    # `main()` returns the scraped items already serialized as a JSON string.
    crawl_output = asyncio.run(main())
    response = {'statusCode': 200, 'body': crawl_output}
    return response
================================================
FILE: docs/deployment/code_examples/aws/playwright_dockerfile
================================================
# Image for running the Playwright crawler on AWS Lambda, built on top of
# the Apify Playwright base image.
FROM apify/actor-python-playwright:3.14
# Install git and remove the apt package lists afterwards.
RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/*
# Upgrade pip and setuptools, then install uv (any release below 1.0).
RUN pip install -U pip setuptools \
&& pip install 'uv<1'
# Use /usr/local as the uv project environment.
# NOTE(review): presumably this makes `uv sync` install next to the packages
# shipped with the base image - confirm against the uv documentation.
ENV UV_PROJECT_ENVIRONMENT="/usr/local"
# Copy only the dependency manifests before the dependency installation step.
COPY pyproject.toml uv.lock ./
# Install the locked, non-dev dependencies. When `pip freeze` already lists
# playwright, it is excluded from `uv sync`; otherwise everything is installed.
# The Python version and the final package list are printed to the build log.
RUN echo "Python version:" \
&& python --version \
&& echo "Installing dependencies:" \
&& PLAYWRIGHT_INSTALLED=$(pip freeze | grep -q playwright && echo "true" || echo "false") \
&& if [ "$PLAYWRIGHT_INSTALLED" = "true" ]; then \
echo "Playwright already installed, excluding from uv sync" \
&& uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact --no-install-package playwright; \
else \
echo "Playwright not found, installing all dependencies" \
&& uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact; \
fi \
&& echo "All installed Python packages:" \
&& pip freeze
# Copy the rest of the project into the image.
COPY . ./
# Byte-compile the sources at build time (quiet mode).
RUN python -m compileall -q .
# highlight-start
# AWS Lambda entrypoint
ENTRYPOINT [ "/usr/local/bin/python3", "-m", "awslambdaric" ]
# Lambda handler function
CMD [ "aws_playwright.main.lambda_handler" ]
# highlight-end
================================================
FILE: docs/deployment/code_examples/google/cloud_run_example.py
================================================
import json
import os
import uvicorn
from litestar import Litestar, get
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storage_clients import MemoryStorageClient
@get('/')
async def main() -> str:
    """The crawler entry point that will be called when the HTTP endpoint is accessed."""
    # highlight-start
    # Disable writing storage data to the file system
    storage_client = MemoryStorageClient()
    # highlight-end

    crawler = PlaywrightCrawler(
        storage_client=storage_client,
        browser_type='firefox',
        headless=True,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def default_handler(context: PlaywrightCrawlingContext) -> None:
        """Default request handler that processes each page during crawling."""
        context.log.info(f'Processing {context.request.url} ...')

        # Pages without a <title> element are stored with a `None` title.
        title_element = await context.page.query_selector('title')
        if title_element:
            page_title = await title_element.inner_text()
        else:
            page_title = None

        await context.push_data({'url': context.request.loaded_url, 'title': page_title})
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])

    # Hand the collected items back to the client as a JSON string
    results = await crawler.get_data()
    return json.dumps(results.items)
# Initialize the Litestar app with our route handler
app = Litestar(route_handlers=[main])
# Start the Uvicorn server using the `PORT` environment variable provided by GCP
# This is crucial - Cloud Run expects your app to listen on this specific port
# Port 8080 is used as a fallback when `PORT` is not set.
# NOTE: there is no `__main__` guard, so the server starts as soon as this
# module is executed or imported.
uvicorn.run(app, host='0.0.0.0', port=int(os.environ.get('PORT', '8080'))) # noqa: S104 # Use all interfaces in a container, safely
================================================
FILE: docs/deployment/code_examples/google/google_example.py
================================================
import asyncio
import json
from datetime import timedelta
import functions_framework
from flask import Request, Response
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storage_clients import MemoryStorageClient
async def main() -> str:
    """Crawl https://crawlee.dev with in-memory storage and return the items as JSON."""
    # highlight-start
    # Disable writing storage data to the file system
    storage_client = MemoryStorageClient()
    # highlight-end

    crawler = BeautifulSoupCrawler(
        max_requests_per_crawl=10,
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=30),
        storage_client=storage_client,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Collect the title and headings of the page and enqueue its links."""
        soup = context.soup
        page_url = context.request.url
        context.log.info(f'Processing {page_url} ...')

        # Start with the URL and title, then add one list of texts per heading level.
        record = {
            'url': page_url,
            'title': soup.title.string if soup.title else None,
        }
        for level in ('h1', 'h2', 'h3'):
            record[f'{level}s'] = [heading.text for heading in soup.find_all(level)]

        await context.push_data(record)
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])

    # highlight-start
    # Extract data saved in `Dataset`
    crawl_result = await crawler.get_data()

    # Serialize to json string and return
    return json.dumps(crawl_result.items)
    # highlight-end
@functions_framework.http
def crawlee_run(request: Request) -> Response:
    """HTTP entry point: run the crawler and return the scraped items as JSON.

    Args:
        request: Incoming Flask request; it can be used to pass data to the crawler.

    Returns:
        A 200 response whose body is the JSON string produced by `main()`.
    """
    # You can pass data to your crawler using `request`
    # `.get()` yields `None` instead of raising when the header is absent.
    function_id = request.headers.get('Function-Execution-Id')

    response_str = asyncio.run(main())

    # Return a response with the crawling results. The body is a JSON string, so
    # it is labelled as such instead of using Flask's default `text/html`.
    return Response(response=response_str, status=200, mimetype='application/json')
================================================
FILE: docs/deployment/google_cloud.mdx
================================================
---
id: gcp-cloud-run-functions
title: Cloud Run functions
description: Prepare your crawler to run in Cloud Run functions on Google Cloud Platform.
---
import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';
import GoogleFunctions from '!!raw-loader!./code_examples/google/google_example.py';
[Google Cloud Run Functions](https://cloud.google.com/functions) is a serverless execution environment for running simple HTTP-based web scrapers. This service is best suited for lightweight crawlers that don't require browser rendering capabilities and can be executed via HTTP requests.
## Updating the project
For the project foundation, use <ApiLink to="class/BeautifulSoupCrawler">BeautifulSoupCrawler</ApiLink> as described in this [example](../examples/beautifulsoup-crawler).
Add [`functions-framework`](https://pypi.org/project/functions-framework/) to your dependencies file `requirements.txt`. If you're using a project manager like `poetry` or `uv`, export your dependencies to `requirements.txt`.
Update the project code to make it compatible with Cloud Functions and return data in JSON format. Also add an entry point that Cloud Functions will use to run the project.
<CodeBlock className="language-python">
{GoogleFunctions.replace(/^.*?\n/, '')}
</CodeBlock>
You can test your project locally. Start the server by running:
```bash
functions-framework --target=crawlee_run
```
Then make a GET request to `http://127.0.0.1:8080/`, for example in your browser.
## Deploying to Google Cloud Platform
In the Google Cloud dashboard, create a new function, allocate memory and CPUs to it, set region and function timeout.
When deploying, select **"Use an inline editor to create a function"**. This allows you to configure the project using only the Google Cloud Console dashboard.
Using the `inline editor`, update the function files according to your project. **Make sure** to update the `requirements.txt` file to match your project's dependencies.
Also, make sure to set the **Function entry point** to the name of the function decorated with `@functions_framework.http`, which in our case is `crawlee_run`.
After the Function deploys, you can test it by clicking the "Test" button. This button opens a popup with a `curl` script that calls your new Cloud Function. To avoid having to install the `gcloud` CLI application locally, you can also run this script in the Cloud Shell by clicking the link above the code block.
================================================
FILE: docs/deployment/google_cloud_run.mdx
================================================
---
id: gcp-cloud-run
title: Cloud Run
description: Prepare your crawler to run in Cloud Run on Google Cloud Platform.
---
import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';
import GoogleCloudRun from '!!raw-loader!./code_examples/google/cloud_run_example.py';
[Google Cloud Run](https://cloud.google.com/run) is a container-based serverless platform that allows you to run web crawlers with headless browsers. This service is recommended when your Crawlee applications need browser rendering capabilities, require more granular control, or have complex dependencies that aren't supported by [Cloud Functions](./gcp-cloud-run-functions).
GCP Cloud Run allows you to deploy using Docker containers, giving you full control over your environment and the flexibility to use any web server framework of your choice, unlike Cloud Functions which are limited to [Flask](https://flask.palletsprojects.com/en/stable/).
## Preparing the project
We'll prepare our project using [Litestar](https://litestar.dev/) and the [Uvicorn](https://www.uvicorn.org/) web server. The HTTP server handler will wrap the crawler to communicate with clients. Because the Cloud Run platform sees only an opaque Docker container, we have to take care of this bit ourselves.
:::info
GCP passes your container an environment variable called `PORT`. Your HTTP server is expected to listen on this port, which is the one GCP exposes to the outside world.
:::
<CodeBlock className="language-python">
{GoogleCloudRun.replace(/^.*?\n/, '')}
</CodeBlock>
:::tip
Always make sure to keep all the logic in the request handler - as with other FaaS services, your request handlers have to be **stateless.**
:::
## Deploying to Google Cloud Platform
Now, we’re ready to
gitextract_o1cy5s8w/
├── .editorconfig
├── .github/
│ ├── CODEOWNERS
│ ├── pull_request_template.md
│ └── workflows/
│ ├── _check_code.yaml
│ ├── _check_docs.yaml
│ ├── _release_docs.yaml
│ ├── _tests.yaml
│ ├── manual_release_stable.yaml
│ ├── on_issue.yaml
│ ├── on_master.yaml
│ ├── on_pull_request.yaml
│ └── on_schedule_tests.yaml
├── .gitignore
├── .markdownlint.yaml
├── .pre-commit-config.yaml
├── .rules.md
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── codecov.yaml
├── docs/
│ ├── deployment/
│ │ ├── apify_platform.mdx
│ │ ├── aws_lambda.mdx
│ │ ├── code_examples/
│ │ │ ├── apify/
│ │ │ │ ├── crawler_as_actor_example.py
│ │ │ │ ├── get_public_url.py
│ │ │ │ ├── log_with_config_example.py
│ │ │ │ ├── proxy_advanced_example.py
│ │ │ │ └── proxy_example.py
│ │ │ ├── aws/
│ │ │ │ ├── beautifulsoup_crawler_lambda.py
│ │ │ │ ├── playwright_crawler_lambda.py
│ │ │ │ └── playwright_dockerfile
│ │ │ └── google/
│ │ │ ├── cloud_run_example.py
│ │ │ └── google_example.py
│ │ ├── google_cloud.mdx
│ │ └── google_cloud_run.mdx
│ ├── examples/
│ │ ├── add_data_to_dataset.mdx
│ │ ├── beautifulsoup_crawler.mdx
│ │ ├── capture_screenshot_using_playwright.mdx
│ │ ├── capturing_page_snapshots_with_error_snapshotter.mdx
│ │ ├── code_examples/
│ │ │ ├── adaptive_playwright_crawler.py
│ │ │ ├── add_data_to_dataset_bs.py
│ │ │ ├── add_data_to_dataset_dataset.py
│ │ │ ├── add_data_to_dataset_pw.py
│ │ │ ├── beautifulsoup_crawler.py
│ │ │ ├── beautifulsoup_crawler_keep_alive.py
│ │ │ ├── beautifulsoup_crawler_stop.py
│ │ │ ├── capture_screenshot_using_playwright.py
│ │ │ ├── configure_json_logging.py
│ │ │ ├── crawl_all_links_on_website_bs.py
│ │ │ ├── crawl_all_links_on_website_pw.py
│ │ │ ├── crawl_multiple_urls_bs.py
│ │ │ ├── crawl_multiple_urls_pw.py
│ │ │ ├── crawl_specific_links_on_website_bs.py
│ │ │ ├── crawl_specific_links_on_website_pw.py
│ │ │ ├── crawl_website_with_relative_links_all_links.py
│ │ │ ├── crawl_website_with_relative_links_same_domain.py
│ │ │ ├── crawl_website_with_relative_links_same_hostname.py
│ │ │ ├── crawl_website_with_relative_links_same_origin.py
│ │ │ ├── export_entire_dataset_to_file_csv.py
│ │ │ ├── export_entire_dataset_to_file_json.py
│ │ │ ├── extract_and_add_specific_links_on_website_bs.py
│ │ │ ├── extract_and_add_specific_links_on_website_pw.py
│ │ │ ├── fill_and_submit_web_form_crawler.py
│ │ │ ├── fill_and_submit_web_form_request.py
│ │ │ ├── parsel_crawler.py
│ │ │ ├── parsel_crawler_with_error_snapshotter.py
│ │ │ ├── playwright_block_requests.py
│ │ │ ├── playwright_crawler.py
│ │ │ ├── playwright_crawler_with_camoufox.py
│ │ │ ├── playwright_crawler_with_error_snapshotter.py
│ │ │ ├── playwright_crawler_with_fingerprint_generator.py
│ │ │ ├── respect_robots_on_skipped_request.py
│ │ │ ├── respect_robots_txt_file.py
│ │ │ ├── resuming_paused_crawl.py
│ │ │ ├── run_parallel_crawlers.py
│ │ │ ├── using_browser_profiles_chrome.py
│ │ │ ├── using_browser_profiles_firefox.py
│ │ │ └── using_sitemap_request_loader.py
│ │ ├── crawl_all_links_on_website.mdx
│ │ ├── crawl_multiple_urls.mdx
│ │ ├── crawl_specific_links_on_website.mdx
│ │ ├── crawl_website_with_relative_links.mdx
│ │ ├── crawler_keep_alive.mdx
│ │ ├── crawler_stop.mdx
│ │ ├── export_entire_dataset_to_file.mdx
│ │ ├── fill_and_submit_web_form.mdx
│ │ ├── json_logging.mdx
│ │ ├── parsel_crawler.mdx
│ │ ├── playwright_crawler.mdx
│ │ ├── playwright_crawler_adaptive.mdx
│ │ ├── playwright_crawler_with_block_requests.mdx
│ │ ├── playwright_crawler_with_camoufox.mdx
│ │ ├── playwright_crawler_with_fingerprint_generator.mdx
│ │ ├── respect_robots_txt_file.mdx
│ │ ├── resuming_paused_crawl.mdx
│ │ ├── run_parallel_crawlers.mdx
│ │ ├── using_browser_profile.mdx
│ │ └── using_sitemap_request_loader.mdx
│ ├── guides/
│ │ ├── architecture_overview.mdx
│ │ ├── avoid_blocking.mdx
│ │ ├── code_examples/
│ │ │ ├── avoid_blocking/
│ │ │ │ ├── default_fingerprint_generator_with_args.py
│ │ │ │ └── playwright_with_fingerprint_generator.py
│ │ │ ├── creating_web_archive/
│ │ │ │ ├── manual_archiving_parsel_crawler.py
│ │ │ │ ├── manual_archiving_playwright_crawler.py
│ │ │ │ └── simple_pw_through_proxy_pywb_server.py
│ │ │ ├── error_handling/
│ │ │ │ ├── change_handle_error_status.py
│ │ │ │ ├── disable_retry.py
│ │ │ │ └── handle_proxy_error.py
│ │ │ ├── http_clients/
│ │ │ │ ├── parsel_curl_impersonate_example.py
│ │ │ │ ├── parsel_httpx_example.py
│ │ │ │ └── parsel_impit_example.py
│ │ │ ├── http_crawlers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── beautifulsoup_example.py
│ │ │ │ ├── custom_crawler_example.py
│ │ │ │ ├── http_example.py
│ │ │ │ ├── lexbor_parser.py
│ │ │ │ ├── lxml_parser.py
│ │ │ │ ├── lxml_saxonche_parser.py
│ │ │ │ ├── parsel_example.py
│ │ │ │ ├── pyquery_parser.py
│ │ │ │ ├── scrapling_parser.py
│ │ │ │ ├── selectolax_adaptive_run.py
│ │ │ │ ├── selectolax_context.py
│ │ │ │ ├── selectolax_crawler.py
│ │ │ │ ├── selectolax_crawler_run.py
│ │ │ │ └── selectolax_parser.py
│ │ │ ├── login_crawler/
│ │ │ │ ├── http_login.py
│ │ │ │ └── playwright_login.py
│ │ │ ├── playwright_crawler/
│ │ │ │ ├── browser_configuration_example.py
│ │ │ │ ├── browser_pool_page_hooks_example.py
│ │ │ │ ├── multiple_launch_example.py
│ │ │ │ ├── navigation_hooks_example.py
│ │ │ │ └── plugin_browser_configuration_example.py
│ │ │ ├── playwright_crawler_adaptive/
│ │ │ │ ├── handler.py
│ │ │ │ ├── init_beautifulsoup.py
│ │ │ │ ├── init_parsel.py
│ │ │ │ ├── init_prediction.py
│ │ │ │ └── pre_nav_hooks.py
│ │ │ ├── playwright_crawler_stagehand/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── browser_classes.py
│ │ │ │ ├── stagehand_run.py
│ │ │ │ └── support_classes.py
│ │ │ ├── proxy_management/
│ │ │ │ ├── inspecting_bs_example.py
│ │ │ │ ├── inspecting_pw_example.py
│ │ │ │ ├── integration_bs_example.py
│ │ │ │ ├── integration_pw_example.py
│ │ │ │ ├── quick_start_example.py
│ │ │ │ ├── session_bs_example.py
│ │ │ │ ├── session_pw_example.py
│ │ │ │ ├── tiers_bs_example.py
│ │ │ │ └── tiers_pw_example.py
│ │ │ ├── request_loaders/
│ │ │ │ ├── rl_basic_example.py
│ │ │ │ ├── rl_basic_example_with_persist.py
│ │ │ │ ├── rl_tandem_example.py
│ │ │ │ ├── rl_tandem_example_explicit.py
│ │ │ │ ├── sitemap_basic_example.py
│ │ │ │ ├── sitemap_example_with_persist.py
│ │ │ │ ├── sitemap_tandem_example.py
│ │ │ │ └── sitemap_tandem_example_explicit.py
│ │ │ ├── request_router/
│ │ │ │ ├── adaptive_crawler_handlers.py
│ │ │ │ ├── basic_request_handlers.py
│ │ │ │ ├── custom_router_default_only.py
│ │ │ │ ├── error_handler.py
│ │ │ │ ├── failed_request_handler.py
│ │ │ │ ├── http_pre_navigation.py
│ │ │ │ ├── playwright_pre_navigation.py
│ │ │ │ └── simple_default_handler.py
│ │ │ ├── running_in_web_server/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── crawler.py
│ │ │ │ └── server.py
│ │ │ ├── scaling_crawlers/
│ │ │ │ ├── max_tasks_per_minute_example.py
│ │ │ │ └── min_and_max_concurrency_example.py
│ │ │ ├── service_locator/
│ │ │ │ ├── service_conflicts.py
│ │ │ │ ├── service_crawler_configuration.py
│ │ │ │ ├── service_crawler_event_manager.py
│ │ │ │ ├── service_crawler_storage_client.py
│ │ │ │ ├── service_locator_configuration.py
│ │ │ │ ├── service_locator_event_manager.py
│ │ │ │ ├── service_locator_storage_client.py
│ │ │ │ ├── service_storage_configuration.py
│ │ │ │ └── service_storage_storage_client.py
│ │ │ ├── session_management/
│ │ │ │ ├── multi_sessions_http.py
│ │ │ │ ├── one_session_http.py
│ │ │ │ ├── sm_basic.py
│ │ │ │ ├── sm_beautifulsoup.py
│ │ │ │ ├── sm_http.py
│ │ │ │ ├── sm_parsel.py
│ │ │ │ ├── sm_playwright.py
│ │ │ │ └── sm_standalone.py
│ │ │ ├── storage_clients/
│ │ │ │ ├── custom_storage_client_example.py
│ │ │ │ ├── file_system_storage_client_basic_example.py
│ │ │ │ ├── file_system_storage_client_configuration_example.py
│ │ │ │ ├── memory_storage_client_basic_example.py
│ │ │ │ ├── redis_storage_client_basic_example.py
│ │ │ │ ├── redis_storage_client_configuration_example.py
│ │ │ │ ├── registering_storage_clients_example.py
│ │ │ │ ├── sql_storage_client_basic_example.py
│ │ │ │ └── sql_storage_client_configuration_example.py
│ │ │ ├── storages/
│ │ │ │ ├── cleaning_do_not_purge_example.py
│ │ │ │ ├── cleaning_purge_explicitly_example.py
│ │ │ │ ├── dataset_basic_example.py
│ │ │ │ ├── dataset_with_crawler_example.py
│ │ │ │ ├── dataset_with_crawler_explicit_example.py
│ │ │ │ ├── helper_add_requests_example.py
│ │ │ │ ├── helper_enqueue_links_example.py
│ │ │ │ ├── kvs_basic_example.py
│ │ │ │ ├── kvs_with_crawler_example.py
│ │ │ │ ├── kvs_with_crawler_explicit_example.py
│ │ │ │ ├── opening.py
│ │ │ │ ├── rq_basic_example.py
│ │ │ │ ├── rq_with_crawler_example.py
│ │ │ │ └── rq_with_crawler_explicit_example.py
│ │ │ └── trace_and_monitor_crawlers/
│ │ │ └── instrument_crawler.py
│ │ ├── crawler_login.mdx
│ │ ├── creating_web_archive.mdx
│ │ ├── error_handling.mdx
│ │ ├── http_clients.mdx
│ │ ├── http_crawlers.mdx
│ │ ├── playwright_crawler.mdx
│ │ ├── playwright_crawler_adaptive.mdx
│ │ ├── playwright_crawler_stagehand.mdx
│ │ ├── proxy_management.mdx
│ │ ├── request_loaders.mdx
│ │ ├── request_router.mdx
│ │ ├── running_in_web_server.mdx
│ │ ├── scaling_crawlers.mdx
│ │ ├── service_locator.mdx
│ │ ├── session_management.mdx
│ │ ├── storage_clients.mdx
│ │ ├── storages.mdx
│ │ └── trace_and_monitor_crawlers.mdx
│ ├── introduction/
│ │ ├── 01_setting_up.mdx
│ │ ├── 02_first_crawler.mdx
│ │ ├── 03_adding_more_urls.mdx
│ │ ├── 04_real_world_project.mdx
│ │ ├── 05_crawling.mdx
│ │ ├── 06_scraping.mdx
│ │ ├── 07_saving_data.mdx
│ │ ├── 08_refactoring.mdx
│ │ ├── 09_running_in_cloud.mdx
│ │ ├── code_examples/
│ │ │ ├── 02_bs.py
│ │ │ ├── 02_bs_better.py
│ │ │ ├── 02_request_queue.py
│ │ │ ├── 03_enqueue_strategy.py
│ │ │ ├── 03_finding_new_links.py
│ │ │ ├── 03_globs.py
│ │ │ ├── 03_original_code.py
│ │ │ ├── 03_transform_request.py
│ │ │ ├── 04_sanity_check.py
│ │ │ ├── 05_crawling_detail.py
│ │ │ ├── 05_crawling_listing.py
│ │ │ ├── 06_scraping.py
│ │ │ ├── 07_final_code.py
│ │ │ ├── 07_first_code.py
│ │ │ ├── 08_main.py
│ │ │ ├── 08_routes.py
│ │ │ ├── 09_apify_sdk.py
│ │ │ ├── __init__.py
│ │ │ └── routes.py
│ │ └── index.mdx
│ ├── pyproject.toml
│ ├── quick-start/
│ │ ├── code_examples/
│ │ │ ├── beautifulsoup_crawler_example.py
│ │ │ ├── parsel_crawler_example.py
│ │ │ ├── playwright_crawler_example.py
│ │ │ └── playwright_crawler_headful_example.py
│ │ └── index.mdx
│ └── upgrading/
│ ├── upgrading_to_v0x.md
│ └── upgrading_to_v1.md
├── pyproject.toml
├── renovate.json
├── src/
│ └── crawlee/
│ ├── __init__.py
│ ├── _autoscaling/
│ │ ├── __init__.py
│ │ ├── _types.py
│ │ ├── autoscaled_pool.py
│ │ ├── py.typed
│ │ ├── snapshotter.py
│ │ └── system_status.py
│ ├── _cli.py
│ ├── _consts.py
│ ├── _log_config.py
│ ├── _request.py
│ ├── _service_locator.py
│ ├── _types.py
│ ├── _utils/
│ │ ├── __init__.py
│ │ ├── blocked.py
│ │ ├── byte_size.py
│ │ ├── console.py
│ │ ├── context.py
│ │ ├── crypto.py
│ │ ├── docs.py
│ │ ├── file.py
│ │ ├── globs.py
│ │ ├── html_to_text.py
│ │ ├── models.py
│ │ ├── raise_if_too_many_kwargs.py
│ │ ├── recoverable_state.py
│ │ ├── recurring_task.py
│ │ ├── requests.py
│ │ ├── robots.py
│ │ ├── sitemap.py
│ │ ├── system.py
│ │ ├── time.py
│ │ ├── try_import.py
│ │ ├── urls.py
│ │ ├── wait.py
│ │ └── web.py
│ ├── browsers/
│ │ ├── __init__.py
│ │ ├── _browser_controller.py
│ │ ├── _browser_plugin.py
│ │ ├── _browser_pool.py
│ │ ├── _playwright_browser.py
│ │ ├── _playwright_browser_controller.py
│ │ ├── _playwright_browser_plugin.py
│ │ ├── _types.py
│ │ └── py.typed
│ ├── configuration.py
│ ├── crawlers/
│ │ ├── __init__.py
│ │ ├── _abstract_http/
│ │ │ ├── __init__.py
│ │ │ ├── _abstract_http_crawler.py
│ │ │ ├── _abstract_http_parser.py
│ │ │ ├── _http_crawling_context.py
│ │ │ └── py.typed
│ │ ├── _adaptive_playwright/
│ │ │ ├── __init__.py
│ │ │ ├── _adaptive_playwright_crawler.py
│ │ │ ├── _adaptive_playwright_crawler_statistics.py
│ │ │ ├── _adaptive_playwright_crawling_context.py
│ │ │ ├── _rendering_type_predictor.py
│ │ │ ├── _result_comparator.py
│ │ │ └── _utils.py
│ │ ├── _basic/
│ │ │ ├── __init__.py
│ │ │ ├── _basic_crawler.py
│ │ │ ├── _basic_crawling_context.py
│ │ │ ├── _context_pipeline.py
│ │ │ ├── _context_utils.py
│ │ │ ├── _logging_utils.py
│ │ │ └── py.typed
│ │ ├── _beautifulsoup/
│ │ │ ├── __init__.py
│ │ │ ├── _beautifulsoup_crawler.py
│ │ │ ├── _beautifulsoup_crawling_context.py
│ │ │ ├── _beautifulsoup_parser.py
│ │ │ ├── _utils.py
│ │ │ └── py.typed
│ │ ├── _http/
│ │ │ ├── __init__.py
│ │ │ ├── _http_crawler.py
│ │ │ └── _http_parser.py
│ │ ├── _parsel/
│ │ │ ├── __init__.py
│ │ │ ├── _parsel_crawler.py
│ │ │ ├── _parsel_crawling_context.py
│ │ │ ├── _parsel_parser.py
│ │ │ └── _utils.py
│ │ ├── _playwright/
│ │ │ ├── __init__.py
│ │ │ ├── _playwright_crawler.py
│ │ │ ├── _playwright_crawling_context.py
│ │ │ ├── _playwright_http_client.py
│ │ │ ├── _playwright_post_nav_crawling_context.py
│ │ │ ├── _playwright_pre_nav_crawling_context.py
│ │ │ ├── _types.py
│ │ │ └── _utils.py
│ │ ├── _types.py
│ │ └── py.typed
│ ├── errors.py
│ ├── events/
│ │ ├── __init__.py
│ │ ├── _event_manager.py
│ │ ├── _local_event_manager.py
│ │ ├── _types.py
│ │ └── py.typed
│ ├── fingerprint_suite/
│ │ ├── __init__.py
│ │ ├── _browserforge_adapter.py
│ │ ├── _consts.py
│ │ ├── _fingerprint_generator.py
│ │ ├── _header_generator.py
│ │ ├── _types.py
│ │ └── py.typed
│ ├── http_clients/
│ │ ├── __init__.py
│ │ ├── _base.py
│ │ ├── _curl_impersonate.py
│ │ ├── _httpx.py
│ │ └── _impit.py
│ ├── otel/
│ │ ├── __init__.py
│ │ └── crawler_instrumentor.py
│ ├── project_template/
│ │ ├── cookiecutter.json
│ │ ├── hooks/
│ │ │ ├── post_gen_project.py
│ │ │ └── pre_gen_project.py
│ │ ├── templates/
│ │ │ ├── main.py
│ │ │ ├── main_beautifulsoup.py
│ │ │ ├── main_parsel.py
│ │ │ ├── main_playwright.py
│ │ │ ├── main_playwright_camoufox.py
│ │ │ ├── main_playwright_chrome.py
│ │ │ ├── main_playwright_firefox.py
│ │ │ ├── main_playwright_webkit.py
│ │ │ ├── routes_beautifulsoup.py
│ │ │ ├── routes_parsel.py
│ │ │ └── routes_playwright.py
│ │ └── {{cookiecutter.project_name}}/
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ ├── requirements.txt
│ │ └── {{cookiecutter.__package_name}}/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── main.py
│ │ └── routes.py
│ ├── proxy_configuration.py
│ ├── py.typed
│ ├── request_loaders/
│ │ ├── __init__.py
│ │ ├── _request_list.py
│ │ ├── _request_loader.py
│ │ ├── _request_manager.py
│ │ ├── _request_manager_tandem.py
│ │ └── _sitemap_request_loader.py
│ ├── router.py
│ ├── sessions/
│ │ ├── __init__.py
│ │ ├── _cookies.py
│ │ ├── _models.py
│ │ ├── _session.py
│ │ ├── _session_pool.py
│ │ └── py.typed
│ ├── statistics/
│ │ ├── __init__.py
│ │ ├── _error_snapshotter.py
│ │ ├── _error_tracker.py
│ │ ├── _models.py
│ │ └── _statistics.py
│ ├── storage_clients/
│ │ ├── __init__.py
│ │ ├── _base/
│ │ │ ├── __init__.py
│ │ │ ├── _dataset_client.py
│ │ │ ├── _key_value_store_client.py
│ │ │ ├── _request_queue_client.py
│ │ │ ├── _storage_client.py
│ │ │ └── py.typed
│ │ ├── _file_system/
│ │ │ ├── __init__.py
│ │ │ ├── _dataset_client.py
│ │ │ ├── _key_value_store_client.py
│ │ │ ├── _request_queue_client.py
│ │ │ ├── _storage_client.py
│ │ │ ├── _utils.py
│ │ │ └── py.typed
│ │ ├── _memory/
│ │ │ ├── __init__.py
│ │ │ ├── _dataset_client.py
│ │ │ ├── _key_value_store_client.py
│ │ │ ├── _request_queue_client.py
│ │ │ ├── _storage_client.py
│ │ │ └── py.typed
│ │ ├── _redis/
│ │ │ ├── __init__.py
│ │ │ ├── _client_mixin.py
│ │ │ ├── _dataset_client.py
│ │ │ ├── _key_value_store_client.py
│ │ │ ├── _request_queue_client.py
│ │ │ ├── _storage_client.py
│ │ │ ├── _utils.py
│ │ │ ├── lua_scripts/
│ │ │ │ ├── atomic_bloom_add_requests.lua
│ │ │ │ ├── atomic_fetch_request.lua
│ │ │ │ ├── atomic_set_add_requests.lua
│ │ │ │ └── reclaim_stale_requests.lua
│ │ │ └── py.typed
│ │ ├── _sql/
│ │ │ ├── __init__.py
│ │ │ ├── _client_mixin.py
│ │ │ ├── _dataset_client.py
│ │ │ ├── _db_models.py
│ │ │ ├── _key_value_store_client.py
│ │ │ ├── _request_queue_client.py
│ │ │ ├── _storage_client.py
│ │ │ └── py.typed
│ │ ├── models.py
│ │ └── py.typed
│ └── storages/
│ ├── __init__.py
│ ├── _base.py
│ ├── _dataset.py
│ ├── _key_value_store.py
│ ├── _request_queue.py
│ ├── _storage_instance_manager.py
│ ├── _utils.py
│ └── py.typed
├── tests/
│ ├── __init__.py
│ ├── e2e/
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ └── project_template/
│ │ ├── test_static_crawlers_templates.py
│ │ └── utils.py
│ └── unit/
│ ├── README.md
│ ├── __init__.py
│ ├── _autoscaling/
│ │ ├── test_autoscaled_pool.py
│ │ ├── test_snapshotter.py
│ │ └── test_system_status.py
│ ├── _statistics/
│ │ ├── test_error_tracker.py
│ │ ├── test_periodic_logging.py
│ │ ├── test_persistence.py
│ │ ├── test_request_max_duration.py
│ │ └── test_request_processing_record.py
│ ├── _utils/
│ │ ├── test_byte_size.py
│ │ ├── test_console.py
│ │ ├── test_crypto.py
│ │ ├── test_file.py
│ │ ├── test_globs.py
│ │ ├── test_html_to_text.py
│ │ ├── test_measure_time.py
│ │ ├── test_raise_if_too_many_kwargs.py
│ │ ├── test_recurring_task.py
│ │ ├── test_requests.py
│ │ ├── test_robots.py
│ │ ├── test_shared_timeout.py
│ │ ├── test_sitemap.py
│ │ ├── test_system.py
│ │ ├── test_timedelta_ms.py
│ │ └── test_urls.py
│ ├── browsers/
│ │ ├── test_browser_pool.py
│ │ ├── test_playwright_browser.py
│ │ ├── test_playwright_browser_controller.py
│ │ └── test_playwright_browser_plugin.py
│ ├── conftest.py
│ ├── crawlers/
│ │ ├── _adaptive_playwright/
│ │ │ ├── test_adaptive_playwright_crawler.py
│ │ │ ├── test_adaptive_playwright_crawler_statistics.py
│ │ │ ├── test_adaptive_playwright_crawling_context.py
│ │ │ └── test_predictor.py
│ │ ├── _basic/
│ │ │ ├── test_basic_crawler.py
│ │ │ └── test_context_pipeline.py
│ │ ├── _beautifulsoup/
│ │ │ └── test_beautifulsoup_crawler.py
│ │ ├── _http/
│ │ │ └── test_http_crawler.py
│ │ ├── _parsel/
│ │ │ └── test_parsel_crawler.py
│ │ └── _playwright/
│ │ ├── test_playwright_crawler.py
│ │ └── test_utils.py
│ ├── events/
│ │ ├── test_event_manager.py
│ │ └── test_local_event_manager.py
│ ├── fingerprint_suite/
│ │ ├── test_adapters.py
│ │ └── test_header_generator.py
│ ├── http_clients/
│ │ ├── test_http_clients.py
│ │ └── test_httpx.py
│ ├── otel/
│ │ └── test_crawler_instrumentor.py
│ ├── proxy_configuration/
│ │ ├── test_new_proxy_info.py
│ │ └── test_tiers.py
│ ├── request_loaders/
│ │ ├── test_request_list.py
│ │ └── test_sitemap_request_loader.py
│ ├── server.py
│ ├── server_endpoints.py
│ ├── server_static/
│ │ └── test.js
│ ├── sessions/
│ │ ├── test_cookies.py
│ │ ├── test_models.py
│ │ ├── test_session.py
│ │ └── test_session_pool.py
│ ├── storage_clients/
│ │ ├── _file_system/
│ │ │ ├── test_fs_dataset_client.py
│ │ │ ├── test_fs_kvs_client.py
│ │ │ └── test_fs_rq_client.py
│ │ ├── _memory/
│ │ │ ├── test_memory_dataset_client.py
│ │ │ ├── test_memory_kvs_client.py
│ │ │ └── test_memory_rq_client.py
│ │ ├── _redis/
│ │ │ ├── test_redis_dataset_client.py
│ │ │ ├── test_redis_kvs_client.py
│ │ │ └── test_redis_rq_client.py
│ │ └── _sql/
│ │ ├── test_sql_dataset_client.py
│ │ ├── test_sql_kvs_client.py
│ │ └── test_sql_rq_client.py
│ ├── storages/
│ │ ├── conftest.py
│ │ ├── test_dataset.py
│ │ ├── test_key_value_store.py
│ │ ├── test_request_manager_tandem.py
│ │ ├── test_request_queue.py
│ │ └── test_storage_instance_manager.py
│ ├── test_cli.py
│ ├── test_configuration.py
│ ├── test_log_config.py
│ ├── test_router.py
│ ├── test_service_locator.py
│ └── utils.py
├── typos.toml
└── website/
├── .eslintrc.json
├── .yarnrc.yml
├── babel.config.js
├── build_api_reference.sh
├── docusaurus.config.js
├── generate_module_shortcuts.py
├── package.json
├── patches/
│ ├── @docusaurus+core+3.4.0.patch
│ └── @docusaurus+core+3.5.2.patch
├── roa-loader/
│ ├── index.js
│ └── package.json
├── sidebars.js
├── src/
│ ├── components/
│ │ ├── ApiLink.jsx
│ │ ├── Button.jsx
│ │ ├── Button.module.css
│ │ ├── CopyButton.jsx
│ │ ├── CopyButton.module.css
│ │ ├── Gradients.jsx
│ │ ├── Highlights.jsx
│ │ ├── Highlights.module.css
│ │ ├── Homepage/
│ │ │ ├── HomepageCliExample.jsx
│ │ │ ├── HomepageCliExample.module.css
│ │ │ ├── HomepageCtaSection.jsx
│ │ │ ├── HomepageCtaSection.module.css
│ │ │ ├── HomepageHeroSection.jsx
│ │ │ ├── HomepageHeroSection.module.css
│ │ │ ├── LanguageInfoWidget.jsx
│ │ │ ├── LanguageInfoWidget.module.css
│ │ │ ├── LanguageSwitch.jsx
│ │ │ ├── LanguageSwitch.module.css
│ │ │ ├── RiverSection.jsx
│ │ │ ├── RiverSection.module.css
│ │ │ ├── ThreeCardsWithIcon.jsx
│ │ │ └── ThreeCardsWithIcon.module.css
│ │ ├── LLMButtons.jsx
│ │ ├── LLMButtons.module.css
│ │ ├── RunnableCodeBlock.jsx
│ │ └── RunnableCodeBlock.module.css
│ ├── css/
│ │ └── custom.css
│ ├── pages/
│ │ ├── home_page_example.py
│ │ ├── index.js
│ │ └── index.module.css
│ ├── plugins/
│ │ └── docusaurus-plugin-segment/
│ │ ├── index.js
│ │ └── segment.js
│ └── theme/
│ ├── ColorModeToggle/
│ │ ├── index.js
│ │ └── styles.module.css
│ ├── DocItem/
│ │ ├── Content/
│ │ │ ├── index.js
│ │ │ └── styles.module.css
│ │ └── Layout/
│ │ ├── index.js
│ │ └── styles.module.css
│ ├── Footer/
│ │ ├── LinkItem/
│ │ │ ├── index.js
│ │ │ └── index.module.css
│ │ ├── index.js
│ │ └── index.module.css
│ ├── MDXComponents/
│ │ └── A.js
│ ├── Navbar/
│ │ ├── Content/
│ │ │ ├── index.js
│ │ │ └── styles.module.css
│ │ ├── Logo/
│ │ │ ├── index.js
│ │ │ └── index.module.css
│ │ └── MobileSidebar/
│ │ ├── Header/
│ │ │ ├── index.js
│ │ │ └── index.module.css
│ │ ├── Layout/
│ │ │ └── index.js
│ │ ├── PrimaryMenu/
│ │ │ └── index.js
│ │ └── index.js
│ └── NavbarItem/
│ └── ComponentTypes.js
├── static/
│ ├── .nojekyll
│ ├── js/
│ │ └── custom.js
│ └── robots.txt
├── tools/
│ ├── docs-prettier.config.js
│ ├── utils/
│ │ └── externalLink.js
│ └── website_gif/
│ └── website_gif.mjs
└── tsconfig.eslint.json
Showing preview only (257K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (2556 symbols across 422 files)
FILE: docs/deployment/code_examples/apify/crawler_as_actor_example.py
function main (line 8) | async def main() -> None:
FILE: docs/deployment/code_examples/apify/get_public_url.py
function main (line 6) | async def main() -> None:
FILE: docs/deployment/code_examples/apify/log_with_config_example.py
function main (line 6) | async def main() -> None:
FILE: docs/deployment/code_examples/apify/proxy_advanced_example.py
function main (line 6) | async def main() -> None:
FILE: docs/deployment/code_examples/apify/proxy_example.py
function main (line 6) | async def main() -> None:
FILE: docs/deployment/code_examples/aws/beautifulsoup_crawler_lambda.py
function main (line 13) | async def main() -> str:
function lambda_handler (line 58) | def lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> d...
FILE: docs/deployment/code_examples/aws/playwright_crawler_lambda.py
function main (line 13) | async def main() -> str:
function lambda_handler (line 70) | def lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> d...
FILE: docs/deployment/code_examples/google/cloud_run_example.py
function main (line 12) | async def main() -> str:
FILE: docs/deployment/code_examples/google/google_example.py
function main (line 12) | async def main() -> str:
function crawlee_run (line 51) | def crawlee_run(request: Request) -> Response:
FILE: docs/examples/code_examples/adaptive_playwright_crawler.py
function main (line 13) | async def main() -> None:
FILE: docs/examples/code_examples/add_data_to_dataset_bs.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/add_data_to_dataset_dataset.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/add_data_to_dataset_pw.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/beautifulsoup_crawler.py
function main (line 11) | async def main() -> None:
FILE: docs/examples/code_examples/beautifulsoup_crawler_keep_alive.py
function main (line 7) | async def main() -> None:
FILE: docs/examples/code_examples/beautifulsoup_crawler_stop.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/capture_screenshot_using_playwright.py
function main (line 7) | async def main() -> None:
FILE: docs/examples/code_examples/configure_json_logging.py
class InterceptHandler (line 18) | class InterceptHandler(logging.Handler):
method emit (line 19) | def emit(self, record: logging.LogRecord) -> None:
function formatter (line 54) | def formatter(record: Record) -> str:
function main (line 74) | async def main() -> None:
FILE: docs/examples/code_examples/crawl_all_links_on_website_bs.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/crawl_all_links_on_website_pw.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/crawl_multiple_urls_bs.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/crawl_multiple_urls_pw.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/crawl_specific_links_on_website_bs.py
function main (line 7) | async def main() -> None:
FILE: docs/examples/code_examples/crawl_specific_links_on_website_pw.py
function main (line 7) | async def main() -> None:
FILE: docs/examples/code_examples/crawl_website_with_relative_links_all_links.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/crawl_website_with_relative_links_same_domain.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/crawl_website_with_relative_links_same_hostname.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/crawl_website_with_relative_links_same_origin.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/export_entire_dataset_to_file_csv.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/export_entire_dataset_to_file_json.py
function main (line 6) | async def main() -> None:
FILE: docs/examples/code_examples/extract_and_add_specific_links_on_website_bs.py
function main (line 7) | async def main() -> None:
FILE: docs/examples/code_examples/extract_and_add_specific_links_on_website_pw.py
function main (line 7) | async def main() -> None:
FILE: docs/examples/code_examples/fill_and_submit_web_form_crawler.py
function main (line 8) | async def main() -> None:
FILE: docs/examples/code_examples/fill_and_submit_web_form_request.py
function main (line 7) | async def main() -> None:
FILE: docs/examples/code_examples/parsel_crawler.py
function main (line 9) | async def main() -> None:
FILE: docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py
function main (line 8) | async def main() -> None:
FILE: docs/examples/code_examples/playwright_block_requests.py
function main (line 10) | async def main() -> None:
FILE: docs/examples/code_examples/playwright_crawler.py
function main (line 10) | async def main() -> None:
FILE: docs/examples/code_examples/playwright_crawler_with_camoufox.py
class CamoufoxPlugin (line 15) | class CamoufoxPlugin(PlaywrightBrowserPlugin):
method new_browser (line 21) | async def new_browser(self) -> PlaywrightBrowserController:
function main (line 36) | async def main() -> None:
FILE: docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py
function main (line 8) | async def main() -> None:
FILE: docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py
function main (line 11) | async def main() -> None:
FILE: docs/examples/code_examples/respect_robots_on_skipped_request.py
function main (line 10) | async def main() -> None:
FILE: docs/examples/code_examples/respect_robots_txt_file.py
function main (line 9) | async def main() -> None:
FILE: docs/examples/code_examples/resuming_paused_crawl.py
function main (line 17) | async def main() -> None:
FILE: docs/examples/code_examples/run_parallel_crawlers.py
function main (line 14) | async def main() -> None:
FILE: docs/examples/code_examples/using_browser_profiles_chrome.py
function main (line 16) | async def main() -> None:
FILE: docs/examples/code_examples/using_browser_profiles_firefox.py
function main (line 17) | async def main() -> None:
FILE: docs/examples/code_examples/using_sitemap_request_loader.py
function create_transform_request (line 14) | def create_transform_request(
function main (line 38) | async def main() -> None:
FILE: docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py
function main (line 10) | async def main() -> None:
FILE: docs/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py
function archive_response (line 11) | async def archive_response(context: ParselCrawlingContext, writer: WARCW...
function main (line 32) | async def main() -> None:
FILE: docs/guides/code_examples/creating_web_archive/manual_archiving_playwright_crawler.py
function archive_response (line 18) | async def archive_response(
function main (line 46) | async def main() -> None:
FILE: docs/guides/code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/error_handling/change_handle_error_status.py
function main (line 14) | async def main() -> None:
FILE: docs/guides/code_examples/error_handling/disable_retry.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/error_handling/handle_proxy_error.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/http_clients/parsel_curl_impersonate_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/http_clients/parsel_httpx_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/http_clients/parsel_impit_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/beautifulsoup_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/http_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/lexbor_parser.py
function main (line 11) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/lxml_parser.py
function main (line 10) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py
function main (line 11) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/parsel_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/pyquery_parser.py
function main (line 11) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/scrapling_parser.py
function main (line 11) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py
function main (line 11) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/selectolax_context.py
class SelectolaxLexborContext (line 12) | class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]):
method parser (line 20) | def parser(self) -> LexborHTMLParser:
method from_parsed_http_crawling_context (line 25) | def from_parsed_http_crawling_context(
FILE: docs/guides/code_examples/http_crawlers/selectolax_crawler.py
class SelectolaxLexborCrawler (line 23) | class SelectolaxLexborCrawler(
method __init__ (line 28) | def __init__(
FILE: docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/http_crawlers/selectolax_parser.py
class SelectolaxLexborParser (line 17) | class SelectolaxLexborParser(AbstractHttpParser[LexborHTMLParser, Lexbor...
method parse (line 21) | async def parse(self, response: HttpResponse) -> LexborHTMLParser:
method parse_text (line 28) | async def parse_text(self, text: str) -> LexborHTMLParser:
method select (line 33) | async def select(
method is_matching_selector (line 40) | def is_matching_selector(
method find_links (line 47) | def find_links(
FILE: docs/guides/code_examples/login_crawler/http_login.py
function main (line 13) | async def main() -> None:
FILE: docs/guides/code_examples/login_crawler/playwright_login.py
function main (line 12) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler/browser_configuration_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler/browser_pool_page_hooks_example.py
function main (line 19) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler/multiple_launch_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler/navigation_hooks_example.py
function main (line 12) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler_adaptive/handler.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler_adaptive/init_parsel.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler_adaptive/init_prediction.py
class CustomRenderingTypePredictor (line 13) | class CustomRenderingTypePredictor(RenderingTypePredictor):
method __init__ (line 14) | def __init__(self) -> None:
method predict (line 19) | def predict(self, request: Request) -> RenderingTypePrediction:
method store_result (line 36) | def store_result(self, request: Request, rendering_type: RenderingType...
function result_checker (line 44) | def result_checker(result: RequestHandlerRunResult) -> bool:
function result_comparator (line 50) | def result_comparator(
function main (line 61) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py
function main (line 11) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py
class StagehandBrowserController (line 26) | class StagehandBrowserController(PlaywrightBrowserController):
method __init__ (line 28) | def __init__(
method new_page (line 38) | async def new_page(
class StagehandPlugin (line 74) | class StagehandPlugin(PlaywrightBrowserPlugin):
method __init__ (line 78) | def __init__(self, stagehand: Stagehand, **kwargs: Any) -> None:
method new_browser (line 84) | async def new_browser(self) -> StagehandBrowserController:
FILE: docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py
function main (line 17) | async def main() -> None:
FILE: docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py
class CrawleeStagehandPage (line 11) | class CrawleeStagehandPage:
method __init__ (line 14) | def __init__(self, page: StagehandPage) -> None:
method goto (line 17) | async def goto(
method __getattr__ (line 34) | def __getattr__(self, name: str) -> Any:
method __aenter__ (line 38) | async def __aenter__(self) -> CrawleeStagehandPage:
method __aexit__ (line 42) | async def __aexit__(
class CrawleeStagehand (line 51) | class CrawleeStagehand(Stagehand):
method init (line 54) | async def init(self) -> None:
FILE: docs/guides/code_examples/proxy_management/inspecting_bs_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/proxy_management/inspecting_pw_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/proxy_management/integration_bs_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/proxy_management/integration_pw_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/proxy_management/quick_start_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/proxy_management/session_bs_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/proxy_management/session_pw_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/proxy_management/tiers_bs_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/proxy_management/tiers_pw_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/request_loaders/rl_basic_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/request_loaders/rl_basic_example_with_persist.py
function main (line 19) | async def main() -> None:
FILE: docs/guides/code_examples/request_loaders/rl_tandem_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/request_loaders/sitemap_basic_example.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/request_loaders/sitemap_example_with_persist.py
function main (line 20) | async def main() -> None:
FILE: docs/guides/code_examples/request_loaders/sitemap_tandem_example.py
function main (line 9) | async def main() -> None:
FILE: docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py
function main (line 10) | async def main() -> None:
FILE: docs/guides/code_examples/request_router/adaptive_crawler_handlers.py
function main (line 11) | async def main() -> None:
FILE: docs/guides/code_examples/request_router/basic_request_handlers.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/request_router/custom_router_default_only.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/request_router/error_handler.py
function main (line 10) | async def main() -> None:
FILE: docs/guides/code_examples/request_router/failed_request_handler.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/request_router/http_pre_navigation.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/request_router/playwright_pre_navigation.py
function main (line 10) | async def main() -> None:
FILE: docs/guides/code_examples/request_router/simple_default_handler.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/running_in_web_server/crawler.py
class State (line 11) | class State(TypedDict):
function lifespan (line 19) | async def lifespan(app: FastAPI) -> AsyncIterator[State]:
FILE: docs/guides/code_examples/running_in_web_server/server.py
function index (line 18) | def index() -> str:
function scrape_url (line 36) | async def scrape_url(request: Request, url: str | None = None) -> dict:
FILE: docs/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/service_locator/service_conflicts.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/service_locator/service_crawler_configuration.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/service_locator/service_crawler_event_manager.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/service_locator/service_crawler_storage_client.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/service_locator/service_locator_configuration.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/service_locator/service_locator_event_manager.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/service_locator/service_locator_storage_client.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/service_locator/service_storage_configuration.py
function main (line 10) | async def main() -> None:
FILE: docs/guides/code_examples/service_locator/service_storage_storage_client.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/session_management/multi_sessions_http.py
function create_session_function (line 15) | def create_session_function() -> Callable[[], Session]:
function main (line 30) | async def main() -> None:
FILE: docs/guides/code_examples/session_management/one_session_http.py
function main (line 10) | async def main() -> None:
FILE: docs/guides/code_examples/session_management/sm_basic.py
function main (line 9) | async def main() -> None:
FILE: docs/guides/code_examples/session_management/sm_beautifulsoup.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/session_management/sm_http.py
function main (line 9) | async def main() -> None:
FILE: docs/guides/code_examples/session_management/sm_parsel.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/session_management/sm_playwright.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/session_management/sm_standalone.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storage_clients/custom_storage_client_example.py
class CustomDatasetClient (line 18) | class CustomDatasetClient(DatasetClient):
class CustomKeyValueStoreClient (line 23) | class CustomKeyValueStoreClient(KeyValueStoreClient):
class CustomRequestQueueClient (line 28) | class CustomRequestQueueClient(RequestQueueClient):
class CustomStorageClient (line 36) | class CustomStorageClient(StorageClient):
method create_dataset_client (line 37) | async def create_dataset_client(
method create_kvs_client (line 47) | async def create_kvs_client(
method create_rq_client (line 57) | async def create_rq_client(
FILE: docs/guides/code_examples/storage_clients/registering_storage_clients_example.py
function main (line 9) | async def main() -> None:
FILE: docs/guides/code_examples/storage_clients/sql_storage_client_basic_example.py
function main (line 5) | async def main() -> None:
FILE: docs/guides/code_examples/storage_clients/sql_storage_client_configuration_example.py
function main (line 8) | async def main() -> None:
FILE: docs/guides/code_examples/storages/cleaning_do_not_purge_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storages/dataset_basic_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storages/dataset_with_crawler_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/storages/helper_add_requests_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storages/helper_enqueue_links_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storages/kvs_basic_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storages/kvs_with_crawler_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/storages/opening.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storages/rq_basic_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storages/rq_with_crawler_example.py
function main (line 6) | async def main() -> None:
FILE: docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py
function main (line 7) | async def main() -> None:
FILE: docs/guides/code_examples/trace_and_monitor_crawlers/instrument_crawler.py
function instrument_crawler (line 14) | def instrument_crawler() -> None:
function main (line 35) | async def main() -> None:
FILE: docs/introduction/code_examples/02_bs.py
function main (line 8) | async def main() -> None:
FILE: docs/introduction/code_examples/02_bs_better.py
function main (line 7) | async def main() -> None:
FILE: docs/introduction/code_examples/02_request_queue.py
function main (line 6) | async def main() -> None:
FILE: docs/introduction/code_examples/03_enqueue_strategy.py
function main (line 6) | async def main() -> None:
FILE: docs/introduction/code_examples/03_finding_new_links.py
function main (line 6) | async def main() -> None:
FILE: docs/introduction/code_examples/03_globs.py
function main (line 7) | async def main() -> None:
FILE: docs/introduction/code_examples/03_original_code.py
function main (line 6) | async def main() -> None:
FILE: docs/introduction/code_examples/03_transform_request.py
function transform_request (line 9) | def transform_request(
function main (line 31) | async def main() -> None:
FILE: docs/introduction/code_examples/04_sanity_check.py
function main (line 7) | async def main() -> None:
FILE: docs/introduction/code_examples/05_crawling_detail.py
function main (line 6) | async def main() -> None:
FILE: docs/introduction/code_examples/05_crawling_listing.py
function main (line 6) | async def main() -> None:
FILE: docs/introduction/code_examples/06_scraping.py
function main (line 6) | async def main() -> None:
FILE: docs/introduction/code_examples/07_final_code.py
function main (line 6) | async def main() -> None:
FILE: docs/introduction/code_examples/07_first_code.py
function main (line 9) | async def main() -> None:
FILE: docs/introduction/code_examples/08_main.py
function main (line 8) | async def main() -> None:
FILE: docs/introduction/code_examples/08_routes.py
function default_handler (line 8) | async def default_handler(context: PlaywrightCrawlingContext) -> None:
function category_handler (line 21) | async def category_handler(context: PlaywrightCrawlingContext) -> None:
function detail_handler (line 42) | async def detail_handler(context: PlaywrightCrawlingContext) -> None:
FILE: docs/introduction/code_examples/09_apify_sdk.py
function main (line 11) | async def main() -> None:
FILE: docs/quick-start/code_examples/beautifulsoup_crawler_example.py
function main (line 6) | async def main() -> None:
FILE: docs/quick-start/code_examples/parsel_crawler_example.py
function main (line 6) | async def main() -> None:
FILE: docs/quick-start/code_examples/playwright_crawler_example.py
function main (line 6) | async def main() -> None:
FILE: docs/quick-start/code_examples/playwright_crawler_headful_example.py
function main (line 6) | async def main() -> None:
FILE: src/crawlee/_autoscaling/_types.py
class LoadRatioInfo (line 17) | class LoadRatioInfo:
method is_overloaded (line 28) | def is_overloaded(self) -> bool:
class SystemInfo (line 34) | class SystemInfo:
method is_system_idle (line 53) | def is_system_idle(self) -> bool:
method __str__ (line 62) | def __str__(self) -> str:
class CpuSnapshot (line 74) | class CpuSnapshot:
method is_overloaded (line 87) | def is_overloaded(self) -> bool:
class MemorySnapshot (line 93) | class MemorySnapshot:
method is_overloaded (line 115) | def is_overloaded(self) -> bool:
class EventLoopSnapshot (line 126) | class EventLoopSnapshot:
method max_delay_exceeded (line 139) | def max_delay_exceeded(self) -> timedelta:
method is_overloaded (line 144) | def is_overloaded(self) -> bool:
class ClientSnapshot (line 150) | class ClientSnapshot:
method is_overloaded (line 166) | def is_overloaded(self) -> bool:
class Ratio (line 175) | class Ratio:
FILE: src/crawlee/_autoscaling/autoscaled_pool.py
class AbortError (line 24) | class AbortError(Exception):
class _AutoscaledPoolRun (line 28) | class _AutoscaledPoolRun:
method __init__ (line 29) | def __init__(self) -> None:
class AutoscaledPool (line 39) | class AutoscaledPool:
method __init__ (line 64) | def __init__(
method run (line 105) | async def run(self) -> None:
method abort (line 155) | async def abort(self) -> None:
method pause (line 163) | def pause(self) -> None:
method resume (line 167) | def resume(self) -> None:
method desired_concurrency (line 172) | def desired_concurrency(self) -> int:
method current_concurrency (line 177) | def current_concurrency(self) -> int:
method _autoscale (line 184) | def _autoscale(self) -> None:
method _log_system_status (line 204) | def _log_system_status(self) -> None:
method _worker_task_orchestrator (line 213) | async def _worker_task_orchestrator(self, run: _AutoscaledPoolRun) -> ...
method _reap_worker_task (line 262) | def _reap_worker_task(self, task: asyncio.Task, run: _AutoscaledPoolRu...
method _worker_task (line 275) | async def _worker_task(self) -> None:
FILE: src/crawlee/_autoscaling/snapshotter.py
function _warn_once (line 31) | def _warn_once(warning_message: str) -> None:
class SortedSnapshotList (line 36) | class SortedSnapshotList(list[T]):
method add (line 39) | def add(self, item: T) -> None:
class Snapshotter (line 45) | class Snapshotter:
method __init__ (line 72) | def __init__(
method from_config (line 119) | def from_config(cls, config: Configuration | None = None) -> Snapshotter:
method _get_sorted_list_by_created_at (line 145) | def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedSnaps...
method active (line 155) | def active(self) -> bool:
method __aenter__ (line 159) | async def __aenter__(self) -> Snapshotter:
method __aexit__ (line 176) | async def __aexit__(
method get_memory_sample (line 201) | def get_memory_sample(self, duration: timedelta | None = None) -> list...
method get_event_loop_sample (line 214) | def get_event_loop_sample(self, duration: timedelta | None = None) -> ...
method get_cpu_sample (line 227) | def get_cpu_sample(self, duration: timedelta | None = None) -> list[Sn...
method get_client_sample (line 240) | def get_client_sample(self, duration: timedelta | None = None) -> list...
method _get_sample (line 253) | def _get_sample(snapshots: list[Snapshot], duration: timedelta | None ...
method _snapshot_cpu (line 264) | async def _snapshot_cpu(self, event_data: EventSystemInfoData) -> None:
method _snapshot_memory (line 285) | async def _snapshot_memory(self, event_data: EventSystemInfoData) -> N...
method _snapshot_event_loop (line 346) | async def _snapshot_event_loop(self) -> None:
method _snapshot_client (line 367) | async def _snapshot_client(self) -> None:
method _prune_snapshots (line 391) | def _prune_snapshots(self, snapshots: list[Snapshot], now: datetime) -...
method _evaluate_memory_load (line 416) | def _evaluate_memory_load(
FILE: src/crawlee/_autoscaling/system_status.py
class SystemStatus (line 21) | class SystemStatus:
method __init__ (line 39) | def __init__(
method get_current_system_info (line 71) | def get_current_system_info(self) -> SystemInfo:
method get_historical_system_info (line 82) | def get_historical_system_info(self) -> SystemInfo:
method _get_system_info (line 93) | def _get_system_info(self, *, sample_duration: timedelta | None = None...
method _is_cpu_overloaded (line 115) | def _is_cpu_overloaded(self, sample_duration: timedelta | None = None)...
method _is_memory_overloaded (line 128) | def _is_memory_overloaded(self, sample_duration: timedelta | None = No...
method _is_event_loop_overloaded (line 141) | def _is_event_loop_overloaded(self, sample_duration: timedelta | None ...
method _is_client_overloaded (line 154) | def _is_client_overloaded(self, sample_duration: timedelta | None = No...
method _is_sample_overloaded (line 167) | def _is_sample_overloaded(self, sample: list[Snapshot], threshold: flo...
FILE: src/crawlee/_cli.py
function callback (line 39) | def callback(
function _prompt_for_project_name (line 56) | def _prompt_for_project_name(initial_project_name: str | None) -> str:
function _prompt_text (line 84) | def _prompt_text(message: str, default: str) -> str:
function _prompt_choice (line 98) | def _prompt_choice(message: str, choices: list[str]) -> str:
function _prompt_bool (line 112) | def _prompt_bool(message: str, *, default: bool) -> bool:
function create (line 126) | def create(
FILE: src/crawlee/_log_config.py
function string_to_log_level (line 41) | def string_to_log_level(level: LogLevel) -> int:
function get_configured_log_level (line 57) | def get_configured_log_level() -> int:
function configure_logger (line 69) | def configure_logger(logger: logging.Logger, *, remove_old_handlers: boo...
class CrawleeLogFormatter (line 84) | class CrawleeLogFormatter(logging.Formatter):
method __init__ (line 100) | def __init__(
method _get_extra_fields (line 116) | def _get_extra_fields(self, record: logging.LogRecord) -> dict[str, Any]:
method format (line 124) | def format(self, record: logging.LogRecord) -> str:
FILE: src/crawlee/_request.py
class RequestState (line 21) | class RequestState(IntEnum):
class CrawleeRequestData (line 34) | class CrawleeRequestData(BaseModel):
class UserData (line 65) | class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
method __getitem__ (line 81) | def __getitem__(self, key: str) -> JsonSerializable:
method __setitem__ (line 84) | def __setitem__(self, key: str, value: JsonSerializable) -> None:
method __delitem__ (line 93) | def __delitem__(self, key: str) -> None:
method __iter__ (line 96) | def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-over...
method __len__ (line 99) | def __len__(self) -> int:
method __eq__ (line 102) | def __eq__(self, other: object) -> bool:
method __hash__ (line 111) | def __hash__(self) -> int:
class RequestOptions (line 121) | class RequestOptions(TypedDict):
class Request (line 145) | class Request(BaseModel):
method from_url (line 243) | def from_url(
method get_query_param_from_url (line 344) | def get_query_param_from_url(self, param: str, *, default: str | None ...
method label (line 350) | def label(self) -> str | None:
method session_id (line 355) | def session_id(self) -> str | None:
method crawlee_data (line 360) | def crawlee_data(self) -> CrawleeRequestData:
method crawl_depth (line 369) | def crawl_depth(self) -> int:
method crawl_depth (line 374) | def crawl_depth(self, new_value: int) -> None:
method state (line 378) | def state(self) -> RequestState:
method state (line 383) | def state(self, new_state: RequestState) -> None:
method max_retries (line 387) | def max_retries(self) -> int | None:
method session_rotation_count (line 392) | def session_rotation_count(self) -> int | None:
method session_rotation_count (line 397) | def session_rotation_count(self, new_session_rotation_count: int) -> N...
method enqueue_strategy (line 401) | def enqueue_strategy(self) -> EnqueueStrategy:
method enqueue_strategy (line 406) | def enqueue_strategy(self, new_enqueue_strategy: EnqueueStrategy) -> N...
method last_proxy_tier (line 410) | def last_proxy_tier(self) -> int | None:
method last_proxy_tier (line 415) | def last_proxy_tier(self, new_value: int) -> None:
method forefront (line 419) | def forefront(self) -> bool:
method forefront (line 424) | def forefront(self, new_value: bool) -> None:
method was_already_handled (line 428) | def was_already_handled(self) -> bool:
class RequestWithLock (line 433) | class RequestWithLock(Request):
FILE: src/crawlee/_service_locator.py
class ServiceLocator (line 20) | class ServiceLocator:
method __init__ (line 28) | def __init__(
method get_configuration (line 38) | def get_configuration(self) -> Configuration:
method set_configuration (line 46) | def set_configuration(self, configuration: Configuration) -> None:
method get_event_manager (line 63) | def get_event_manager(self) -> EventManager:
method set_event_manager (line 76) | def set_event_manager(self, event_manager: EventManager) -> None:
method get_storage_client (line 93) | def get_storage_client(self) -> StorageClient:
method set_storage_client (line 106) | def set_storage_client(self, storage_client: StorageClient) -> None:
method storage_instance_manager (line 124) | def storage_instance_manager(self) -> StorageInstanceManager:
FILE: src/crawlee/_types.py
function _normalize_headers (line 52) | def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
class HttpHeaders (line 60) | class HttpHeaders(RootModel, Mapping[str, str]):
method __getitem__ (line 75) | def __getitem__(self, key: str) -> str:
method __setitem__ (line 78) | def __setitem__(self, key: str, value: str) -> None:
method __delitem__ (line 81) | def __delitem__(self, key: str) -> None:
method __or__ (line 84) | def __or__(self, other: HttpHeaders) -> HttpHeaders:
method __ror__ (line 89) | def __ror__(self, other: HttpHeaders) -> HttpHeaders:
method __iter__ (line 94) | def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-over...
method __len__ (line 97) | def __len__(self) -> int:
class ConcurrencySettings (line 102) | class ConcurrencySettings:
method __init__ (line 105) | def __init__(
class EnqueueLinksKwargs (line 144) | class EnqueueLinksKwargs(TypedDict):
class AddRequestsKwargs (line 175) | class AddRequestsKwargs(EnqueueLinksKwargs):
class PushDataKwargs (line 193) | class PushDataKwargs(TypedDict):
class PushDataFunctionCall (line 197) | class PushDataFunctionCall(PushDataKwargs):
class KeyValueStoreInterface (line 204) | class KeyValueStoreInterface(Protocol):
method get_value (line 208) | async def get_value(self, key: str) -> Any: ...
method get_value (line 211) | async def get_value(self, key: str, default_value: T) -> T: ...
method get_value (line 214) | async def get_value(self, key: str, default_value: T | None = None) ->...
method get_value (line 216) | async def get_value(self, key: str, default_value: T | None = None) ->...
method set_value (line 218) | async def set_value(
class KeyValueStoreValue (line 227) | class KeyValueStoreValue:
class KeyValueStoreChangeRecords (line 232) | class KeyValueStoreChangeRecords:
method __init__ (line 233) | def __init__(self, actual_key_value_store: KeyValueStore) -> None:
method set_value (line 237) | async def set_value(
method get_value (line 246) | async def get_value(self, key: str) -> Any: ...
method get_value (line 249) | async def get_value(self, key: str, default_value: T) -> T: ...
method get_value (line 252) | async def get_value(self, key: str, default_value: T | None = None) ->...
method get_value (line 254) | async def get_value(self, key: str, default_value: T | None = None) ->...
class RequestHandlerRunResult (line 261) | class RequestHandlerRunResult:
method __init__ (line 264) | def __init__(
method request (line 279) | def request(self) -> Request:
method add_requests (line 282) | async def add_requests(
method push_data (line 298) | async def push_data(
method get_key_value_store (line 317) | async def get_key_value_store(
method apply_request_changes (line 331) | def apply_request_changes(self, target: Request) -> None:
class AddRequestsFunction (line 341) | class AddRequestsFunction(Protocol):
method __call__ (line 348) | def __call__(
class EnqueueLinksFunction (line 371) | class EnqueueLinksFunction(Protocol):
method __call__ (line 386) | def __call__(
method __call__ (line 401) | def __call__(
method __call__ (line 411) | def __call__(
class ExtractLinksFunction (line 452) | class ExtractLinksFunction(Protocol):
method __call__ (line 459) | def __call__(
class GetKeyValueStoreFunction (line 489) | class GetKeyValueStoreFunction(Protocol):
method __call__ (line 495) | def __call__(
class GetKeyValueStoreFromRequestHandlerFunction (line 511) | class GetKeyValueStoreFromRequestHandlerFunction(Protocol):
method __call__ (line 517) | def __call__(
class PushDataFunction (line 534) | class PushDataFunction(Protocol):
method __call__ (line 541) | def __call__(
class SendRequestFunction (line 561) | class SendRequestFunction(Protocol):
method __call__ (line 568) | def __call__(
class PageSnapshot (line 591) | class PageSnapshot:
method __bool__ (line 600) | def __bool__(self) -> bool:
class UseStateFunction (line 605) | class UseStateFunction(Protocol):
method __call__ (line 614) | def __call__(
class BasicCrawlingContext (line 630) | class BasicCrawlingContext:
method get_snapshot (line 664) | async def get_snapshot(self) -> PageSnapshot:
method __hash__ (line 668) | def __hash__(self) -> int:
method create_modified_copy (line 672) | def create_modified_copy(
class GetDataKwargs (line 691) | class GetDataKwargs(TypedDict):
class ExportToKwargs (line 728) | class ExportToKwargs(TypedDict):
class ExportDataJsonKwargs (line 750) | class ExportDataJsonKwargs(TypedDict):
class ExportDataCsvKwargs (line 786) | class ExportDataCsvKwargs(TypedDict):
FILE: src/crawlee/_utils/byte_size.py
class ByteSize (line 13) | class ByteSize:
method __post_init__ (line 18) | def __post_init__(self) -> None:
method validate (line 23) | def validate(cls, value: Any) -> ByteSize:
method from_kb (line 33) | def from_kb(cls, kb: float) -> ByteSize:
method from_mb (line 37) | def from_mb(cls, mb: float) -> ByteSize:
method from_gb (line 41) | def from_gb(cls, gb: float) -> ByteSize:
method from_tb (line 45) | def from_tb(cls, tb: float) -> ByteSize:
method to_kb (line 48) | def to_kb(self) -> float:
method to_mb (line 51) | def to_mb(self) -> float:
method to_gb (line 54) | def to_gb(self) -> float:
method to_tb (line 57) | def to_tb(self) -> float:
method __str__ (line 60) | def __str__(self) -> str:
method __eq__ (line 71) | def __eq__(self, other: object) -> bool:
method __hash__ (line 76) | def __hash__(self) -> int:
method __lt__ (line 80) | def __lt__(self, other: object) -> bool:
method __le__ (line 85) | def __le__(self, other: object) -> bool:
method __gt__ (line 90) | def __gt__(self, other: object) -> bool:
method __ge__ (line 95) | def __ge__(self, other: object) -> bool:
method __add__ (line 100) | def __add__(self, other: object) -> ByteSize:
method __sub__ (line 105) | def __sub__(self, other: object) -> ByteSize:
method __mul__ (line 113) | def __mul__(self, other: object) -> ByteSize:
method __truediv__ (line 119) | def __truediv__(self, other: object) -> float:
method __rmul__ (line 127) | def __rmul__(self, other: object) -> ByteSize:
FILE: src/crawlee/_utils/console.py
function make_table (line 11) | def make_table(rows: Sequence[Sequence[str]], width: int = 100) -> str:
FILE: src/crawlee/_utils/context.py
function ensure_context (line 11) | def ensure_context(method: T) -> T:
FILE: src/crawlee/_utils/crypto.py
function compute_short_hash (line 7) | def compute_short_hash(data: bytes, *, length: int = 8) -> str:
function crypto_random_object_id (line 21) | def crypto_random_object_id(length: int = 17) -> str:
FILE: src/crawlee/_utils/docs.py
function docs_group (line 31) | def docs_group(group_name: GroupName) -> Callable[[T], T]: # noqa: ARG001
FILE: src/crawlee/_utils/file.py
function _write_file (line 22) | def _write_file(path: Path, data: str | bytes) -> None:
function _write_file (line 36) | def _write_file(path: Path, data: str | bytes) -> None:
function infer_mime_type (line 63) | def infer_mime_type(value: Any) -> str:
function json_dumps (line 88) | async def json_dumps(obj: Any) -> str:
function atomic_write (line 101) | async def atomic_write(
function atomic_write (line 110) | async def atomic_write(
function atomic_write (line 118) | async def atomic_write(
function export_json_to_stream (line 152) | async def export_json_to_stream(
function export_csv_to_stream (line 161) | async def export_csv_to_stream(
FILE: src/crawlee/_utils/globs.py
class Glob (line 11) | class Glob:
method __init__ (line 14) | def __init__(self, glob: str) -> None:
function _translate (line 19) | def _translate(
function _fnmatch_translate (line 76) | def _fnmatch_translate(pat: str, star: str, question_mark: str) -> list[...
FILE: src/crawlee/_utils/models.py
function _timedelta_to_ms (line 15) | def _timedelta_to_ms(td: timedelta | None) -> float | None:
function _timedelta_to_secs (line 23) | def _timedelta_to_secs(td: timedelta | None) -> float | None:
function _timedelta_from_ms (line 34) | def _timedelta_from_ms(value: float | timedelta | Any | None, handler: C...
function _timedelta_from_secs (line 49) | def _timedelta_from_secs(
FILE: src/crawlee/_utils/raise_if_too_many_kwargs.py
function raise_if_too_many_kwargs (line 4) | def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
FILE: src/crawlee/_utils/recoverable_state.py
class RecoverableState (line 19) | class RecoverableState(Generic[TStateModel]):
method __init__ (line 34) | def __init__(
method initialize (line 93) | async def initialize(self) -> TStateModel:
method teardown (line 121) | async def teardown(self) -> None:
method current_value (line 139) | def current_value(self) -> TStateModel:
method is_initialized (line 147) | def is_initialized(self) -> bool:
method has_persisted_state (line 151) | async def has_persisted_state(self) -> bool:
method reset (line 161) | async def reset(self) -> None:
method persist_state (line 175) | async def persist_state(self, event_data: EventPersistStateData | None...
method _load_saved_state (line 200) | async def _load_saved_state(self) -> None:
FILE: src/crawlee/_utils/recurring_task.py
class RecurringTask (line 18) | class RecurringTask:
method __init__ (line 27) | def __init__(self, func: Callable, delay: timedelta) -> None:
method __aenter__ (line 37) | async def __aenter__(self) -> Self:
method __aexit__ (line 41) | async def __aexit__(
method _wrapper (line 49) | async def _wrapper(self) -> None:
method start (line 60) | def start(self) -> None:
method stop (line 68) | async def stop(self) -> None:
FILE: src/crawlee/_utils/requests.py
function normalize_url (line 16) | def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
function compute_unique_key (line 50) | def compute_unique_key(
function _get_payload_hash (line 113) | def _get_payload_hash(payload: HttpPayload | None) -> str:
function _get_headers_hash (line 118) | def _get_headers_hash(headers: HttpHeaders | None) -> str:
FILE: src/crawlee/_utils/robots.py
class RobotsTxtFile (line 22) | class RobotsTxtFile:
method __init__ (line 23) | def __init__(
method from_content (line 32) | async def from_content(cls, url: str, content: str) -> Self:
method find (line 43) | async def find(cls, url: str, http_client: HttpClient, proxy_info: Pro...
method load (line 55) | async def load(cls, url: str, http_client: HttpClient, proxy_info: Pro...
method is_allowed (line 80) | def is_allowed(self, url: str, user_agent: str = '*') -> bool:
method get_sitemaps (line 92) | def get_sitemaps(self) -> list[str]:
method get_crawl_delay (line 96) | def get_crawl_delay(self, user_agent: str = '*') -> int | None:
method parse_sitemaps (line 106) | async def parse_sitemaps(self) -> Sitemap:
method parse_urls_from_sitemaps (line 114) | async def parse_urls_from_sitemaps(self) -> list[str]:
FILE: src/crawlee/_utils/sitemap.py
class SitemapUrl (line 40) | class SitemapUrl:
class NestedSitemap (line 49) | class NestedSitemap:
class ParseSitemapOptions (line 54) | class ParseSitemapOptions(TypedDict, total=False):
class SitemapSource (line 61) | class SitemapSource(TypedDict):
class _SitemapItem (line 68) | class _SitemapItem(TypedDict, total=False):
class _XMLSaxSitemapHandler (line 77) | class _XMLSaxSitemapHandler(ContentHandler):
method __init__ (line 78) | def __init__(self) -> None:
method items (line 87) | def items(self) -> list[_SitemapItem]:
method startElement (line 91) | def startElement(self, name: str, attrs: AttributesImpl) -> None:
method characters (line 99) | def characters(self, content: str) -> None:
method endElement (line 104) | def endElement(self, name: str) -> None:
class _TxtSitemapParser (line 132) | class _TxtSitemapParser:
method __init__ (line 135) | def __init__(self) -> None:
method process_chunk (line 138) | async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapIt...
method flush (line 153) | async def flush(self) -> AsyncGenerator[_SitemapItem, None]:
method close (line 161) | def close(self) -> None:
class _XmlSitemapParser (line 166) | class _XmlSitemapParser:
method __init__ (line 169) | def __init__(self) -> None:
method process_chunk (line 174) | async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapIt...
method flush (line 188) | async def flush(self) -> AsyncGenerator[_SitemapItem, None]:
method close (line 201) | def close(self) -> None:
function _get_parser (line 207) | def _get_parser(content_type: str = '', url: str | None = None) -> _XmlS...
function _get_origin_url (line 215) | def _get_origin_url(source: SitemapSource) -> str:
function _process_sitemap_item (line 225) | async def _process_sitemap_item(
function _process_raw_source (line 268) | async def _process_raw_source(
function _fetch_and_process_sitemap (line 306) | async def _fetch_and_process_sitemap(
class Sitemap (line 383) | class Sitemap:
method __init__ (line 384) | def __init__(self, urls: list[str]) -> None:
method urls (line 388) | def urls(self) -> list[str]:
method try_common_names (line 392) | async def try_common_names(cls, url: str, http_client: HttpClient, pro...
method load (line 398) | async def load(
method from_xml_string (line 412) | async def from_xml_string(cls, content: str) -> Sitemap:
method parse (line 416) | async def parse(
function parse_sitemap (line 427) | async def parse_sitemap(
function _merge_async_generators (line 496) | async def _merge_async_generators(*generators: AsyncGenerator) -> AsyncG...
function _discover_for_hostname (line 526) | async def _discover_for_hostname(
function discover_valid_sitemaps (line 581) | async def discover_valid_sitemaps(
FILE: src/crawlee/_utils/system.py
function _get_used_memory (line 28) | def _get_used_memory(process: psutil.Process) -> int:
function _get_used_memory (line 32) | def _get_used_memory(process: psutil.Process) -> int:
class CpuInfo (line 36) | class CpuInfo(BaseModel):
class MemoryUsageInfo (line 59) | class MemoryUsageInfo(BaseModel):
class MemoryInfo (line 87) | class MemoryInfo(MemoryUsageInfo):
function get_cpu_info (line 106) | def get_cpu_info() -> CpuInfo:
function get_memory_info (line 117) | def get_memory_info() -> MemoryInfo:
FILE: src/crawlee/_utils/time.py
class TimerResult (line 20) | class TimerResult:
function measure_time (line 26) | def measure_time() -> Iterator[TimerResult]:
class SharedTimeout (line 41) | class SharedTimeout:
method __init__ (line 47) | def __init__(self, timeout: timedelta) -> None:
method __aenter__ (line 52) | async def __aenter__(self) -> timedelta:
method __aexit__ (line 61) | async def __aexit__(
function format_duration (line 78) | def format_duration(duration: timedelta | None) -> str:
FILE: src/crawlee/_utils/try_import.py
function try_import (line 10) | def try_import(module_name: str, *symbol_names: str) -> Iterator[None]:
function install_import_hook (line 22) | def install_import_hook(module_name: str) -> None:
class FailedImport (line 28) | class FailedImport:
class ImportWrapper (line 35) | class ImportWrapper(ModuleType):
method __getattribute__ (line 38) | def __getattribute__(self, name: str) -> Any:
FILE: src/crawlee/_utils/urls.py
function is_url_absolute (line 13) | def is_url_absolute(url: str) -> bool:
function convert_to_absolute_url (line 21) | def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
function to_absolute_url_iterator (line 26) | def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger:...
function validate_http_url (line 44) | def validate_http_url(value: str | None) -> str | None:
FILE: src/crawlee/_utils/wait.py
function wait_for (line 15) | async def wait_for(
function wait_for_all_tasks_for_finish (line 49) | async def wait_for_all_tasks_for_finish(
FILE: src/crawlee/_utils/web.py
function is_status_code_client_error (line 6) | def is_status_code_client_error(value: int) -> bool:
function is_status_code_server_error (line 11) | def is_status_code_server_error(value: int) -> bool:
function is_status_code_successful (line 16) | def is_status_code_successful(value: int) -> bool:
FILE: src/crawlee/browsers/_browser_controller.py
class BrowserController (line 21) | class BrowserController(ABC):
method pages (line 29) | def pages(self) -> list[Page]:
method total_opened_pages (line 34) | def total_opened_pages(self) -> int:
method pages_count (line 39) | def pages_count(self) -> int:
method last_page_opened_at (line 44) | def last_page_opened_at(self) -> datetime:
method idle_time (line 49) | def idle_time(self) -> timedelta:
method has_free_capacity (line 54) | def has_free_capacity(self) -> bool:
method is_browser_connected (line 59) | def is_browser_connected(self) -> bool:
method browser_type (line 64) | def browser_type(self) -> BrowserType:
method new_page (line 68) | async def new_page(
method close (line 89) | async def close(self, *, force: bool = False) -> None:
FILE: src/crawlee/browsers/_browser_plugin.py
class BrowserPlugin (line 19) | class BrowserPlugin(ABC):
method active (line 31) | def active(self) -> bool:
method browser_type (line 36) | def browser_type(self) -> BrowserType:
method browser_launch_options (line 41) | def browser_launch_options(self) -> Mapping[str, Any]:
method browser_new_context_options (line 51) | def browser_new_context_options(self) -> Mapping[str, Any]:
method max_open_pages_per_browser (line 61) | def max_open_pages_per_browser(self) -> int:
method __aenter__ (line 65) | async def __aenter__(self) -> BrowserPlugin:
method __aexit__ (line 73) | async def __aexit__(
method new_browser (line 86) | async def new_browser(self) -> BrowserController:
FILE: src/crawlee/browsers/_browser_pool.py
class BrowserPool (line 34) | class BrowserPool:
method __init__ (line 48) | def __init__(
method with_default_plugin (line 113) | def with_default_plugin(
method plugins (line 170) | def plugins(self) -> Sequence[BrowserPlugin]:
method active_browsers (line 175) | def active_browsers(self) -> Sequence[BrowserController]:
method inactive_browsers (line 180) | def inactive_browsers(self) -> Sequence[BrowserController]:
method pages (line 185) | def pages(self) -> Mapping[str, CrawleePage]:
method total_pages_count (line 190) | def total_pages_count(self) -> int:
method active (line 195) | def active(self) -> bool:
method __aenter__ (line 199) | async def __aenter__(self) -> BrowserPool:
method __aexit__ (line 223) | async def __aexit__(
method new_page (line 251) | async def new_page(
method new_page_with_each_plugin (line 281) | async def new_page_with_each_plugin(self) -> Sequence[CrawleePage]:
method _get_new_page (line 294) | async def _get_new_page(
method _pick_browser_with_free_capacity (line 342) | def _pick_browser_with_free_capacity(
method _retire_browser (line 353) | def _retire_browser(self, browser: BrowserController) -> None:
method _launch_new_browser (line 359) | async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserC...
method _identify_inactive_browsers (line 365) | def _identify_inactive_browsers(self) -> None:
method _close_inactive_browsers (line 372) | async def _close_inactive_browsers(self) -> None:
method _execute_hooks (line 379) | async def _execute_hooks(self, hooks: list[Callable[..., Awaitable[Non...
method _override_page_close (line 384) | def _override_page_close(self, crawlee_page: CrawleePage, browser_cont...
method pre_page_create_hook (line 398) | def pre_page_create_hook(
method post_page_create_hook (line 413) | def post_page_create_hook(
method pre_page_close_hook (line 425) | def pre_page_close_hook(
method post_page_close_hook (line 437) | def post_page_close_hook(
FILE: src/crawlee/browsers/_playwright_browser.py
class PlaywrightPersistentBrowser (line 22) | class PlaywrightPersistentBrowser(Browser):
method __init__ (line 32) | def __init__(
method browser_type (line 47) | def browser_type(self) -> BrowserType:
method contexts (line 51) | def contexts(self) -> list[BrowserContext]:
method is_connected (line 54) | def is_connected(self) -> bool:
method new_context (line 57) | async def new_context(self, **context_options: Any) -> BrowserContext:
method _delete_temp_dir (line 79) | async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
method close (line 85) | async def close(self, **kwargs: Any) -> None:
method version (line 96) | def version(self) -> str:
method new_page (line 99) | async def new_page(self, **kwargs: Any) -> Page:
method new_browser_cdp_session (line 103) | async def new_browser_cdp_session(self) -> CDPSession:
method start_tracing (line 106) | async def start_tracing(self, **kwargs: Any) -> None:
method stop_tracing (line 109) | async def stop_tracing(self, **kwargs: Any) -> bytes:
FILE: src/crawlee/browsers/_playwright_browser_controller.py
class PlaywrightBrowserController (line 32) | class PlaywrightBrowserController(BrowserController):
method __init__ (line 42) | def __init__(
method _get_context_creation_lock (line 86) | async def _get_context_creation_lock(self) -> Lock:
method pages (line 99) | def pages(self) -> list[Page]:
method total_opened_pages (line 104) | def total_opened_pages(self) -> int:
method pages_count (line 109) | def pages_count(self) -> int:
method last_page_opened_at (line 114) | def last_page_opened_at(self) -> datetime:
method idle_time (line 119) | def idle_time(self) -> timedelta:
method has_free_capacity (line 124) | def has_free_capacity(self) -> bool:
method is_browser_connected (line 129) | def is_browser_connected(self) -> bool:
method browser_type (line 134) | def browser_type(self) -> BrowserType:
method new_page (line 138) | async def new_page(
method close (line 192) | async def close(self, *, force: bool = False) -> None:
method _on_page_close (line 208) | def _on_page_close(self, page: Page) -> None:
method _create_browser_context (line 212) | async def _create_browser_context(
FILE: src/crawlee/browsers/_playwright_browser_plugin.py
class PlaywrightBrowserPlugin (line 32) | class PlaywrightBrowserPlugin(BrowserPlugin):
method __init__ (line 44) | def __init__(
method active (line 117) | def active(self) -> bool:
method browser_type (line 122) | def browser_type(self) -> BrowserType:
method browser_launch_options (line 127) | def browser_launch_options(self) -> Mapping[str, Any]:
method browser_new_context_options (line 138) | def browser_new_context_options(self) -> Mapping[str, Any]:
method max_open_pages_per_browser (line 149) | def max_open_pages_per_browser(self) -> int:
method __aenter__ (line 153) | async def __aenter__(self) -> PlaywrightBrowserPlugin:
method __aexit__ (line 162) | async def __aexit__(
method new_browser (line 177) | async def new_browser(self) -> PlaywrightBrowserController:
FILE: src/crawlee/browsers/_types.py
class CrawleePage (line 13) | class CrawleePage:
FILE: src/crawlee/configuration.py
class Configuration (line 20) | class Configuration(BaseSettings):
method get_global_configuration (line 216) | def get_global_configuration(cls) -> Self:
FILE: src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
class HttpCrawlerOptions (line 37) | class HttpCrawlerOptions(
class AbstractHttpCrawler (line 51) | class AbstractHttpCrawler(
method __init__ (line 70) | def __init__(
method create_parsed_http_crawler_class (line 93) | def create_parsed_http_crawler_class(
method _create_static_content_crawler_pipeline (line 118) | def _create_static_content_crawler_pipeline(self) -> ContextPipeline[P...
method _execute_pre_navigation_hooks (line 130) | async def _execute_pre_navigation_hooks(
method _execute_post_navigation_hooks (line 145) | async def _execute_post_navigation_hooks(
method _parse_http_response (line 153) | async def _parse_http_response(
method _create_extract_links_function (line 173) | def _create_extract_links_function(
method _make_http_request (line 256) | async def _make_http_request(self, context: BasicCrawlingContext) -> A...
method _handle_status_code_response (line 277) | async def _handle_status_code_response(
method _handle_blocked_request_by_content (line 299) | async def _handle_blocked_request_by_content(
method pre_navigation_hook (line 317) | def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], A...
method post_navigation_hook (line 325) | def post_navigation_hook(self, hook: Callable[[HttpCrawlingContext], A...
FILE: src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py
class AbstractHttpParser (line 19) | class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
method parse (line 23) | async def parse(self, response: HttpResponse) -> TParseResult:
method parse_text (line 34) | async def parse_text(self, text: str) -> TParseResult:
method select (line 45) | async def select(self, parsed_content: TParseResult, selector: str) ->...
method is_blocked (line 56) | def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo:
method is_matching_selector (line 84) | def is_matching_selector(self, parsed_content: TParseResult, selector:...
method find_links (line 96) | def find_links(self, parsed_content: TParseResult, selector: str, attr...
FILE: src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
class HttpCrawlingContext (line 18) | class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):
method from_basic_crawling_context (line 22) | def from_basic_crawling_context(cls, context: BasicCrawlingContext, ht...
method get_snapshot (line 27) | async def get_snapshot(self) -> PageSnapshot:
class ParsedHttpCrawlingContext (line 34) | class ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResul...
method from_http_crawling_context (line 45) | def from_http_crawling_context(
FILE: src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
class _NonPersistentStatistics (line 57) | class _NonPersistentStatistics(Statistics):
method __init__ (line 63) | def __init__(self) -> None:
method __aenter__ (line 66) | async def __aenter__(self) -> Self:
method __aexit__ (line 71) | async def __aexit__(
class AdaptivePlaywrightCrawler (line 81) | class AdaptivePlaywrightCrawler(
method __init__ (line 119) | def __init__(
method with_beautifulsoup_static_parser (line 235) | def with_beautifulsoup_static_parser(
method with_parsel_static_parser (line 261) | def with_parsel_static_parser(
method _crawl_one (line 285) | async def _crawl_one(
method _pipeline_call_factory (line 340) | def _pipeline_call_factory(
method _run_request_handler (line 371) | async def _run_request_handler(self, context: BasicCrawlingContext) ->...
method pre_navigation_hook (line 437) | def pre_navigation_hook(
method post_navigation_hook (line 462) | def post_navigation_hook(
method track_http_only_request_handler_runs (line 488) | def track_http_only_request_handler_runs(self) -> None:
method track_browser_request_handler_runs (line 491) | def track_browser_request_handler_runs(self) -> None:
method track_rendering_type_mispredictions (line 494) | def track_rendering_type_mispredictions(self) -> None:
class SubCrawlerRun (line 499) | class SubCrawlerRun:
FILE: src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py
class AdaptivePlaywrightCrawlerStatisticState (line 12) | class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
FILE: src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
class AdaptiveContextError (line 29) | class AdaptiveContextError(RuntimeError):
class AdaptivePlaywrightCrawlingContext (line 35) | class AdaptivePlaywrightCrawlingContext(
method page (line 50) | def page(self) -> Page:
method infinite_scroll (line 60) | def infinite_scroll(self) -> Callable[[], Awaitable[None]]:
method response (line 71) | def response(self) -> Response:
method wait_for_selector (line 80) | async def wait_for_selector(self, selector: str, timeout: timedelta = ...
method query_selector_one (line 93) | async def query_selector_one(
method query_selector_all (line 111) | async def query_selector_all(
method parse_with_static_parser (line 149) | async def parse_with_static_parser(
method from_parsed_http_crawling_context (line 170) | def from_parsed_http_crawling_context(
method from_playwright_crawling_context (line 179) | async def from_playwright_crawling_context(
class AdaptivePlaywrightPreNavCrawlingContext (line 208) | class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
method page (line 222) | def page(self) -> Page:
method from_pre_navigation_context (line 236) | def from_pre_navigation_context(cls, context: BasicCrawlingContext) ->...
class AdaptivePlaywrightPostNavCrawlingContext (line 254) | class AdaptivePlaywrightPostNavCrawlingContext(HttpCrawlingContext):
method page (line 264) | def page(self) -> Page:
method response (line 274) | def response(self) -> Response:
method from_post_navigation_context (line 284) | async def from_post_navigation_context(
FILE: src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py
class RenderingTypePredictorState (line 34) | class RenderingTypePredictorState(BaseModel):
class RenderingTypePrediction (line 49) | class RenderingTypePrediction:
class RenderingTypePredictor (line 62) | class RenderingTypePredictor(ABC):
method __init__ (line 65) | def __init__(self) -> None:
method predict (line 71) | def predict(self, request: Request) -> RenderingTypePrediction:
method store_result (line 79) | def store_result(self, request: Request, rendering_type: RenderingType...
method initialize (line 87) | async def initialize(self) -> None:
method clear (line 93) | async def clear(self) -> None:
method __aenter__ (line 99) | async def __aenter__(self) -> RenderingTypePredictor:
method __aexit__ (line 104) | async def __aexit__(
class DefaultRenderingTypePredictor (line 115) | class DefaultRenderingTypePredictor(RenderingTypePredictor):
method __init__ (line 122) | def __init__(
method initialize (line 161) | async def initialize(self) -> None:
method clear (line 169) | async def clear(self) -> None:
method predict (line 177) | def predict(self, request: Request) -> RenderingTypePrediction:
method store_result (line 209) | def store_result(self, request: Request, rendering_type: RenderingType...
method _retrain (line 222) | def _retrain(self) -> None:
method _calculate_mean_similarity (line 235) | def _calculate_mean_similarity(self, url: UrlComponents, label: str, r...
method _calculate_feature_vector (line 243) | def _calculate_feature_vector(self, url: UrlComponents, label: str) ->...
function get_url_components (line 250) | def get_url_components(url: str) -> UrlComponents:
function calculate_url_similarity (line 258) | def calculate_url_similarity(url_1: UrlComponents, url_2: UrlComponents)...
FILE: src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py
function create_default_comparator (line 11) | def create_default_comparator(
function full_result_comparator (line 22) | def full_result_comparator(result_1: RequestHandlerRunResult, result_2: ...
function push_data_only_comparator (line 37) | def push_data_only_comparator(result_1: RequestHandlerRunResult, result_...
FILE: src/crawlee/crawlers/_adaptive_playwright/_utils.py
function sklearn_model_validator (line 7) | def sklearn_model_validator(v: LogisticRegression | dict[str, Any]) -> L...
function sklearn_model_serializer (line 21) | def sklearn_model_serializer(model: LogisticRegression) -> dict[str, Any]:
FILE: src/crawlee/crawlers/_basic/_basic_crawler.py
class _BasicCrawlerOptions (line 113) | class _BasicCrawlerOptions(TypedDict):
class _BasicCrawlerOptionsGeneric (line 221) | class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, T...
class BasicCrawlerOptions (line 235) | class BasicCrawlerOptions(
class BasicCrawler (line 247) | class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
method __init__ (line 275) | def __init__(
method log (line 511) | def log(self) -> logging.Logger:
method router (line 516) | def router(self) -> Router[TCrawlingContext]:
method router (line 524) | def router(self, router: Router[TCrawlingContext]) -> None:
method statistics (line 531) | def statistics(self) -> Statistics[TStatisticsState]:
method stop (line 535) | def stop(self, reason: str = 'Stop was called externally.') -> None:
method _wrap_handler_with_error_context (line 546) | def _wrap_handler_with_error_context(
method _stop_if_max_requests_count_exceeded (line 564) | def _stop_if_max_requests_count_exceeded(self) -> None:
method _get_session (line 574) | async def _get_session(self) -> Session | None:
method _get_session_by_id (line 588) | async def _get_session_by_id(self, session_id: str | None) -> Session ...
method _get_proxy_info (line 602) | async def _get_proxy_info(self, request: Request, session: Session | N...
method get_request_manager (line 613) | async def get_request_manager(self) -> RequestManager:
method get_dataset (line 623) | async def get_dataset(
method get_key_value_store (line 639) | async def get_key_value_store(
method error_handler (line 655) | def error_handler(
method failed_request_handler (line 665) | def failed_request_handler(
method on_skipped_request (line 675) | def on_skipped_request(self, callback: SkippedRequestCallback) -> Skip...
method run (line 683) | async def run(
method _run_crawler (line 770) | async def _run_crawler(self) -> None:
method add_requests (line 795) | async def add_requests(
method use_state (line 843) | async def use_state(
method _save_crawler_state (line 850) | async def _save_crawler_state(self) -> None:
method get_data (line 854) | async def get_data(
method export_data (line 884) | async def export_data(
method _push_data (line 928) | async def _push_data(
method _should_retry_request (line 951) | def _should_retry_request(self, context: BasicCrawlingContext, error: ...
method _check_url_after_redirects (line 968) | async def _check_url_after_redirects(self, context: TCrawlingContext) ...
method _create_enqueue_links_function (line 984) | def _create_enqueue_links_function(
method _enqueue_links_filter_iterator (line 1042) | def _enqueue_links_filter_iterator(
method _check_enqueue_strategy (line 1080) | def _check_enqueue_strategy(
method _check_url_patterns (line 1115) | def _check_url_patterns(
method _handle_request_retries (line 1145) | async def _handle_request_retries(
method _handle_request_error (line 1184) | async def _handle_request_error(self, context: TCrawlingContext | Basi...
method _handle_failed_request (line 1210) | async def _handle_failed_request(self, context: TCrawlingContext | Bas...
method _handle_skipped_request (line 1223) | async def _handle_skipped_request(
method _get_message_from_error (line 1238) | def _get_message_from_error(self, error: Exception) -> str:
method _get_only_inner_most_exception (line 1256) | def _get_only_inner_most_exception(self, error: BaseException) -> Base...
method _prepare_send_request_function (line 1265) | def _prepare_send_request_function(
method _convert_url_to_request_iterator (line 1288) | def _convert_url_to_request_iterator(self, urls: Sequence[str | Reques...
method _add_requests (line 1301) | async def _add_requests(
method _commit_request_handler_result (line 1335) | async def _commit_request_handler_result(self, context: BasicCrawlingC...
method _commit_key_value_store_changes (line 1350) | async def _commit_key_value_store_changes(
method __is_finished_function (line 1359) | async def __is_finished_function(self) -> bool:
method __is_task_ready_function (line 1375) | async def __is_task_ready_function(self) -> bool:
method __run_task_function (line 1387) | async def __run_task_function(self) -> None:
method _run_request_handler (line 1512) | async def _run_request_handler(self, context: BasicCrawlingContext) ->...
method _raise_for_error_status_code (line 1525) | def _raise_for_error_status_code(self, status_code: int) -> None:
method _raise_for_session_blocked_status_code (line 1547) | def _raise_for_session_blocked_status_code(self, session: Session | No...
method _check_request_collision (line 1563) | def _check_request_collision(self, request: Request, session: Session ...
method _is_allowed_based_on_robots_txt_file (line 1578) | async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool:
method _get_robots_txt_file_for_url (line 1589) | async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFil...
method _find_txt_file_for_url (line 1613) | async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
method _log_status_message (line 1621) | def _log_status_message(self, message: str, level: LogLevel = 'DEBUG')...
method _crawler_state_task (line 1631) | async def _crawler_state_task(self) -> None:
method _mark_request_as_handled (line 1671) | async def _mark_request_as_handled(self, request: Request) -> None:
FILE: src/crawlee/crawlers/_basic/_context_pipeline.py
class _Middleware (line 24) | class _Middleware(Generic[TMiddlewareCrawlingContext, TCrawlingContext]):
method __init__ (line 27) | def __init__(
method action (line 39) | async def action(self) -> TMiddlewareCrawlingContext:
method cleanup (line 43) | async def cleanup(self, final_consumer_exception: Exception | None) ->...
class ContextPipeline (line 57) | class ContextPipeline(Generic[TCrawlingContext]):
method __init__ (line 63) | def __init__(
method _middleware_chain (line 76) | def _middleware_chain(self) -> Generator[ContextPipeline[Any], None, N...
method __call__ (line 82) | async def __call__(
method compose (line 125) | def compose(
FILE: src/crawlee/crawlers/_basic/_context_utils.py
function swapped_context (line 15) | def swapped_context(
FILE: src/crawlee/crawlers/_basic/_logging_utils.py
function _get_only_innermost_exception (line 8) | def _get_only_innermost_exception(error: BaseException) -> BaseException:
function _get_filtered_traceback_parts_for_asyncio_timeout_error (line 28) | def _get_filtered_traceback_parts_for_asyncio_timeout_error(traceback_pa...
function _strip_pep657_highlighting (line 42) | def _strip_pep657_highlighting(traceback_part: str) -> str:
function reduce_asyncio_timeout_error_to_relevant_traceback_parts (line 48) | def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
function _get_traceback_parts_for_innermost_exception (line 55) | def _get_traceback_parts_for_innermost_exception(error: Exception) -> li...
function get_one_line_error_summary_if_possible (line 62) | def get_one_line_error_summary_if_possible(error: Exception) -> str:
FILE: src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
class BeautifulSoupCrawler (line 22) | class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingCont...
method __init__ (line 57) | def __init__(
FILE: src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py
class BeautifulSoupCrawlingContext (line 14) | class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSo...
method soup (line 21) | def soup(self) -> BeautifulSoup:
method from_parsed_http_crawling_context (line 26) | def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawling...
method html_to_text (line 30) | def html_to_text(self) -> str:
FILE: src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
class BeautifulSoupParser (line 18) | class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup, Tag]):
method __init__ (line 21) | def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:
method parse (line 25) | async def parse(self, response: HttpResponse) -> BeautifulSoup:
method parse_text (line 29) | async def parse_text(self, text: str) -> BeautifulSoup:
method is_matching_selector (line 33) | def is_matching_selector(self, parsed_content: Tag, selector: str) -> ...
method select (line 37) | async def select(self, parsed_content: Tag, selector: str) -> Sequence...
method find_links (line 41) | def find_links(self, parsed_content: Tag, selector: str, attribute: st...
FILE: src/crawlee/crawlers/_beautifulsoup/_utils.py
function html_to_text (line 20) | def html_to_text(source: str | Tag) -> str:
FILE: src/crawlee/crawlers/_http/_http_crawler.py
class HttpCrawler (line 17) | class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], ...
method __init__ (line 49) | def __init__(
FILE: src/crawlee/crawlers/_http/_http_parser.py
class NoParser (line 18) | class NoParser(AbstractHttpParser[bytes, bytes]):
method parse (line 26) | async def parse(self, response: HttpResponse) -> bytes:
method parse_text (line 30) | async def parse_text(self, text: str) -> bytes:
method select (line 34) | async def select(self, parsed_content: bytes, selector: str) -> Sequen...
method is_blocked (line 38) | def is_blocked(self, parsed_content: bytes) -> BlockedInfo: # Intenti...
method is_matching_selector (line 42) | def is_matching_selector(self, parsed_content: bytes, selector: str) -...
method find_links (line 46) | def find_links(
FILE: src/crawlee/crawlers/_parsel/_parsel_crawler.py
class ParselCrawler (line 22) | class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector,...
method __init__ (line 57) | def __init__(
FILE: src/crawlee/crawlers/_parsel/_parsel_crawling_context.py
class ParselCrawlingContext (line 14) | class ParselCrawlingContext(ParsedHttpCrawlingContext[Selector]):
method selector (line 21) | def selector(self) -> Selector:
method from_parsed_http_crawling_context (line 26) | def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawling...
method html_to_text (line 30) | def html_to_text(self) -> str:
FILE: src/crawlee/crawlers/_parsel/_parsel_parser.py
class ParselParser (line 19) | class ParselParser(AbstractHttpParser[Selector, Selector]):
method parse (line 23) | async def parse(self, response: HttpResponse) -> Selector:
method parse_text (line 28) | async def parse_text(self, text: str) -> Selector:
method select (line 32) | async def select(self, parsed_content: Selector, selector: str) -> Seq...
method is_matching_selector (line 36) | def is_matching_selector(self, parsed_content: Selector, selector: str...
method find_links (line 40) | def find_links(self, parsed_content: Selector, selector: str, attribut...
FILE: src/crawlee/crawlers/_parsel/_utils.py
function html_to_text (line 16) | def html_to_text(source: str | Selector) -> str:
FILE: src/crawlee/crawlers/_playwright/_playwright_crawler.py
class PlaywrightCrawler (line 61) | class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, Statisti...
method __init__ (line 101) | def __init__(
method _open_page (line 219) | async def _open_page(
method _prepare_request_interceptor (line 260) | def _prepare_request_interceptor(
method _navigate (line 281) | async def _navigate(
method _create_extract_links_function (line 356) | def _create_extract_links_function(self, context: PlaywrightPreNavCraw...
method _handle_status_code_response (line 438) | async def _handle_status_code_response(
method _handle_blocked_request_by_content (line 460) | async def _handle_blocked_request_by_content(
method _execute_post_navigation_hooks (line 489) | async def _execute_post_navigation_hooks(
method _create_crawling_context (line 496) | async def _create_crawling_context(
method pre_navigation_hook (line 528) | def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawling...
method post_navigation_hook (line 536) | def post_navigation_hook(self, hook: Callable[[PlaywrightPostNavCrawli...
method _get_cookies (line 544) | async def _get_cookies(self, page: Page) -> list[PlaywrightCookieParam]:
method _update_cookies (line 549) | async def _update_cookies(self, page: Page, cookies: list[PlaywrightCo...
method _find_txt_file_for_url (line 553) | async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
class _PlaywrightCrawlerAdditionalOptions (line 564) | class _PlaywrightCrawlerAdditionalOptions(TypedDict):
class PlaywrightCrawlerOptions (line 596) | class PlaywrightCrawlerOptions(
FILE: src/crawlee/crawlers/_playwright/_playwright_crawling_context.py
class PlaywrightCrawlingContext (line 18) | class PlaywrightCrawlingContext(PlaywrightPostNavCrawlingContext):
FILE: src/crawlee/crawlers/_playwright/_playwright_http_client.py
function browser_page_context (line 30) | async def browser_page_context(page: Page) -> AsyncGenerator[None, None]:
class PlaywrightHttpClient (line 39) | class PlaywrightHttpClient(HttpClient):
method __init__ (line 50) | def __init__(self) -> None:
method crawl (line 55) | async def crawl(
method send_request (line 67) | async def send_request(
method stream (line 102) | def stream(
method cleanup (line 115) | async def cleanup(self) -> None:
FILE: src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py
class PlaywrightPostNavCrawlingContext (line 16) | class PlaywrightPostNavCrawlingContext(PlaywrightPreNavCrawlingContext):
FILE: src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py
class PlaywrightPreNavCrawlingContext (line 17) | class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
method get_snapshot (line 32) | async def get_snapshot(self) -> PageSnapshot:
FILE: src/crawlee/crawlers/_playwright/_types.py
class BlockRequestsFunction (line 19) | class BlockRequestsFunction(Protocol):
method __call__ (line 26) | async def __call__(
class PlaywrightHttpResponse (line 38) | class PlaywrightHttpResponse:
method read (line 46) | async def read(self) -> bytes:
method read_stream (line 49) | async def read_stream(self) -> AsyncGenerator[bytes, None]:
method from_playwright_response (line 55) | async def from_playwright_response(cls, response: Response | APIRespon...
class GotoOptions (line 68) | class GotoOptions(TypedDict):
FILE: src/crawlee/crawlers/_playwright/_utils.py
function infinite_scroll (line 25) | async def infinite_scroll(page: Page) -> None:
function block_requests (line 81) | async def block_requests(
FILE: src/crawlee/crawlers/_types.py
class BlockedInfo (line 7) | class BlockedInfo:
method __bool__ (line 12) | def __bool__(self) -> bool:
FILE: src/crawlee/errors.py
class UserDefinedErrorHandlerError (line 28) | class UserDefinedErrorHandlerError(Exception):
class UserHandlerTimeoutError (line 32) | class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
class SessionError (line 37) | class SessionError(Exception):
class ServiceConflictError (line 45) | class ServiceConflictError(Exception):
method __init__ (line 48) | def __init__(self, service: type, new_value: object, existing_value: o...
class ProxyError (line 56) | class ProxyError(SessionError):
class HttpStatusCodeError (line 61) | class HttpStatusCodeError(Exception):
method __init__ (line 64) | def __init__(self, message: str, status_code: int) -> None:
class HttpClientStatusCodeError (line 71) | class HttpClientStatusCodeError(HttpStatusCodeError):
class RequestHandlerError (line 76) | class RequestHandlerError(Exception, Generic[TCrawlingContext]):
method __init__ (line 79) | def __init__(self, wrapped_exception: Exception, crawling_context: TCr...
class ContextPipelineInitializationError (line 86) | class ContextPipelineInitializationError(Exception):
method __init__ (line 92) | def __init__(self, wrapped_exception: Exception, crawling_context: Bas...
class ContextPipelineFinalizationError (line 99) | class ContextPipelineFinalizationError(Exception):
method __init__ (line 105) | def __init__(self, wrapped_exception: Exception, crawling_context: Bas...
class ContextPipelineInterruptedError (line 112) | class ContextPipelineInterruptedError(Exception):
class RequestCollisionError (line 117) | class RequestCollisionError(Exception):
FILE: src/crawlee/events/_event_manager.py
class EventManagerOptions (line 41) | class EventManagerOptions(TypedDict):
class EventManager (line 55) | class EventManager:
method __init__ (line 63) | def __init__(
method active (line 100) | def active(self) -> bool:
method __aenter__ (line 104) | async def __aenter__(self) -> EventManager:
method __aexit__ (line 117) | async def __aexit__(
method on (line 143) | def on(self, *, event: Literal[Event.PERSIST_STATE], listener: EventLi...
method on (line 145) | def on(self, *, event: Literal[Event.SYSTEM_INFO], listener: EventList...
method on (line 147) | def on(self, *, event: Literal[Event.MIGRATING], listener: EventListen...
method on (line 149) | def on(self, *, event: Literal[Event.ABORTING], listener: EventListene...
method on (line 151) | def on(self, *, event: Literal[Event.EXIT], listener: EventListener[Ev...
method on (line 153) | def on(self, *, event: Literal[Event.CRAWLER_STATUS], listener: EventL...
method on (line 155) | def on(self, *, event: Event, listener: EventListener[None]) -> None: ...
method on (line 157) | def on(self, *, event: Event, listener: EventListener[Any]) -> None:
method off (line 207) | def off(self, *, event: Event, listener: EventListener[Any] | None = N...
method emit (line 224) | def emit(self, *, event: Literal[Event.PERSIST_STATE], event_data: Eve...
method emit (line 226) | def emit(self, *, event: Literal[Event.SYSTEM_INFO], event_data: Event...
method emit (line 228) | def emit(self, *, event: Literal[Event.MIGRATING], event_data: EventMi...
method emit (line 230) | def emit(self, *, event: Literal[Event.ABORTING], event_data: EventAbo...
method emit (line 232) | def emit(self, *, event: Literal[Event.EXIT], event_data: EventExitDat...
method emit (line 234) | def emit(self, *, event: Literal[Event.CRAWLER_STATUS], event_data: Ev...
method emit (line 236) | def emit(self, *, event: Event, event_data: Any) -> None: ...
method emit (line 239) | def emit(self, *, event: Event, event_data: EventData) -> None:
method wait_for_all_listeners_to_complete (line 249) | async def wait_for_all_listeners_to_complete(self, *, timeout: timedel...
method _emit_persist_state_event (line 268) | async def _emit_persist_state_event(self) -> None:
FILE: src/crawlee/events/_local_event_manager.py
class LocalEventManager (line 26) | class LocalEventManager(EventManager):
method __init__ (line 34) | def __init__(
method from_config (line 59) | def from_config(cls, config: Configuration | None = None) -> LocalEven...
method __aenter__ (line 72) | async def __aenter__(self) -> LocalEventManager:
method __aexit__ (line 81) | async def __aexit__(
method _emit_system_info_event (line 94) | async def _emit_system_info_event(self) -> None:
FILE: src/crawlee/events/_types.py
class Event (line 15) | class Event(str, Enum):
class EventPersistStateData (line 40) | class EventPersistStateData(BaseModel):
class EventSystemInfoData (line 49) | class EventSystemInfoData(BaseModel):
class EventMigratingData (line 62) | class EventMigratingData(BaseModel):
class EventAbortingData (line 73) | class EventAbortingData(BaseModel):
class EventExitData (line 80) | class EventExitData(BaseModel):
class EventCrawlerStatusData (line 87) | class EventCrawlerStatusData(BaseModel):
FILE: src/crawlee/fingerprint_suite/_browserforge_adapter.py
class PatchedHeaderGenerator (line 30) | class PatchedHeaderGenerator(bf_HeaderGenerator):
method _get_accept_language_header (line 33) | def _get_accept_language_header(self, locales: tuple[str, ...] | list[...
method generate (line 56) | def generate(
method _contains_all_sec_headers (line 120) | def _contains_all_sec_headers(self, headers: dict[str, str]) -> bool:
method _get_expected_browser_keywords (line 123) | def _get_expected_browser_keywords(self, browser: str | None) -> set[s...
method _get_single_browser_type (line 130) | def _get_single_browser_type(self, browser: Iterable[str | Browser] | ...
class PatchedFingerprintGenerator (line 154) | class PatchedFingerprintGenerator(bf_FingerprintGenerator):
method __init__ (line 157) | def __init__(
class BrowserforgeFingerprintGenerator (line 181) | class BrowserforgeFingerprintGenerator(FingerprintGenerator):
method __init__ (line 187) | def __init__(
method generate (line 227) | def generate(self) -> bf_Fingerprint:
class BrowserforgeHeaderGenerator (line 242) | class BrowserforgeHeaderGenerator:
method __init__ (line 245) | def __init__(self) -> None:
method generate (line 248) | def generate(self, browser_type: SupportedBrowserType = 'chrome') -> d...
function get_available_header_network (line 253) | def get_available_header_network() -> dict:
function get_available_header_values (line 258) | def get_available_header_values(header_network: dict, node_name: str | s...
FILE: src/crawlee/fingerprint_suite/_fingerprint_generator.py
class FingerprintGenerator (line 13) | class FingerprintGenerator(ABC):
method generate (line 17) | def generate(self) -> Fingerprint:
FILE: src/crawlee/fingerprint_suite/_header_generator.py
function fingerprint_browser_type_from_playwright_browser_type (line 13) | def fingerprint_browser_type_from_playwright_browser_type(
class HeaderGenerator (line 26) | class HeaderGenerator:
method __init__ (line 29) | def __init__(self) -> None:
method _select_specific_headers (line 32) | def _select_specific_headers(self, all_headers: dict[str, str], header...
method get_specific_headers (line 35) | def get_specific_headers(
method get_common_headers (line 48) | def get_common_headers(self) -> HttpHeaders:
method get_random_user_agent_header (line 57) | def get_random_user_agent_header(self) -> HttpHeaders:
method get_user_agent_header (line 62) | def get_user_agent_header(
method get_sec_ch_ua_headers (line 73) | def get_sec_ch_ua_headers(
FILE: src/crawlee/fingerprint_suite/_types.py
class ScreenOptions (line 13) | class ScreenOptions(BaseModel):
class HeaderGeneratorOptions (line 31) | class HeaderGeneratorOptions(BaseModel):
FILE: src/crawlee/http_clients/_base.py
class HttpResponse (line 23) | class HttpResponse(Protocol):
method http_version (line 27) | def http_version(self) -> str:
method status_code (line 31) | def status_code(self) -> int:
method headers (line 35) | def headers(self) -> HttpHeaders:
method read (line 38) | async def read(self) -> bytes:
method read_stream (line 48) | def read_stream(self) -> AsyncIterator[bytes]:
class HttpCrawlingResult (line 63) | class HttpCrawlingResult:
class HttpClient (line 75) | class HttpClient(ABC):
method __init__ (line 79) | def __init__(
method active (line 95) | def active(self) -> bool:
method crawl (line 100) | async def crawl(
method send_request (line 128) | async def send_request(
method stream (line 160) | def stream(
method cleanup (line 193) | async def cleanup(self) -> None:
method __aenter__ (line 201) | async def __aenter__(self) -> HttpClient:
method __aexit__ (line 213) | async def __aexit__(
FILE: src/crawlee/http_clients/_curl_impersonate.py
class _EmptyCookies (line 41) | class _EmptyCookies(CurlCookies):
method get_cookies_for_curl (line 43) | def get_cookies_for_curl(self, request: CurlRequest) -> list[CurlMorsel]:
method update_cookies_from_curl (line 47) | def update_cookies_from_curl(self, morsels: list[CurlMorsel]) -> None:
class _AsyncSession (line 51) | class _AsyncSession(AsyncSession):
method __init__ (line 53) | def __init__(self, *args: Any, **kwargs: Any) -> None:
class _CurlImpersonateResponse (line 58) | class _CurlImpersonateResponse:
method __init__ (line 61) | def __init__(self, response: Response) -> None:
method http_version (line 65) | def http_version(self) -> str:
method status_code (line 84) | def status_code(self) -> int:
method headers (line 88) | def headers(self) -> HttpHeaders:
method read (line 91) | async def read(self) -> bytes:
method read_stream (line 97) | async def read_stream(self) -> AsyncGenerator[bytes, None]:
class CurlImpersonateHttpClient (line 109) | class CurlImpersonateHttpClient(HttpClient):
method __init__ (line 128) | def __init__(
method crawl (line 148) | async def crawl(
method send_request (line 189) | async def send_request(
method stream (line 230) | async def stream(
method _get_client (line 273) | def _get_client(self, proxy_url: str | None) -> AsyncSession:
method _convert_method (line 297) | def _convert_method(self, method: HttpMethod) -> CurlHttpMethod:
method _is_proxy_error (line 332) | def _is_proxy_error(error: CurlRequestError) -> bool:
method _get_cookies (line 347) | def _get_cookies(curl: Curl) -> list[Cookie]:
method cleanup (line 360) | async def cleanup(self) -> None:
FILE: src/crawlee/http_clients/_httpx.py
class _HttpxResponse (line 32) | class _HttpxResponse:
method __init__ (line 35) | def __init__(self, response: httpx.Response) -> None:
method http_version (line 39) | def http_version(self) -> str:
method status_code (line 43) | def status_code(self) -> int:
method headers (line 47) | def headers(self) -> HttpHeaders:
method read (line 50) | async def read(self) -> bytes:
method read_stream (line 55) | async def read_stream(self) -> AsyncIterator[bytes]:
class _HttpxTransport (line 63) | class _HttpxTransport(httpx.AsyncHTTPTransport):
method handle_async_request (line 72) | async def handle_async_request(self, request: httpx.Request) -> httpx....
class HttpxHttpClient (line 86) | class HttpxHttpClient(HttpClient):
method __init__ (line 107) | def __init__(
method crawl (line 143) | async def crawl(
method send_request (line 184) | async def send_request(
method stream (line 220) | async def stream(
method _build_request (line 253) | def _build_request(
method _get_client (line 278) | def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
method _combine_headers (line 320) | def _combine_headers(self, explicit_headers: HttpHeaders | None) -> Ht...
method _is_proxy_error (line 335) | def _is_proxy_error(error: httpx.TransportError) -> bool:
method cleanup (line 349) | async def cleanup(self) -> None:
FILE: src/crawlee/http_clients/_impit.py
class _ClientCacheEntry (line 33) | class _ClientCacheEntry(TypedDict):
class _ImpitResponse (line 40) | class _ImpitResponse:
method __init__ (line 43) | def __init__(self, response: Response) -> None:
method http_version (line 47) | def http_version(self) -> str:
method status_code (line 51) | def status_code(self) -> int:
method headers (line 55) | def headers(self) -> HttpHeaders:
method read (line 58) | async def read(self) -> bytes:
method read_stream (line 63) | async def read_stream(self) -> AsyncIterator[bytes]:
class ImpitHttpClient (line 72) | class ImpitHttpClient(HttpClient):
method __init__ (line 91) | def __init__(
method crawl (line 121) | async def crawl(
method send_request (line 155) | async def send_request(
method stream (line 190) | async def stream(
method _get_client (line 225) | def _get_client(self, proxy_url: str | None, cookie_jar: CookieJar | N...
method _is_proxy_error (line 257) | def _is_proxy_error(error: HTTPError) -> bool:
method cleanup (line 271) | async def cleanup(self) -> None:
FILE: src/crawlee/otel/crawler_instrumentor.py
class CrawlerInstrumentor (line 25) | class CrawlerInstrumentor(BaseInstrumentor):
method __init__ (line 28) | def __init__(
method instrumentation_dependencies (line 120) | def instrumentation_dependencies(self) -> list[str]:
method _instrument_all_public_methods (line 124) | def _instrument_all_public_methods(self, on_class: type) -> None:
method _instrument (line 144) | def _instrument(self, **_: Any) -> None:
method _uninstrument (line 148) | def _uninstrument(self, **_: Any) -> None:
FILE: src/crawlee/project_template/templates/main.py
function main (line 32) | async def main() -> None:
FILE: src/crawlee/project_template/templates/main_playwright_camoufox.py
class CamoufoxPlugin (line 13) | class CamoufoxPlugin(PlaywrightBrowserPlugin):
method new_browser (line 19) | async def new_browser(self) -> PlaywrightBrowserController:
FILE: src/crawlee/project_template/templates/routes_beautifulsoup.py
function default_handler (line 8) | async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
FILE: src/crawlee/project_template/templates/routes_parsel.py
function default_handler (line 8) | async def default_handler(context: ParselCrawlingContext) -> None:
FILE: src/crawlee/project_template/templates/routes_playwright.py
function default_handler (line 8) | async def default_handler(context: PlaywrightCrawlingContext) -> None:
FILE: src/crawlee/proxy_configuration.py
class ProxyInfo (line 26) | class ProxyInfo:
class ProxyConfiguration (line 56) | class ProxyConfiguration:
method __init__ (line 68) | def __init__(
method _create_url (line 106) | def _create_url(self, url: str | None) -> URL | None:
method new_proxy_info (line 114) | async def new_proxy_info(
method new_url (line 159) | async def new_url(
method _pick_url (line 176) | async def _pick_url(
class _ProxyTierTracker (line 224) | class _ProxyTierTracker:
method __init__ (line 227) | def __init__(self, tiered_proxy_urls: list[list[URL | None]]) -> None:
method all_urls (line 233) | def all_urls(self) -> Sequence[URL | None]:
method get_tier_urls (line 236) | def get_tier_urls(self, tier_number: int) -> Sequence[URL | None]:
method add_error (line 239) | def add_error(self, domain: str, tier: int) -> None:
method predict_tier (line 242) | def predict_tier(self, domain: str) -> int:
class _NewUrlFunction (line 263) | class _NewUrlFunction(Protocol):
method __call__ (line 264) | def __call__(
FILE: src/crawlee/request_loaders/_request_list.py
class RequestListState (line 19) | class RequestListState(BaseModel):
class RequestListData (line 27) | class RequestListData(BaseModel):
class RequestList (line 32) | class RequestList(RequestLoader):
method __init__ (line 35) | def __init__(
method _get_state (line 92) | async def _get_state(self) -> RequestListState:
method name (line 140) | def name(self) -> str | None:
method get_handled_count (line 144) | async def get_handled_count(self) -> int:
method get_total_count (line 148) | async def get_total_count(self) -> int:
method is_empty (line 152) | async def is_empty(self) -> bool:
method is_finished (line 157) | async def is_finished(self) -> bool:
method fetch_next_request (line 162) | async def fetch_next_request(self) -> Request | None:
method mark_request_as_handled (line 184) | async def mark_request_as_handled(self, request: Request) -> None:
method _ensure_next_request (line 189) | async def _ensure_next_request(self) -> None:
method _dequeue_requests (line 204) | async def _dequeue_requests(self, count: int) -> AsyncGenerator[Reques...
method _iterate_in_threadpool (line 211) | async def _iterate_in_threadpool(self, iterable: Iterable[str | Reques...
FILE: src/crawlee/request_loaders/_request_loader.py
class RequestLoader (line 17) | class RequestLoader(ABC):
method get_handled_count (line 29) | async def get_handled_count(self) -> int:
method get_total_count (line 33) | async def get_total_count(self) -> int:
method is_empty (line 37) | async def is_empty(self) -> bool:
method is_finished (line 41) | async def is_finished(self) -> bool:
method fetch_next_request (line 45) | async def fetch_next_request(self) -> Request | None:
method mark_request_as_handled (line 53) | async def mark_request_as_handled(self, request: Request) -> Processed...
method to_tandem (line 56) | async def to_tandem(self, request_manager: RequestManager | None = Non...
method _transform_request (line 72) | def _transform_request(self, request: str | Request) -> Request:
method _transform_requests (line 82) | def _transform_requests(self, requests: Sequence[str | Request]) -> li...
FILE: src/crawlee/request_loaders/_request_manager.py
class RequestManager (line 18) | class RequestManager(RequestLoader, ABC):
method drop (line 22) | async def drop(self) -> None:
method add_request (line 26) | async def add_request(
method add_requests (line 43) | async def add_requests(
method reclaim_request (line 71) | async def reclaim_request(self, request: Request, *, forefront: bool =...
FILE: src/crawlee/request_loaders/_request_manager_tandem.py
class RequestManagerTandem (line 24) | class RequestManagerTandem(RequestManager):
method __init__ (line 31) | def __init__(self, request_loader: RequestLoader, request_manager: Req...
method get_handled_count (line 36) | async def get_handled_count(self) -> int:
method get_total_count (line 40) | async def get_total_count(self) -> int:
method is_empty (line 44) | async def is_empty(self) -> bool:
method is_finished (line 48) | async def is_finished(self) -> bool:
method add_request (line 52) | async def add_request(self, request: str | Request, *, forefront: bool...
method add_requests (line 56) | async def add_requests(
method fetch_next_request (line 76) | async def fetch_next_request(self) -> Request | None:
method reclaim_request (line 99) | async def reclaim_request(self, request: Request, *, forefront: bool =...
method mark_request_as_handled (line 103) | async def mark_request_as_handled(self, request: Request) -> None:
method drop (line 107) | async def drop(self) -> None:
FILE: src/crawlee/request_loaders/_sitemap_request_loader.py
class SitemapRequestLoaderState (line 33) | class SitemapRequestLoaderState(BaseModel):
class SitemapRequestLoader (line 91) | class SitemapRequestLoader(RequestLoader):
method __init__ (line 106) | def __init__(
method _get_state (line 161) | async def _get_state(self) -> SitemapRequestLoaderState:
method _check_url_patterns (line 189) | def _check_url_patterns(
method _load_sitemaps (line 219) | async def _load_sitemaps(self) -> None:
method get_total_count (line 288) | async def get_total_count(self) -> int:
method get_handled_count (line 294) | async def get_handled_count(self) -> int:
method is_empty (line 300) | async def is_empty(self) -> bool:
method is_finished (line 306) | async def is_finished(self) -> bool:
method fetch_next_request (line 312) | async def fetch_next_request(self) -> Request | None:
method mark_request_as_handled (line 340) | async def mark_request_as_handled(self, request: Request) -> Processed...
method abort_loading (line 348) | async def abort_loading(self) -> None:
method start (line 355) | async def start(self) -> None:
method close (line 361) | async def close(self) -> None:
method __aenter__ (line 366) | async def __aenter__(self) -> SitemapRequestLoader:
method __aexit__ (line 371) | async def __aexit__(
FILE: src/crawlee/router.py
class Router (line 20) | class Router(Generic[TCrawlingContext]):
method __init__ (line 59) | def __init__(self) -> None:
method default_handler (line 63) | def default_handler(self: Router, handler: RequestHandler[TCrawlingCon...
method handler (line 76) | def handler(
method __call__ (line 94) | async def __call__(self, context: TCrawlingContext) -> None:
FILE: src/crawlee/sessions/_cookies.py
class CookieParam (line 17) | class CookieParam(TypedDict, total=False):
class PlaywrightCookieParam (line 45) | class PlaywrightCookieParam(TypedDict, total=False):
class SessionCookies (line 60) | class SessionCookies:
method __init__ (line 63) | def __init__(self, cookies: SessionCookies | CookieJar | dict[str, str...
method jar (line 84) | def jar(self) -> CookieJar:
method set (line 88) | def set(
method _convert_cookie_to_dict (line 138) | def _convert_cookie_to_dict(self, cookie: Cookie) -> CookieParam:
method _to_playwright (line 161) | def _to_playwright(self, cookie_dict: CookieParam) -> PlaywrightCookie...
method _from_playwright (line 174) | def _from_playwright(self, cookie_dict: PlaywrightCookieParam) -> Cook...
method get_cookies_as_dicts (line 188) | def get_cookies_as_dicts(self) -> list[CookieParam]:
method store_cookie (line 192) | def store_cookie(self, cookie: Cookie) -> None:
method store_cookies (line 200) | def store_cookies(self, cookies: list[Cookie]) -> None:
method set_cookies (line 210) | def set_cookies(self, cookie_dicts: list[CookieParam]) -> None:
method get_cookies_as_playwright_format (line 220) | def get_cookies_as_playwright_format(self) -> list[PlaywrightCookiePar...
method set_cookies_from_playwright_format (line 224) | def set_cookies_from_playwright_format(self, pw_cookies: list[Playwrig...
method __deepcopy__ (line 231) | def __deepcopy__(self, memo: dict[int, Any] | None) -> SessionCookies:
method __len__ (line 236) | def __len__(self) -> int:
method __setitem__ (line 239) | def __setitem__(self, name: str, value: str) -> None:
method __getitem__ (line 242) | def __getitem__(self, name: str) -> str | None:
method __iter__ (line 248) | def __iter__(self) -> Iterator[CookieParam]:
method __repr__ (line 251) | def __repr__(self) -> str:
method __bool__ (line 257) | def __bool__(self) -> bool:
method __eq__ (line 262) | def __eq__(self, other: object) -> bool:
method __hash__ (line 274) | def __hash__(self) -> int:
method _is_valid_same_site (line 279) | def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal[...
FILE: src/crawlee/sessions/_models.py
class SessionModel (line 20) | class SessionModel(BaseModel):
class SessionPoolModel (line 38) | class SessionPoolModel(BaseModel):
method session_count (line 68) | def session_count(self) -> int:
method usable_session_count (line 74) | def usable_session_count(self) -> int:
method retired_session_count (line 80) | def retired_session_count(self) -> int:
FILE: src/crawlee/sessions/_session.py
class Session (line 22) | class Session:
method __init__ (line 34) | def __init__(
method from_model (line 77) | def from_model(cls, model: SessionModel) -> Session:
method __repr__ (line 82) | def __repr__(self) -> str:
method __eq__ (line 86) | def __eq__(self, other: object) -> bool:
method __hash__ (line 92) | def __hash__(self) -> int:
method id (line 115) | def id(self) -> str:
method user_data (line 120) | def user_data(self) -> dict:
method cookies (line 125) | def cookies(self) -> SessionCookies:
method error_score (line 130) | def error_score(self) -> float:
method usage_count (line 135) | def usage_count(self) -> float:
method expires_at (line 140) | def expires_at(self) -> datetime:
method is_blocked (line 145) | def is_blocked(self) -> bool:
method is_expired (line 150) | def is_expired(self) -> bool:
method is_max_usage_count_reached (line 155) | def is_max_usage_count_reached(self) -> bool:
method is_usable (line 160) | def is_usable(self) -> bool:
method get_state (line 165) | def get_state(self, *, as_dict: Literal[True]) -> dict: ...
method get_state (line 168) | def get_state(self, *, as_dict: Literal[False]) -> SessionModel: ...
method get_state (line 170) | def get_state(self, *, as_dict: bool = False) -> SessionModel | dict:
method mark_good (line 191) | def mark_good(self) -> None:
method mark_bad (line 202) | def mark_bad(self) -> None:
method retire (line 211) | def retire(self) -> None:
method is_blocked_status_code (line 223) | def is_blocked_status_code(
FILE: src/crawlee/sessions/_session_pool.py
class SessionPool (line 28) | class SessionPool:
method __init__ (line 36) | def __init__(
method __repr__ (line 86) | def __repr__(self) -> str:
method session_count (line 91) | def session_count(self) -> int:
method usable_session_count (line 96) | def usable_session_count(self) -> int:
method retired_session_count (line 101) | def retired_session_count(self) -> int:
method active (line 106) | def active(self) -> bool:
method __aenter__ (line 110) | async def __aenter__(self) -> SessionPool:
method __aexit__ (line 130) | async def __aexit__(
method get_state (line 149) | def get_state(self, *, as_dict: Literal[True]) -> dict: ...
method get_state (line 152) | def get_state(self, *, as_dict: Literal[False]) -> SessionPoolModel: ...
method get_state (line 155) | def get_state(self, *, as_dict: bool = False) -> SessionPoolModel | dict:
method add_session (line 163) | def add_session(self, session: Session) -> None:
method get_session (line 180) | async def get_session(self) -> Session:
method get_session_by_id (line 200) | async def get_session_by_id(self, session_id: str) -> Session | None:
method reset_store (line 225) | async def reset_store(self) -> None:
method _create_new_session (line 229) | async def _create_new_session(self) -> Session:
method _fill_sessions_to_max (line 238) | async def _fill_sessions_to_max(self) -> None:
method _get_random_session (line 243) | def _get_random_session(self) -> Session:
method _remove_retired_sessions (line 250) | def _remove_retired_sessions(self) -> None:
FILE: src/crawlee/statistics/_error_snapshotter.py
class ErrorSnapshotter (line 15) | class ErrorSnapshotter:
method __init__ (line 23) | def __init__(self, *, snapshot_kvs_name: str | None = None) -> None:
method capture_snapshot (line 26) | async def capture_snapshot(
method _save_html (line 61) | async def _save_html(self, kvs: KeyValueStore, html: str, base_name: s...
method _save_screenshot (line 65) | async def _save_screenshot(self, kvs: KeyValueStore, screenshot: bytes...
method _sanitize_filename (line 69) | def _sanitize_filename(self, filename: str) -> str:
method _get_snapshot_base_name (line 72) | def _get_snapshot_base_name(self, error_message: str, file_and_line: s...
FILE: src/crawlee/statistics/_error_tracker.py
class ErrorTracker (line 23) | class ErrorTracker:
method __init__ (line 26) | def __init__(
method add (line 46) | async def add(
method _capture_error_snapshot (line 108) | async def _capture_error_snapshot(
method _get_file_and_line (line 119) | def _get_file_and_line(self, error: Exception) -> str:
method _get_error_message (line 126) | def _get_error_message(self, error: Exception) -> str:
method unique_error_count (line 136) | def unique_error_count(self) -> int:
method total (line 145) | def total(self) -> int:
method get_most_common_errors (line 153) | def get_most_common_errors(self, n: int = 3) -> list[tuple[str | None,...
method _get_error_repr (line 164) | def _get_error_repr(self, file_and_line: str | None, name: str | None,...
method _create_generic_message (line 172) | def _create_generic_message(message_1: str | None, message_2: str | No...
FILE: src/crawlee/statistics/_models.py
class FinalStatistics (line 22) | class FinalStatistics:
method to_table (line 36) | def to_table(self) -> str:
method to_dict (line 47) | def to_dict(self) -> dict[str, float | int | list[int]]:
method __str__ (line 51) | def __str__(self) -> str:
class StatisticsState (line 58) | class StatisticsState(BaseModel):
method model_post_init (line 110) | def model_post_init(self, /, __context: Any) -> None:
method crawler_runtime (line 114) | def crawler_runtime(self) -> timedelta:
method crawler_runtime (line 121) | def crawler_runtime(self, value: timedelta) -> None:
method crawler_runtime_for_serialization (line 132) | def crawler_runtime_for_serialization(self) -> timedelta:
method request_total_duration (line 140) | def request_total_duration(self) -> timedelta:
method request_avg_failed_duration (line 145) | def request_avg_failed_duration(self) -> timedelta | None:
method request_avg_finished_duration (line 150) | def request_avg_finished_duration(self) -> timedelta | None:
method requests_total (line 155) | def requests_total(self) -> int:
FILE: src/crawlee/statistics/_statistics.py
class RequestProcessingRecord (line 31) | class RequestProcessingRecord:
method __init__ (line 34) | def __init__(self) -> None:
method run (line 39) | def run(self) -> int:
method finish (line 45) | def finish(self) -> timedelta:
method retry_count (line 54) | def retry_count(self) -> int:
class Statistics (line 60) | class Statistics(Generic[TStatisticsState]):
method __init__ (line 71) | def __init__(
method replace_state_model (line 113) | def replace_state_model(self, state_model: type[TNewStatisticsState]) ...
method with_default_state (line 127) | def with_default_state(
method active (line 154) | def active(self) -> bool:
method __aenter__ (line 158) | async def __aenter__(self) -> Self:
method __aexit__ (line 180) | async def __aexit__(
method state (line 204) | def state(self) -> TStatisticsState:
method register_status_code (line 208) | def register_status_code(self, code: int) -> None:
method record_request_processing_start (line 215) | def record_request_processing_start(self, request_id_or_key: str) -> N...
method record_request_processing_finish (line 222) | def record_request_processing_finish(self, request_id_or_key: str) -> ...
method record_request_processing_failure (line 244) | def record_request_processing_failure(self, request_id_or_key: str) ->...
method calculate (line 258) | def calculate(self) -> FinalStatistics:
method reset (line 277) | async def reset(self) -> None:
method _log (line 284) | def _log(self) -> None:
method _save_retry_count_for_request (line 291) | def _save_retry_count_for_request(self, record: RequestProcessingRecor...
FILE: src/crawlee/storage_clients/_base/_dataset_client.py
class DatasetClient (line 13) | class DatasetClient(ABC):
method get_metadata (line 27) | async def get_metadata(self) -> DatasetMetadata:
method drop (line 31) | async def drop(self) -> None:
method purge (line 38) | async def purge(self) -> None:
method push_data (line 45) | async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
method get_data (line 52) | async def get_data(
method iterate_items (line 73) | async def iterate_items(
FILE: src/crawlee/storage_clients/_base/_key_value_store_client.py
class KeyValueStoreClient (line 12) | class KeyValueStoreClient(ABC):
method get_metadata (line 26) | async def get_metadata(self) -> KeyValueStoreMetadata:
method drop (line 30) | async def drop(self) -> None:
method purge (line 37) | async def purge(self) -> None:
method get_value (line 44) | async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
method set_value (line 51) | async def set_value(self, *, key: str, value: Any, content_type: str |...
method delete_value (line 58) | async def delete_value(self, *, key: str) -> None:
method iterate_keys (line 65) | async def iterate_keys(
method get_public_url (line 82) | async def get_public_url(self, *, key: str) -> str:
method record_exists (line 89) | async def record_exists(self, *, key: str) -> bool:
FILE: src/crawlee/storage_clients/_base/_request_queue_client.py
class RequestQueueClient (line 13) | class RequestQueueClient(ABC):
method get_metadata (line 21) | async def get_metadata(self) -> RequestQueueMetadata:
method drop (line 25) | async def drop(self) -> None:
method purge (line 32) | async def purge(self) -> None:
method add_batch_of_requests (line 39) | async def add_batch_of_requests(
method get_request (line 66) | async def get_request(self, unique_key: str) -> Request | None:
method fetch_next_request (line 77) | async def fetch_next_request(self) -> Request | None:
method mark_request_as_handled (line 94) | async def mark_request_as_handled(self, request: Request) -> Processed...
method reclaim_request (line 107) | async def reclaim_request(
method is_empty (line 126) | async def is_empty(self) -> bool:
FILE: src/crawlee/storage_clients/_base/_storage_client.py
class StorageClient (line 19) | class StorageClient(ABC):
method get_storage_client_cache_key (line 33) | def get_storage_client_cache_key(self, configuration: Configuration) -...
method create_dataset_client (line 42) | async def create_dataset_client(
method create_kvs_client (line 53) | async def create_kvs_client(
method create_rq_client (line 64) | async def create_rq_client(
method get_rate_limit_errors (line 74) | def get_rate_limit_errors(self) -> dict[int, int]:
method _purge_if_needed (line 78) | async def _purge_if_needed(
FILE: src/crawlee/storage_clients/_file_system/_dataset_client.py
class FileSystemDatasetClient (line 29) | class FileSystemDatasetClient(DatasetClient):
method __init__ (line 56) | def __init__(
method get_metadata (line 76) | async def get_metadata(self) -> DatasetMetadata:
method path_to_dataset (line 80) | def path_to_dataset(self) -> Path:
method path_to_metadata (line 85) | def path_to_metadata(self) -> Path:
method open (line 90) | async def open(
method drop (line 205) | async def drop(self) -> None:
method purge (line 211) | async def purge(self) -> None:
method push_data (line 223) | async def push_data(self, data: list[dict[str, Any]] | dict[str, Any])...
method get_data (line 242) | async def get_data(
method iterate_items (line 341) | async def iterate_items(
method _update_metadata (line 414) | async def _update_metadata(
method _push_item (line 444) | async def _push_item(self, item: dict[str, Any], item_id: int) -> None:
method _get_sorted_data_files (line 465) | async def _get_sorted_data_files(self) -> list[Path]:
FILE: src/crawlee/storage_clients/_file_system/_key_value_store_client.py
class FileSystemKeyValueStoreClient (line 32) | class FileSystemKeyValueStoreClient(KeyValueStoreClient):
method __init__ (line 56) | def __init__(
method get_metadata (line 76) | async def get_metadata(self) -> KeyValueStoreMetadata:
method path_to_kvs (line 80) | def path_to_kvs(self) -> Path:
method path_to_metadata (line 85) | def path_to_metadata(self) -> Path:
method open (line 90) | async def open(
method drop (line 204) | async def drop(self) -> None:
method purge (line 211) | async def purge(self) -> None:
method get_value (line 224) | async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
method set_value (line 303) | async def set_value(self, *, key: str, value: Any, content_type: str |...
method delete_value (line 344) | async def delete_value(self, *, key: str) -> None:
method iterate_keys (line 367) | async def iterate_keys(
method get_public_url (line 428) | async def get_public_url(self, *, key: str) -> str:
method record_exists (line 442) | async def record_exists(self, *, key: str) -> bool:
method _update_metadata (line 461) | async def _update_metadata(
method _encode_key (line 487) | def _encode_key(self, key: str) -> str:
method _decode_key (line 491) | def _decode_key(self, encoded_key: str) -> str:
FILE: src/crawlee/storage_clients/_file_system/_request_queue_client.py
class RequestQueueState (line 40) | class RequestQueueState(BaseModel):
class FileSystemRequestQueueClient (line 62) | class FileSystemRequestQueueClient(RequestQueueClient):
method __init__ (line 91) | def __init__(
method get_metadata (line 124) | async def get_metadata(self) -> RequestQueueMetadata:
method path_to_rq (line 128) | def path_to_rq(self) -> Path:
method path_to_metadata (line 133) | def path_to_metadata(self) -> Path:
method _create_recoverable_state (line 138) | async def _create_recoverable_state(cls, id: str, configuration: Confi...
method open (line 154) | async def open(
method drop (line 283) | async def drop(self) -> None:
method purge (line 299) | async def purge(self) -> None:
method add_batch_of_requests (line 323) | async def add_batch_of_requests(
method get_request (line 448) | async def get_request(self, unique_key: str) -> Request | None:
method fetch_next_request (line 461) | async def fetch_next_request(self) -> Request | None:
method mark_request_as_handled (line 484) | async def mark_request_as_handled(self, request: Request) -> Processed...
method reclaim_request (line 527) | async def reclaim_request(
method is_empty (line 588) | async def is_empty(self) -> bool:
method _get_request_path (line 624) | def _get_request_path(self, unique_key: str) -> Path:
method _update_metadata (line 635) | async def _update_metadata(
method _refresh_cache (line 685) | async def _refresh_cache(self) -> None:
method _get_request_files (line 747) | async def _get_request_files(cls, path_to_rq: Path) -> list[Path]:
method _parse_request_file (line 768) | async def _parse_request_file(cls, file_path: Path) -> Request | None:
method _discover_existing_requests (line 800) | async def _discover_existing_requests(self) -> None:
method _get_file_base_name_from_unique_key (line 821) | def _get_file_base_name_from_unique_key(unique_key: str) -> str:
FILE: src/crawlee/storage_clients/_file_system/_storage_client.py
class FileSystemStorageClient (line 20) | class FileSystemStorageClient(StorageClient):
method get_storage_client_cache_key (line 38) | def get_storage_client_cache_key(self, configuration: Configuration) -...
method create_dataset_client (line 43) | async def create_dataset_client(
method create_kvs_client (line 57) | async def create_kvs_client(
method create_rq_client (line 71) | async def create_rq_client(
FILE: src/crawlee/storage_clients/_memory/_dataset_client.py
class MemoryDatasetClient (line 20) | class MemoryDatasetClient(DatasetClient):
method __init__ (line 33) | def __init__(
method get_metadata (line 48) | async def get_metadata(self) -> DatasetMetadata:
method open (line 52) | async def open(
method drop (line 98) | async def drop(self) -> None:
method purge (line 107) | async def purge(self) -> None:
method push_data (line 116) | async def push_data(self, data: list[dict[str, Any]] | dict[str, Any])...
method get_data (line 135) | async def get_data(
method iterate_items (line 194) | async def iterate_items(
method _update_metadata (line 240) | async def _update_metadata(
method _push_item (line 263) | async def _push_item(self, item: dict[str, Any]) -> None:
FILE: src/crawlee/storage_clients/_memory/_key_value_store_client.py
class MemoryKeyValueStoreClient (line 19) | class MemoryKeyValueStoreClient(KeyValueStoreClient):
method __init__ (line 31) | def __init__(
method get_metadata (line 46) | async def get_metadata(self) -> KeyValueStoreMetadata:
method open (line 50) | async def open(
method drop (line 95) | async def drop(self) -> None:
method purge (line 100) | async def purge(self) -> None:
method get_value (line 105) | async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
method set_value (line 112) | async def set_value(self, *, key: str, value: Any, content_type: str |...
method delete_value (line 129) | async def delete_value(self, *, key: str) -> None:
method iterate_keys (line 135) | async def iterate_keys(
method get_public_url (line 164) | async def get_public_url(self, *, key: str) -> str:
method record_exists (line 168) | async def record_exists(self, *, key: str) -> bool:
method _update_metadata (line 172) | async def _update_metadata(
FILE: src/crawlee/storage_clients/_memory/_request_queue_client.py
class MemoryRequestQueueClient (line 23) | class MemoryRequestQueueClient(RequestQueueClient):
method __init__ (line 34) | def __init__(
method get_metadata (line 58) | async def get_metadata(self) -> RequestQueueMetadata:
method open (line 62) | async def open(
method drop (line 111) | async def drop(self) -> None:
method purge (line 126) | async def purge(self) -> None:
method add_batch_of_requests (line 141) | async def add_batch_of_requests(
method fetch_next_request (line 232) | async def fetch_next_request(self) -> Request | None:
method get_request (line 251) | async def get_request(self, unique_key: str) -> Request | None:
method mark_request_as_handled (line 256) | async def mark_request_as_handled(self, request: Request) -> Processed...
method reclaim_request (line 288) | async def reclaim_request(
method is_empty (line 317) | async def is_empty(self) -> bool:
method _update_metadata (line 328) | async def _update_metadata(
FILE: src/crawlee/storage_clients/_memory/_storage_client.py
class MemoryStorageClient (line 15) | class MemoryStorageClient(StorageClient):
method create_dataset_client (line 31) | async def create_dataset_client(
method create_kvs_client (line 45) | async def create_kvs_client(
method create_rq_client (line 59) | async def create_rq_client(
FILE: src/crawlee/storage_clients/_redis/_client_mixin.py
class MetadataUpdateParams (line 27) | class MetadataUpdateParams(TypedDict, total=False):
class RedisClientMixin (line 34) | class RedisClientMixin:
method __init__ (line 49) | def __init__(self, storage_name: str, storage_id: str, redis: Redis) -...
method redis (line 57) | def redis(self) -> Redis:
method metadata_key (line 62) | def metadata_key(self) -> str:
method _get_metadata_by_name (line 67) | async def _get_metadata_by_name(cls, name: str, redis: Redis, *, with_...
method _get_metadata_name_by_id (line 88) | async def _get_metadata_name_by_id(cls, id: str, redis: Redis) -> str ...
method _open (line 103) | async def _open(
method _load_scripts (line 167) | async def _load_scripts(self) -> None:
method _ensure_scripts_loaded (line 171) | async def _ensure_scripts_loaded(self) -> None:
method _get_pipeline (line 178) | async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIt...
method _create_storage (line 188) | async def _create_storage(self, pipeline: Pipeline) -> None:
method _create_script (line 191) | async def _create_script(self, script_name: str) -> AsyncScript:
method _create_metadata_and_storage (line 197) | async def _create_metadata_and_storage(self, storage_name: str, metada...
method _drop (line 220) | async def _drop(self, extra_keys: list[str]) -> None:
method _purge (line 229) | async def _purge(self, extra_keys: list[str], metadata_kwargs: Metadat...
method _get_metadata (line 237) | async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -...
method _get_metadata (line 239) | async def _get_metadata(self, metadata_model: type[KeyValueStoreMetada...
method _get_metadata (line 241) | async def _get_metadata(self, metadata_model: type[RequestQueueMetadat...
method _get_metadata (line 243) | async def _get_metadata(
method _specific_update_metadata (line 255) | async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs...
method _update_metadata (line 265) | async def _update_metadata(
FILE: src/crawlee/storage_clients/_redis/_dataset_client.py
class _DatasetMetadataUpdateParams (line 23) | class _DatasetMetadataUpdateParams(MetadataUpdateParams):
class RedisDatasetClient (line 30) | class RedisDatasetClient(DatasetClient, RedisClientMixin):
method __init__ (line 54) | def __init__(self, storage_name: str, storage_id: str, redis: Redis) -...
method _items_key (line 67) | def _items_key(self) -> str:
method open (line 72) | async def open(
method get_metadata (line 106) | async def get_metadata(self) -> DatasetMetadata:
method drop (line 110) | async def drop(self) -> None:
method purge (line 114) | async def purge(self) -> None:
method push_data (line 123) | async def push_data(self, data: list[dict[str, Any]] | dict[str, Any])...
method get_data (line 137) | async def get_data(
method iterate_items (line 221) | async def iterate_items(
method _create_storage (line 303) | async def _create_storage(self, pipeline: Pipeline) -> None:
method _specific_update_metadata (line 309) | async def _specific_update_metadata(
FILE: src/crawlee/storage_clients/_redis/_key_value_store_client.py
class RedisKeyValueStoreClient (line 24) | class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
method __init__ (line 52) | def __init__(self, storage_name: str, storage_id: str, redis: Redis) -...
method _items_key (line 60) | def _items_key(self) -> str:
method _metadata_items_key (line 65) | def _metadata_items_key(self) -> str:
method open (line 70) | async def open(
method get_metadata (line 104) | async def get_metadata(self) -> KeyValueStoreMetadata:
method drop (line 108) | async def drop(self) -> None:
method purge (line 112) | async def purge(self) -> None:
method set_value (line 119) | async def set_value(self, *, key: str, value: Any, content_type: str |...
method get_value (line 159) | async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
method delete_value (line 204) | async def delete_value(self, *, key: str) -> None:
method iterate_keys (line 211) | async def iterate_keys(
method get_public_url (line 251) | async def get_public_url(self, *, key: str) -> str:
method record_exists (line 255) | async def record_exists(self, *, key: str) -> bool:
FILE: src/crawlee/storage_clients/_redis/_request_queue_client.py
class _QueueMetadataUpdateParams (line 29) | class _QueueMetadataUpdateParams(MetadataUpdateParams):
class RedisRequestQueueClient (line 42) | class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin):
method __init__ (line 87) | def __init__(
method _added_filter_key (line 121) | def _added_filter_key(self) -> str:
method _handled_filter_key (line 128) | def _handled_filter_key(self) -> str:
method _pending_set_key (line 135) | def _pending_set_key(self) -> str:
method _handled_set_key (line 142) | def _handled_set_key(self) -> str:
method _queue_key (line 149) | def _queue_key(self) -> str:
method _data_key (line 154) | def _data_key(self) -> str:
method _in_progress_key (line 159) | def _in_progress_key(self) -> str:
method open (line 164) | async def open(
method get_metadata (line 211) | async def get_metadata(self) -> RequestQueueMetadata:
method drop (line 215) | async def drop(self) -> None:
method purge (line 226) | async def purge(self) -> None:
method add_batch_of_requests (line 246) | async def add_batch_of_requests(
method fetch_next_request (line 353) | async def fetch_next_request(self) -> Request | None:
method get_request (line 381) | async def get_request(self, unique_key: str) -> Request | None:
method mark_request_as_handled (line 390) | async def mark_request_as_handled(self, request: Request) -> Processed...
method reclaim_request (line 428) | async def reclaim_request(
method is_empty (line 473) | async def is_empty(self) -> bool:
method _load_scripts (line 491) | async def _load_scripts(self) -> None:
method _create_storage (line 501) | async def _create_storage(self, pipeline: Pipeline) -> None:
method _reclaim_stale_requests (line 515) | async def _reclaim_stale_requests(self) -> None:
method _specific_update_metadata (line 527) | async def _specific_update_metadata(
FILE: src/crawlee/storage_clients/_redis/_storage_client.py
class RedisStorageClient (line 19) | class RedisStorageClient(StorageClient):
method __init__ (line 39) | def __init__(
method create_dataset_client (line 87) | async def create_dataset_client(
method create_kvs_client (line 108) | async def create_kvs_client(
method create_rq_client (line 129) | async def create_rq_client(
FILE: src/crawlee/storage_clients/_redis/_utils.py
function await_redis_response (line 9) | async def await_redis_response(response: Awaitable[T]) -> T: ...
function await_redis_response (line 11) | async def await_redis_response(response: T) -> T: ...
function await_redis_response (line 14) | async def await_redis_response(response: Awaitable[T] | T) -> T:
function read_lua_script (line 21) | def read_lua_script(script_name: str) -> str:
FILE: src/crawlee/storage_clients/_sql/_client_mixin.py
class MetadataUpdateParams (line 45) | class MetadataUpdateParams(TypedDict, total=False):
class SqlClientMixin (line 52) | class SqlClientMixin(ABC):
method __init__ (line 78) | def __init__(self, *, id: str, storage_client: SqlStorageClient) -> None:
method _open (line 83) | async def _open(
method _safely_open (line 138) | async def _safely_open(
method get_session (line 197) | async def get_session(self, *, with_simple_commit: bool = False) -> As...
method _build_insert_stmt_with_ignore (line 211) | def _build_insert_stmt_with_ignore(
method _build_upsert_stmt (line 236) | def _build_upsert_stmt(
method _purge (line 274) | async def _purge(self, metadata_kwargs: MetadataUpdateParams) -> None:
method _drop (line 288) | async def _drop(self) -> None:
method _get_metadata (line 305) | async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -...
method _get_metadata (line 307) | async def _get_metadata(self, metadata_model: type[KeyValueStoreMetada...
method _get_metadata (line 309) | async def _get_metadata(self, metadata_model: type[RequestQueueMetadat...
method _get_metadata (line 311) | async def _get_metadata(
method _specific_update_metadata (line 326) | def _specific_update_metadata(self, **kwargs: Any) -> dict[str, Any]:
method _prepare_buffer_data (line 336) | def _prepare_buffer_data(self, **kwargs: Any) -> dict[str, Any]:
method _apply_buffer_updates (line 340) | async def _apply_buffer_updates(self, session: AsyncSession, max_buffe...
method _update_metadata (line 348) | async def _update_metadata(
method _add_buffer_record (line 381) | async def _add_buffer_record(
method _try_acquire_buffer_lock (line 405) | async def _try_acquire_buffer_lock(self, session: AsyncSession) -> bool:
method _release_buffer_lock (line 467) | async def _release_buffer_lock(self, session: AsyncSession) -> None:
method _has_pending_buffer_updates (line 477) | async def _has_pending_buffer_updates(self, session: AsyncSession) -> ...
method _process_buffers (line 494) | async def _process_buffers(self) -> None:
FILE: src/crawlee/storage_clients/_sql/_dataset_client.py
class _DatasetMetadataUpdateParams (line 30) | class _DatasetMetadataUpdateParams(MetadataUpdateParams):
class SqlDatasetClient (line 37) | class SqlDatasetClient(DatasetClient, SqlClientMixin):
method __init__ (line 68) | def __init__(
method open (line 81) | async def open(
method get_metadata (line 113) | async def get_metadata(self) -> DatasetMetadata:
method drop (line 118) | async def drop(self) -> None:
method purge (line 126) | async def purge(self) -> None:
method push_data (line 141) | async def push_data(self, data: list[dict[str, Any]] | dict[str, Any])...
method get_data (line 154) | async def get_data(
method iterate_items (line 201) | async def iterate_items(
method _prepare_get_stmt (line 234) | def _prepare_get_stmt(
method _specific_update_metadata (line 279) | def _specific_update_metadata(
method _prepare_buffer_data (line 303) | def _prepare_buffer_data(self, delta_item_count: int | None = None, **...
method _apply_buffer_updates (line 316) | async def _apply_buffer_updates(self, session: AsyncSession, max_buffe...
FILE: src/crawlee/storage_clients/_sql/_db_models.py
class AwareDateTime (line 17) | class AwareDateTime(TypeDecorator):
method process_result_value (line 28) | def process_result_value(self, value: datetime | None, dialect: Dialec...
class JsonField (line 35) | class JsonField(TypeDecorator):
method load_dialect_impl (line 41) | def load_dialect_impl(self, dialect: Dialect) -> TypeEngine[JSON | JSO...
class Base (line 48) | class Base(DeclarativeBase):
class StorageMetadataDb (line 52) | class StorageMetadataDb:
class DatasetMetadataDb (line 74) | class DatasetMetadataDb(StorageMetadataDb, Base):
class RequestQueueMetadataDb (line 94) | class RequestQueueMetadataDb(StorageMetadataDb, Base):
class KeyValueStoreMetadataDb (line 127) | class KeyValueStoreMetadataDb(StorageMetadataDb, Base):
class KeyValueStoreRecordDb (line 144) | class KeyValueStoreRecordDb(Base):
class DatasetItemDb (line 177) | class DatasetItemDb(Base):
class RequestDb (line 202) | class RequestDb(Base):
class RequestQueueStateDb (line 251) | class RequestQueueStateDb(Base):
class VersionDb (line 271) | class VersionDb(Base):
class MetadataBufferDb (line 279) | class MetadataBufferDb:
class KeyValueStoreMetadataBufferDb (line 293) | class KeyValueStoreMetadataBufferDb(MetadataBufferDb, Base):
class DatasetMetadataBufferDb (line 306) | class DatasetMetadataBufferDb(MetadataBufferDb, Base):
class RequestQueueMetadataBufferDb (line 323) | class RequestQueueMetadataBufferDb(MetadataBufferDb, Base):
FILE: src/crawlee/storage_clients/_sql/_key_value_store_client.py
class SqlKeyValueStoreClient (line 34) | class SqlKeyValueStoreClient(KeyValueStoreClient, SqlClientMixin):
method __init__ (line 72) | def __init__(
method open (line 85) | async def open(
method get_metadata (line 121) | async def get_metadata(self) -> KeyValueStoreMetadata:
method drop (line 126) | async def drop(self) -> None:
method purge (line 134) | async def purge(self) -> None:
method set_value (line 143) | async def set_value(self, *, key: str, value: Any, content_type: str |...
method get_value (line 184) | async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
method delete_value (line 230) | async def delete_value(self, *, key: str) -> None:
method iterate_keys (line 244) | async def iterate_keys(
method record_exists (line 278) | async def record_exists(self, *, key: str) -> bool:
method get_public_url (line 291) | async def get_public_url(self, *, key: str) -> str:
method _specific_update_metadata (line 295) | def _specific_update_metadata(self, **_kwargs: dict[str, Any]) -> dict...
method _prepare_buffer_data (line 299) | def _prepare_buffer_data(self, **_kwargs: Any) -> dict[str, Any]:
method _apply_buffer_updates (line 308) | async def _apply_buffer_updates(self, session: AsyncSession, max_buffe...
FILE: src/crawlee/storage_clients/_sql/_request_queue_client.py
class _QueueMetadataUpdateParams (line 41) | class _QueueMetadataUpdateParams(MetadataUpdateParams):
class SqlRequestQueueClient (line 54) | class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
method __init__ (line 103) | def __init__(
method open (line 125) | async def open(
method get_metadata (line 166) | async def get_metadata(self) -> RequestQueueMetadata:
method drop (line 173) | async def drop(self) -> None:
method purge (line 183) | async def purge(self) -> None:
method add_batch_of_requests (line 204) | async def add_batch_of_requests(
method get_request (line 395) | async def get_request(self, unique_key: str) -> Request | None:
method fetch_next_request (line 414) | async def fetch_next_request(self) -> Request | None:
method mark_request_as_handled (line 496) | async def mark_request_as_handled(self, request: Request) -> Processed...
method reclaim_request (line 527) | async def reclaim_request(
method is_empty (line 584) | async def is_empty(self) -> bool:
method _get_state (line 627) | async def _get_state(self, session: AsyncSession) -> RequestQueueStateDb:
method _specific_update_metadata (line 644) | def _specific_update_metadata(
method _get_int_id_from_unique_key (line 729) | def _get_int_id_from_unique_key(unique_key: str) -> int:
method _prepare_buffer_data (line 743) | def _prepare_buffer_data(
method _apply_buffer_updates (line 779) | async def _apply_buffer_updates(self, session: AsyncSession, max_buffe...
FILE: src/crawlee/storage_clients/_sql/_storage_client.py
class SqlStorageClient (line 32) | class SqlStorageClient(StorageClient):
method __init__ (line 54) | def __init__(
method __aenter__ (line 86) | async def __aenter__(self) -> SqlStorageClient:
method __aexit__ (line 90) | async def __aexit__(
method engine (line 100) | def engine(self) -> AsyncEngine:
method get_dialect_name (line 106) | def get_dialect_name(self) -> str | None:
method initialize (line 110) | async def initialize(self, configuration: Configuration) -> None:
method close (line 161) | async def close(self) -> None:
method create_session (line 167) | def create_session(self) -> AsyncSession:
method create_dataset_client (line 178) | async def create_dataset_client(
method create_kvs_client (line 200) | async def create_kvs_client(
method create_rq_client (line 222) | async def create_rq_client(
method _get_or_create_engine (line 243) | def _get_or_create_engine(self, configuration: Configuration) -> Async...
FILE: src/crawlee/storage_clients/models.py
class StorageMetadata (line 17) | class StorageMetadata(BaseModel):
class DatasetMetadata (line 42) | class DatasetMetadata(StorageMetadata):
class KeyValueStoreMetadata (line 52) | class KeyValueStoreMetadata(StorageMetadata):
class RequestQueueMetadata (line 59) | class RequestQueueMetadata(StorageMetadata):
class KeyValueStoreRecordMetadata (line 78) | class KeyValueStoreRecordMetadata(BaseModel):
class KeyValueStoreRecord (line 100) | class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueT...
class DatasetItemsListPage (line 110) | class DatasetItemsListPage(BaseModel):
class ProcessedRequest (line 140) | class ProcessedRequest(BaseModel):
class UnprocessedRequest (line 154) | class UnprocessedRequest(BaseModel):
class AddRequestsResponse (line 165) | class AddRequestsResponse(BaseModel):
FILE: src/crawlee/storages/_base.py
class Storage (line 15) | class Storage(ABC):
method id (line 20) | def id(self) -> str:
method name (line 25) | def name(self) -> str | None:
method get_metadata (line 29) | async def get_metadata(self) -> DatasetMetadata | KeyValueStoreMetadat...
method open (line 34) | async def open(
method drop (line 57) | async def drop(self) -> None:
method purge (line 61) | async def purge(self) -> None:
FILE: src/crawlee/storages/_dataset.py
class Dataset (line 33) | class Dataset(Storage):
method __init__ (line 69) | def __init__(self, client: DatasetClient, id: str, name: str | None) -...
method id (line 87) | def id(self) -> str:
method name (line 92) | def name(self) -> str | None:
method get_metadata (line 96) | async def get_metadata(self) -> DatasetMetadata:
method open (line 101) | async def open(
method drop (line 128) | async def drop(self) -> None:
method purge (line 134) | async def purge(self) -> None:
method push_data (line 137) | async def push_data(self, data: list[dict[str, Any]] | dict[str, Any])...
method get_data (line 150) | async def get_data(
method iterate_items (line 201) | async def iterate_items(
method list_items (line 249) | async def list_items(
method export_to (line 300) | async def export_to(
method export_to (line 312) | async def export_to(
method export_to (line 323) | async def export_to(
FILE: src/crawlee/storages/_key_value_store.py
class AutosavedValue (line 35) | class AutosavedValue(RootModel):
class KeyValueStore (line 40) | class KeyValueStore(Storage):
method __init__ (line 78) | def __init__(self, client: KeyValueStoreClient, id: str, name: str | N...
method id (line 99) | def id(self) -> str:
method name (line 104) | def name(self) -> str | None:
method get_metadata (line 108) | async def get_metadata(self) -> KeyValueStoreMetadata:
method open (line 113) | async def open(
method drop (line 140) | async def drop(self) -> None:
method purge (line 148) | async def purge(self) -> None:
method get_value (line 152) | async def get_value(self, key: str) -> Any: ...
method get_value (line 155) | async def get_value(self, key: str, default_value: T) -> T: ...
method get_value (line 158) | async def get_value(self, key: str, default_value: T | None = None) ->...
method get_value (line 160) | async def get_value(self, key: str, default_value: T | None = None) ->...
method set_value (line 173) | async def set_value(
method delete_value (line 188) | async def delete_value(self, key: str) -> None:
method iterate_keys (line 196) | async def iterate_keys(
method list_keys (line 216) | async def list_keys(
method record_exists (line 240) | async def record_exists(self, key: str) -> bool:
method get_public_url (line 251) | async def get_public_url(self, key: str) -> str:
method get_auto_saved_value (line 262) | async def get_auto_saved_value(
method persist_autosaved_values (line 299) | async def persist_autosaved_values(self) -> None:
method _clear_cache (line 306) | async def _clear_cache(self) -> None:
FILE: src/crawlee/storages/_request_queue.py
class RequestQueue (line 33) | class RequestQueue(Storage, RequestManager):
method __init__ (line 74) | def __init__(self, client: RequestQueueClient, id: str, name: str | No...
method id (line 95) | def id(self) -> str:
method name (line 100) | def name(self) -> str | None:
method get_metadata (line 104) | async def get_metadata(self) -> RequestQueueMetadata:
method get_handled_count (line 108) | async def get_handled_count(self) -> int:
method get_total_count (line 113) | async def get_total_count(self) -> int:
method open (line 119) | async def open(
method drop (line 144) | async def drop(self) -> None:
method purge (line 152) | async def purge(self) -> None:
method add_request (line 156) | async def add_request(
method add_requests (line 180) | async def add_requests(
method fetch_next_request (line 230) | async def fetch_next_request(self) -> Request | None:
method get_request (line 247) | async def get_request(self, unique_key: str) -> Request | None:
method mark_request_as_handled (line 258) | async def mark_request_as_handled(self, request: Request) -> Processed...
method reclaim_request (line 273) | async def reclaim_request(
method is_empty (line 295) | async def is_empty(self) -> bool:
method is_finished (line 307) | async def is_finished(self) -> bool:
method _process_batch (line 327) | async def _process_batch(
FILE: src/crawlee/storages/_storage_instance_manager.py
class _StorageCache (line 22) | class _StorageCache:
method remove_from_cache (line 40) | def remove_from_cache(self, storage_instance: Storage) -> None:
class StorageInstanceManager (line 69) | class StorageInstanceManager:
method __init__ (line 79) | def __init__(self) -> None:
method open_storage_instance (line 83) | async def open_storage_instance(
method remove_from_cache (line 195) | def remove_from_cache(self, storage_instance: Storage) -> None:
method clear_cache (line 203) | def clear_cache(self) -> None:
method _get_from_cache (line 207) | def _get_from_cache(
method _check_name_alias_conflict (line 234) | def _check_name_alias_conflict(
FILE: src/crawlee/storages/_utils.py
function validate_storage_name (line 6) | def validate_storage_name(name: str | None) -> None:
FILE: tests/e2e/conftest.py
function pytest_configure (line 11) | def pytest_configure(config: Config) -> None:
function crawlee_wheel_path (line 31) | def crawlee_wheel_path(tmp_path_factory: pytest.TempPathFactory, testrun...
FILE: tests/e2e/project_template/test_static_crawlers_templates.py
function test_static_crawler_actor_at_apify (line 47) | async def test_static_crawler_actor_at_apify(
FILE: tests/e2e/project_template/utils.py
function patch_crawlee_version_in_project (line 8) | def patch_crawlee_version_in_project(
function _patch_crawlee_version_in_requirements_txt_based_project (line 21) | def _patch_crawlee_version_in_requirements_txt_based_project(project_pat...
function _patch_crawlee_version_in_pyproject_toml_based_project (line 58) | def _patch_crawlee_version_in_pyproject_toml_based_project(project_path:...
FILE: tests/unit/_autoscaling/test_autoscaled_pool.py
function system_status (line 24) | def system_status() -> SystemStatus | Mock:
function future (line 31) | def future(value: T, /) -> Awaitable[T]:
function test_runs_concurrently (line 38) | async def test_runs_concurrently(system_status: SystemStatus | Mock) -> ...
function test_abort_works (line 66) | async def test_abort_works(system_status: SystemStatus | Mock) -> None:
function test_propagates_exceptions (line 93) | async def test_propagates_exceptions(system_status: SystemStatus | Mock)...
function test_propagates_exceptions_after_finished (line 121) | async def test_propagates_exceptions_after_finished(system_status: Syste...
function test_autoscales (line 152) | async def test_autoscales(
function test_autoscales_uses_desired_concurrency_ratio (line 223) | async def test_autoscales_uses_desired_concurrency_ratio(
function test_max_tasks_per_minute_works (line 283) | async def test_max_tasks_per_minute_works(system_status: SystemStatus | ...
function test_allows_multiple_run_calls (line 314) | async def test_allows_multiple_run_calls(system_status: SystemStatus | M...
FILE: tests/unit/_autoscaling/test_snapshotter.py
function event_manager (line 35) | async def event_manager() -> AsyncGenerator[LocalEventManager, None]:
function snapshotter (line 56) | async def snapshotter(event_manager: LocalEventManager) -> AsyncGenerato...
function default_cpu_info (line 64) | def default_cpu_info() -> CpuInfo:
function default_memory_info (line 69) | def default_memory_info() -> MemoryInfo:
function event_system_data_info (line 78) | def event_system_data_info(default_cpu_info: CpuInfo, default_memory_inf...
function test_start_stop_lifecycle (line 85) | async def test_start_stop_lifecycle() -> None:
function test_snapshot_cpu (line 92) | async def test_snapshot_cpu(
function test_snapshot_memory (line 102) | async def test_snapshot_memory(
function test_snapshot_memory_with_memory_info_sets_system_wide_fields (line 112) | async def test_snapshot_memory_with_memory_info_sets_system_wide_fields(
function test_snapshot_event_loop (line 139) | def test_snapshot_event_loop(snapshotter: Snapshotter) -> None:
function test_snapshot_client (line 145) | def test_snapshot_client(snapshotter: Snapshotter) -> None:
function test_snapshot_client_overloaded (line 151) | def test_snapshot_client_overloaded() -> None:
function test_get_cpu_sample (line 159) | async def test_get_cpu_sample(
function test_methods_raise_error_when_not_active (line 196) | async def test_methods_raise_error_when_not_active() -> None:
function test_snapshot_pruning_removes_outdated_records (line 225) | async def test_snapshot_pruning_removes_outdated_records(
function test_memory_load_evaluation_logs_warning_on_high_usage (line 260) | async def test_memory_load_evaluation_logs_warning_on_high_usage(
function test_memory_load_evaluation_silent_on_acceptable_usage (line 300) | async def test_memory_load_evaluation_silent_on_acceptable_usage(
function test_snapshots_time_ordered (line 329) | async def test_snapshots_time_ordered(snapshotter: Snapshotter, event_ma...
function test_sorted_snapshot_list_add_maintains_order (line 359) | def test_sorted_snapshot_list_add_maintains_order() -> None:
function test_dynamic_memory (line 391) | async def test_dynamic_memory(
FILE: tests/unit/_autoscaling/test_system_status.py
function snapshotter (line 25) | async def snapshotter() -> AsyncGenerator[Snapshotter, None]:
function now (line 32) | def now() -> datetime:
function test_start_stop_lifecycle (line 36) | async def test_start_stop_lifecycle() -> None:
function test_cpu_is_overloaded (line 45) | def test_cpu_is_overloaded(snapshotter: Snapshotter, now: datetime) -> N...
function test_cpu_is_not_overloaded (line 61) | def test_cpu_is_not_overloaded(snapshotter: Snapshotter, now: datetime) ...
function test_get_system_info (line 77) | def test_get_system_info(snapshotter: Snapshotter, now: datetime) -> None:
function test_client_overloaded (line 195) | def test_client_overloaded(
function test_memory_overloaded_system_wide (line 217) | def test_memory_overloaded_system_wide(snapshotter: Snapshotter, now: da...
FILE: tests/unit/_statistics/test_error_tracker.py
function test_error_tracker_counts (line 20) | async def test_error_tracker_counts(error_tracker: ErrorTracker, expecte...
function test_error_tracker_similar_messages_full_stack (line 54) | async def test_error_tracker_similar_messages_full_stack(
function test_show_full_message (line 90) | async def test_show_full_message(*, show_full_message: bool, expected_me...
function test_error_tracker_with_errors_chain (line 104) | async def test_error_tracker_with_errors_chain() -> None:
FILE: tests/unit/_statistics/test_periodic_logging.py
function test_periodic_logging (line 14) | async def test_periodic_logging(caplog: pytest.LogCaptureFixture) -> None:
FILE: tests/unit/_statistics/test_persistence.py
function test_basic_persistence (line 6) | async def test_basic_persistence() -> None:
FILE: tests/unit/_statistics/test_request_max_duration.py
function test_request_max_duration_tracks_maximum (line 8) | async def test_request_max_duration_tracks_maximum() -> None:
FILE: tests/unit/_statistics/test_request_processing_record.py
function test_tracking_time_resolution (line 6) | def test_tracking_time_resolution() -> None:
FILE: tests/unit/_utils/test_byte_size.py
function test_initializations (line 8) | def test_initializations() -> None:
function test_conversions (line 19) | def test_conversions() -> None:
function test_string_representation (line 27) | def test_string_representation() -> None:
function test_comparisons (line 35) | def test_comparisons() -> None:
function test_additions (line 47) | def test_additions() -> None:
function test_subtractions (line 62) | def test_subtractions() -> None:
function test_multiplication (line 81) | def test_multiplication() -> None:
function test_divisions (line 98) | def test_divisions() -> None:
FILE: tests/unit/_utils/test_console.py
function test_empty_input (line 6) | def test_empty_input() -> None:
function test_empty_row (line 10) | def test_empty_row() -> None:
function test_single_column (line 14) | def test_single_column() -> None:
function test_two_columns (line 21) | def test_two_columns() -> None:
function test_long_content_truncation (line 34) | def test_long_content_truncation() -> None:
FILE: tests/unit/_utils/test_crypto.py
function test_crypto_random_object_id_default_length (line 6) | def test_crypto_random_object_id_default_length() -> None:
function test_crypto_random_object_id_custom_length (line 11) | def test_crypto_random_object_id_custom_length() -> None:
function test_crypto_random_object_id_character_set (line 17) | def test_crypto_random_object_id_character_set() -> None:
function test_compute_short_hash_with_known_input (line 24) | def test_compute_short_hash_with_known_input() -> None:
function test_compute_short_hash_with_empty_input (line 30) | def test_compute_short_hash_with_empty_input() -> None:
function test_compute_short_hash_output_length (line 36) | def test_compute_short_hash_output_length() -> None:
function test_compute_short_hash_differentiates_input (line 41) | def test_compute_short_hash_differentiates_input() -> None:
FILE: tests/unit/_utils/test_file.py
function test_json_dumps (line 8) | async def test_json_dumps() -> None:
FILE: tests/unit/_utils/test_globs.py
function test_asterisk (line 6) | def test_asterisk() -> None:
function test_double_asteritsk (line 13) | def test_double_asteritsk() -> None:
FILE: tests/unit/_utils/test_html_to_text.py
function test_html_to_text (line 187) | def test_html_to_text(source: str, expected_text: str, html_to_text: Cal...
function test_html_to_text_raises_on_wrong_input_type (line 192) | def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[...
function test_html_to_text_parsel (line 198) | def test_html_to_text_parsel() -> None:
function test_html_to_text_beautifulsoup (line 202) | def test_html_to_text_beautifulsoup() -> None:
FILE: tests/unit/_utils/test_measure_time.py
function test_measure_time_wall_sync (line 9) | def test_measure_time_wall_sync() -> None:
function test_measure_time_cpu_sync (line 18) | def test_measure_time_cpu_sync() -> None:
function test_measure_time_wall_async (line 33) | async def test_measure_time_wall_async() -> None:
FILE: tests/unit/_utils/test_raise_if_too_many_kwargs.py
function test_limit_kwargs_default (line 22) | def test_limit_kwargs_default(kwargs: dict[str, Any], *, should_raise: b...
function test_limit_kwargs (line 35) | def test_limit_kwargs(kwargs: dict[str, Any], *, should_raise: bool) -> ...
FILE: tests/unit/_utils/test_recurring_task.py
function function (line 13) | def function() -> AsyncMock:
function delay (line 20) | def delay() -> timedelta:
function test_init (line 24) | async def test_init(function: AsyncMock, delay: timedelta) -> None:
function test_start_and_stop (line 31) | async def test_start_and_stop(function: AsyncMock, delay: timedelta) -> ...
function test_execution (line 45) | async def test_execution(function: AsyncMock, delay: timedelta) -> None:
FILE: tests/unit/_utils/test_requests.py
function test_normalize_url (line 36) | def test_normalize_url(url: str, expected_output: str, *, keep_url_fragm...
function test_compute_unique_key_basic (line 41) | def test_compute_unique_key_basic() -> None:
function test_compute_unique_key_handles_fragments (line 48) | def test_compute_unique_key_handles_fragments() -> None:
function test_compute_unique_key_handles_payload (line 57) | def test_compute_unique_key_handles_payload() -> None:
function test_compute_unique_key_handles_headers (line 74) | def test_compute_unique_key_handles_headers() -> None:
function test_compute_unique_key_complex (line 91) | def test_compute_unique_key_complex() -> None:
function test_compute_unique_key_post_with_none_payload (line 117) | def test_compute_unique_key_post_with_none_payload() -> None:
function test_compute_unique_key_with_whitespace_in_headers (line 124) | def test_compute_unique_key_with_whitespace_in_headers() -> None:
FILE: tests/unit/_utils/test_robots.py
function test_generation_robots_txt_url (line 13) | async def test_generation_robots_txt_url(server_url: URL, http_client: H...
function test_allow_disallow_robots_txt (line 18) | async def test_allow_disallow_robots_txt(server_url: URL, http_client: H...
function test_extract_sitemaps_urls (line 26) | async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpC...
function test_parse_from_content (line 32) | async def test_parse_from_content() -> None:
function test_bind_robots_txt_url (line 45) | async def test_bind_robots_txt_url() -> None:
FILE: tests/unit/_utils/test_shared_timeout.py
function test_shared_timeout_tracks_elapsed_time (line 9) | async def test_shared_timeout_tracks_elapsed_time() -> None:
function test_shared_timeout_expires (line 23) | async def test_shared_timeout_expires() -> None:
function test_shared_timeout_cannot_be_nested (line 35) | async def test_shared_timeout_cannot_be_nested() -> None:
function test_shared_timeout_multiple_sequential_uses (line 45) | async def test_shared_timeout_multiple_sequential_uses() -> None:
FILE: tests/unit/_utils/test_sitemap.py
function _make_mock_client (line 51) | def _make_mock_client(url_map: dict[str, tuple[int, bytes]]) -> AsyncMock:
function compress_gzip (line 68) | def compress_gzip(data: str) -> bytes:
function encode_base64 (line 73) | def encode_base64(data: bytes) -> str:
function test_sitemap (line 78) | async def test_sitemap(server_url: URL, http_client: HttpClient) -> None:
function test_extract_metadata_sitemap (line 89) | async def test_extract_metadata_sitemap(server_url: URL, http_client: Ht...
function test_gzipped_sitemap (line 106) | async def test_gzipped_sitemap(server_url: URL, http_client: HttpClient)...
function test_gzipped_sitemap_with_invalid_data (line 115) | async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_c...
function test_gz_sitemap_with_non_gzipped (line 126) | async def test_gz_sitemap_with_non_gzipped(server_url: URL, http_client:...
function test_gzipped_sitemap_with_bad_type (line 137) | async def test_gzipped_sitemap_with_bad_type(server_url: URL, http_clien...
function test_xml_sitemap_with_gzipped_data (line 149) | async def test_xml_sitemap_with_gzipped_data(server_url: URL, http_clien...
function test_parent_sitemap (line 159) | async def test_parent_sitemap(server_url: URL, http_client: HttpClient) ...
function test_non_sitemap_url (line 186) | async def test_non_sitemap_url(server_url: URL, http_client: HttpClient)...
function test_cdata_sitemap (line 194) | async def test_cdata_sitemap(server_url: URL, http_client: HttpClient) -...
function test_txt_sitemap (line 213) | async def test_txt_sitemap(server_url: URL, http_client: HttpClient) -> ...
function test_sitemap_pretty (line 231) | async def test_sitemap_pretty(server_url: URL, http_client: HttpClient) ...
function test_sitemap_from_string (line 262) | async def test_sitemap_from_string() -> None:
function test_discover_sitemap_from_robots_txt (line 270) | async def test_discover_sitemap_from_robots_txt() -> None:
function test_discover_sitemap_from_common_paths (line 280) | async def test_discover_sitemap_from_common_paths() -> None:
function test_discover_sitemap_from_input_url (line 295) | async def test_discover_sitemap_from_input_url() -> None:
function test_discover_sitemap_deduplication (line 304) | async def test_discover_sitemap_deduplication() -> None:
function test_discover_sitemaps_multiple_domains (line 319) | async def test_discover_sitemaps_multiple_domains() -> None:
function test_discover_sitemap_url_without_host_skipped (line 342) | async def test_discover_sitemap_url_without_host_skipped() -> None:
FILE: tests/unit/_utils/test_system.py
function test_get_memory_info_returns_valid_values (line 17) | def test_get_memory_info_returns_valid_values() -> None:
function test_get_cpu_info_returns_valid_values (line 24) | def test_get_cpu_info_returns_valid_values() -> None:
function test_memory_estimation_does_not_overestimate_due_to_shared_memory (line 30) | def test_memory_estimation_does_not_overestimate_due_to_shared_memory() ...
FILE: tests/unit/_utils/test_timedelta_ms.py
class _ModelWithTimedeltaMs (line 12) | class _ModelWithTimedeltaMs(BaseModel):
function test_model_with_timedelta_ms_input_types (line 30) | def test_model_with_timedelta_ms_input_types(
FILE: tests/unit/_utils/test_urls.py
function test_is_url_absolute (line 9) | def test_is_url_absolute() -> None:
function test_convert_to_absolute_url (line 19) | def test_convert_to_absolute_url() -> None:
function test_validate_http_url (line 36) | def test_validate_http_url() -> None:
FILE: tests/unit/browsers/test_browser_pool.py
function test_default_plugin_new_page_creation (line 23) | async def test_default_plugin_new_page_creation(server_url: URL) -> None:
function test_multiple_plugins_new_page_creation (line 43) | async def test_multiple_plugins_new_page_creation(server_url: URL) -> None:
function test_new_page_with_each_plugin (line 79) | async def test_new_page_with_each_plugin(server_url: URL) -> None:
function test_with_default_plugin_constructor (line 106) | async def test_with_default_plugin_constructor(server_url: URL) -> None:
function test_new_page_with_existing_id (line 125) | async def test_new_page_with_existing_id() -> None:
function test_new_page_with_invalid_plugin (line 132) | async def test_new_page_with_invalid_plugin() -> None:
function test_resource_management (line 140) | async def test_resource_management(server_url: URL) -> None:
function test_methods_raise_error_when_not_active (line 154) | async def test_methods_raise_error_when_not_active() -> None:
function test_with_plugin_contains_page_options (line 174) | async def test_with_plugin_contains_page_options(server_url: URL) -> None:
function test_browser_pool_retire_browser_after_page_count (line 190) | async def test_browser_pool_retire_browser_after_page_count(
function test_pre_page_create_hook_is_called (line 209) | async def test_pre_page_create_hook_is_called() -> None:
function test_post_page_create_hook_is_called (line 243) | async def test_post_page_create_hook_is_called() -> None:
function test_pre_page_close_hook (line 272) | async def test_pre_page_close_hook() -> None:
function test_post_page_close_hook (line 291) | async def test_post_page_close_hook() -> None:
function test_page_hooks_execution_order (line 312) | async def test_page_hooks_execution_order() -> None:
function test_multiple_hooks_all_called (line 344) | async def test_multiple_hooks_all_called() -> None:
FILE: tests/unit/browsers/test_playwright_browser.py
function playwright (line 18) | async def playwright() -> AsyncGenerator[Playwright, None]:
function test_init (line 23) | async def test_init(playwright: Playwright) -> None:
function test_delete_temp_folder_with_close_browser (line 35) | async def test_delete_temp_folder_with_close_browser(playwright: Playwri...
FILE: tests/unit/browsers/test_playwright_browser_controller.py
function playwright (line 21) | async def playwright() -> AsyncGenerator[Playwright, None]:
function browser (line 27) | async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
function controller (line 34) | async def controller(browser: Browser) -> AsyncGenerator[PlaywrightBrows...
function test_initial_state (line 40) | async def test_initial_state(browser: Browser) -> None:
function test_open_and_close_page (line 52) | async def test_open_and_close_page(controller: PlaywrightBrowserControll...
function test_max_open_pages_limit (line 66) | async def test_max_open_pages_limit(controller: PlaywrightBrowserControl...
function test_idle_time (line 91) | async def test_idle_time(controller: PlaywrightBrowserController) -> None:
function test_close_browser_with_open_pages (line 98) | async def test_close_browser_with_open_pages(browser: Browser) -> None:
function test_memory_leak_on_concurrent_context_creation (line 114) | async def test_memory_leak_on_concurrent_context_creation() -> None:
function test_max_open_pages_limit_on_concurrent_creation (line 143) | async def test_max_open_pages_limit_on_concurrent_creation(controller: P...
function test_max_open_pages_limit_error_on_concurrent_creation (line 152) | async def test_max_open_pages_limit_error_on_concurrent_creation(control...
function test_browser_with_pre_existing_context (line 158) | async def test_browser_with_pre_existing_context(tmp_path: Path) -> None:
FILE: tests/unit/browsers/test_playwright_browser_plugin.py
function plugin (line 16) | async def plugin() -> AsyncGenerator[PlaywrightBrowserPlugin, None]:
function test_initial_state (line 21) | async def test_initial_state() -> None:
function test_new_browser (line 37) | async def test_new_browser(plugin: PlaywrightBrowserPlugin, server_url: ...
function test_multiple_new_browsers (line 51) | async def test_multiple_new_browsers(plugin: PlaywrightBrowserPlugin) ->...
function test_methods_raise_error_when_not_active (line 58) | async def test_methods_raise_error_when_not_active() -> None:
function raise_error_if_chrome_and_executable_path (line 74) | async def raise_error_if_chrome_and_executable_path() -> None:
FILE: tests/unit/conftest.py
function suppress_user_warning (line 33) | async def suppress_user_warning() -> AsyncGenerator[None, None]:
function prepare_test_env (line 44) | def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) ->...
function _isolate_test_environment (line 84) | def _isolate_test_environment(prepare_test_env: Callable[[], None]) -> N...
function _set_crawler_log_level (line 97) | def _set_crawler_log_level(pytestconfig: pytest.Config, monkeypatch: pyt...
function proxy_info (line 106) | async def proxy_info(unused_tcp_port: int) -> ProxyInfo:
function proxy (line 121) | async def proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyInfo, None]:
function disabled_proxy (line 136) | async def disabled_proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyI...
function header_network (line 152) | def header_network() -> dict:
function key_value_store (line 157) | async def key_value_store() -> AsyncGenerator[KeyValueStore, None]:
function http_server (line 164) | def http_server(unused_tcp_port_factory: Callable[[], int]) -> Iterator[...
function server_url (line 172) | def server_url(http_server: TestServer) -> URL:
function redirect_http_server (line 179) | def redirect_http_server(unused_tcp_port_factory: Callable[[], int]) -> ...
function redirect_server_url (line 197) | def redirect_server_url(redirect_http_server: TestServer) -> URL:
function http_client (line 209) | async def http_client(request: pytest.FixtureRequest) -> AsyncGenerator[...
function redis_client (line 226) | def redis_client() -> FakeAsyncRedis:
FILE: tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
function test_urls (line 65) | def test_urls(server_url: URL) -> list[str]:
function key_value_store (line 74) | async def key_value_store() -> AsyncGenerator[KeyValueStore, None]:
class _SimpleRenderingTypePredictor (line 80) | class _SimpleRenderingTypePredictor(RenderingTypePredictor):
method __init__ (line 83) | def __init__(
method predict (line 94) | def predict(self, request: Request) -> RenderingTypePrediction:
method store_result (line 98) | def store_result(self, request: Request, rendering_type: RenderingType...
class TestInput (line 103) | class TestInput:
function test_adaptive_crawling (line 154) | async def test_adaptive_crawling(
function test_adaptive_crawling_parsel (line 208) | async def test_adaptive_crawling_parsel(test_urls: list[str]) -> None:
function test_adaptive_crawling_pre_nav_change_to_context (line 240) | async def test_adaptive_crawling_pre_nav_change_to_context(test_urls: li...
function test_playwright_only_pre_navigation_hook (line 271) | async def test_playwright_only_pre_navigation_hook(test_urls: list[str])...
function test_adaptive_crawling_post_nav_change_to_context (line 304) | async def test_adaptive_crawling_post_nav_change_to_context(test_urls: l...
function test_playwright_only_post_navigation_hook (line 335) | async def test_playwright_only_post_navigation_hook(test_urls: list[str]...
function test_adaptive_crawling_result (line 368) | async def test_adaptive_crawling_result(test_urls: list[str]) -> None:
function test_adaptive_crawling_predictor_calls (line 399) | async def test_adaptive_crawling_predictor_calls(
function test_adaptive_crawling_result_use_state_isolation (line 438) | async def test_adaptive_crawling_result_use_state_isolation(
function test_adaptive_crawling_statistics (line 468) | async def test_adaptive_crawling_statistics(test_urls: list[str]) -> None:
function test_adaptive_crawler_exceptions_in_sub_crawlers (line 502) | async def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_p...
function test_adaptive_playwright_crawler_statistics_in_init (line 543) | async def test_adaptive_playwright_crawler_statistics_in_init() -> None:
function test_adaptive_playwright_crawler_timeout_in_sub_crawler (line 572) | async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_u...
function test_adaptive_playwright_crawler_default_predictor (line 611) | async def test_adaptive_playwright_crawler_default_predictor(test_urls: ...
function test_adaptive_context_query_selector_beautiful_soup (line 634) | async def test_adaptive_context_query_selector_beautiful_soup(test_urls:...
function test_adaptive_context_query_selector_parsel (line 680) | async def test_adaptive_context_query_selector_parsel(test_urls: list[st...
function test_adaptive_context_parse_with_static_parser_parsel (line 718) | async def test_adaptive_context_parse_with_static_parser_parsel(test_url...
function test_adaptive_context_helpers_on_changed_selector (line 755) | async def test_adaptive_context_helpers_on_changed_selector(test_urls: l...
function test_adaptive_context_query_non_existing_element (line 786) | async def test_adaptive_context_query_non_existing_element(test_urls: li...
function test_change_context_state_after_handling (line 841) | async def test_change_context_state_after_handling(test_input: TestInput...
function test_adaptive_playwright_crawler_with_sql_storage (line 888) | async def test_adaptive_playwright_crawler_with_sql_storage(test_urls: l...
FILE: tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py
function test_predictor_state_persistence (line 7) | async def test_predictor_state_persistence() -> None:
FILE: tests/unit/crawlers/_adaptive_playwright/test_predictor.py
function test_predictor_same_label (line 27) | async def test_predictor_same_label(url: str, expected_prediction: Rende...
function test_predictor_new_label_increased_detection_probability_recommendation (line 45) | async def test_predictor_new_label_increased_detection_probability_recom...
function test_unreliable_prediction (line 89) | async def test_unreliable_prediction() -> None:
function test_no_learning_data_prediction (line 114) | async def test_no_learning_data_prediction() -> None:
function test_persistent_no_learning_data_prediction (line 126) | async def test_persistent_no_learning_data_prediction() -> None:
function test_persistent_prediction (line 140) | async def test_persistent_prediction() -> None:
function test_persistent_prediction_recovery (line 164) | async def test_persistent_prediction_recovery(*, persistence_enabled: bo...
function test_url_similarity (line 213) | def test_url_similarity(url_1: str, url_2: str, expected_rounded_similar...
FILE: tests/unit/crawlers/_basic/test_basic_crawler.py
function test_processes_requests_from_explicit_queue (line 47) | async def test_processes_requests_from_explicit_queue() -> None:
function test_processes_requests_from_request_source_tandem (line 63) | async def test_processes_requests_from_request_source_tandem() -> None:
function test_processes_requests_from_run_args (line 89) | async def test_processes_requests_from_run_args() -> None:
function test_allows_multiple_run_calls (line 102) | async def test_allows_multiple_run_calls() -> None:
function test_retries_failed_requests (line 123) | async def test_retries_failed_requests() -> None:
function test_respects_no_retry (line 146) | async def test_respects_no_retry() -> None:
function test_respects_request_specific_max_retries (line 174) | async def test_respects_request_specific_max_retries() -> None:
function test_calls_error_handler (line 199) | async def test_calls_error_handler() -> None:
function test_calls_error_handler_for_session_errors (line 233) | async def test_calls_error_handler_for_session_errors() -> None:
function test_handles_error_in_error_handler (line 253) | async def test_handles_error_in_error_handler() -> None:
function test_calls_failed_request_handler (line 269) | async def test_calls_failed_request_handler() -> None:
function test_handlers_use_context_helpers (line 290) | async def test_handlers_use_context_helpers(tmp_path: Path, handler: str...
function test_handles_error_in_failed_request_handler (line 329) | async def test_handles_error_in_failed_request_handler() -> None:
function test_send_request_works (line 352) | async def test_send_request_works(server_url: URL, method: HttpMethod, p...
class AddRequestsTestInput (line 378) | class AddRequestsTestInput:
function test_enqueue_strategy (line 559) | async def test_enqueue_strategy(test_input: AddRequestsTestInput) -> None:
function test_session_rotation (line 583) | async def test_session_rotation(server_url: URL) -> None:
function test_final_statistics (line 608) | async def test_final_statistics() -> None:
function test_crawler_get_storages (line 652) | async def test_crawler_get_storages() -> None:
function test_crawler_run_requests (line 665) | async def test_crawler_run_requests() -> None:
function test_context_push_and_get_data (line 685) | async def test_context_push_and_get_data() -> None:
function test_context_push_and_get_data_handler_error (line 706) | async def test_context_push_and_get_data_handler_error() -> None:
function test_crawler_push_and_export_data (line 722) | async def test_crawler_push_and_export_data(tmp_path: Path) -> None:
function test_crawler_export_data_additional_kwargs (line 746) | async def test_crawler_export_data_additional_kwargs(tmp_path: Path) -> ...
function test_context_push_and_export_data (line 762) | async def test_context_push_and_export_data(tmp_path: Path) -> None:
function test_context_update_kv_store (line 789) | async def test_context_update_kv_store() -> None:
function test_context_use_state (line 803) | async def test_context_use_state() -> None:
function test_crawler_use_state (line 818) | async def test_crawler_use_state() -> None:
function test_context_use_state_crawlers_share_state (line 832) | async def test_context_use_state_crawlers_share_state() -> None:
function test_crawlers_share_stats (line 851) | async def test_crawlers_share_stats() -> None:
function test_context_use_state_crawlers_own_state (line 866) | async def test_context_use_state_crawlers_own_state() -> None:
function test_context_handlers_use_state (line 883) | async def test_context_handlers_use_state(key_value_store: KeyValueStore...
function test_max_requests_per_crawl (line 929) | async def test_max_requests_per_crawl(*, use_failed_requests: bool) -> N...
function test_max_crawl_depth (line 960) | async def test_max_crawl_depth() -> None:
function test_abort_on_error (line 999) | async def test_abort_on_error(
function test_crawler_log (line 1027) | def test_crawler_log() -> None:
function test_consecutive_runs_purge_request_queue (line 1033) | async def test_consecutive_runs_purge_request_queue() -> None:
function test_logs_final_statistics (line 1061) | async def test_logs_final_statistics(
function test_crawler_manual_stop (line 1131) | async def test_crawler_manual_stop() -> None:
function test_crawler_multiple_stops_in_parallel (line 1158) | async def test_crawler_multiple_stops_in_parallel() -> None:
function test_services_no_side_effect_on_crawler_init (line 1191) | async def test_services_no_side_effect_on_crawler_init() -> None:
function test_crawler_uses_default_services (line 1207) | async def test_crawler_uses_default_services() -> None:
function test_services_crawlers_can_use_different_services (line 1224) | async def test_services_crawlers_can_use_different_services() -> None:
function test_crawler_uses_default_storages (line 1246) | async def test_crawler_uses_default_storages(tmp_path: Path) -> None:
function test_crawler_can_use_other_storages (line 1264) | async def test_crawler_can_use_other_storages(tmp_path: Path) -> None:
function test_crawler_can_use_other_storages_of_same_type (line 1282) | async def test_crawler_can_use_other_storages_of_same_type(tmp_path: Pat...
function test_allows_storage_client_overwrite_before_run (line 1323) | async def test_allows_storage_client_overwrite_before_run(monkeypatch: p...
function test_context_use_state_race_condition_in_handlers (line 1349) | async def test_context_use_state_race_condition_in_handlers(key_value_st...
function test_timeout_in_handler (line 1389) | async def test_timeout_in_handler(sleep_type: str) -> None:
function test_keep_alive (line 1447) | async def test_keep_alive(
function test_session_retire_in_user_handler (line 1492) | async def test_session_retire_in_user_handler(*, retire: bool) -> None:
function test_bound_session_to_request (line 1514) | async def test_bound_session_to_request() -> None:
function test_bound_sessions_to_same_request (line 1536) | async def test_bound_sessions_to_same_request() -> None:
function test_error_bound_session_to_request (line 1568) | async def test_error_bound_session_to_request() -> None:
function test_handle_error_bound_session_to_request (line 1580) | async def test_handle_error_bound_session_to_request() -> None:
function test_handles_session_error_in_failed_request_handler (line 1596) | async def test_handles_session_error_in_failed_request_handler() -> None:
function test_lock_with_get_robots_txt_file_for_url (line 1615) | async def test_lock_with_get_robots_txt_file_for_url(server_url: URL) ->...
function test_reduced_logs_from_timed_out_request_handler (line 1627) | async def test_reduced_logs_from_timed_out_request_handler(caplog: pytes...
function test_reduced_logs_from_time_out_in_request_handler (line 1657) | async def test_reduced_logs_from_time_out_in_request_handler(caplog: pyt...
function test_status_message_callback (line 1682) | async def test_status_message_callback() -> None:
function test_status_message_emit (line 1721) | async def test_status_message_emit() -> None:
function test_add_requests_with_rq_param (line 1748) | async def test_add_requests_with_rq_param(queue_name: str | None, queue_...
function test_add_requests_error_with_multi_params (line 1790) | async def test_add_requests_error_with_multi_params(
function test_crawler_purge_request_queue_uses_same_storage_client (line 1808) | async def test_crawler_purge_request_queue_uses_same_storage_client() ->...
function _run_crawler (line 1831) | async def _run_crawler(crawler_id: int | None, requests: list[str], stor...
class _CrawlerInput (line 1860) | class _CrawlerInput:
function _process_run_crawlers (line 1865) | def _process_run_crawlers(crawler_inputs: list[_CrawlerInput], storage_d...
function test_crawler_state_persistence (line 1872) | async def test_crawler_state_persistence(tmp_path: Path) -> None:
function test_crawler_state_persistence_2_crawlers_with_migration (line 1921) | async def test_crawler_state_persistence_2_crawlers_with_migration(tmp_p...
function test_crawler_intermediate_statistics (line 1967) | async def test_crawler_intermediate_statistics() -> None:
function test_protect_request_in_run_handlers (line 1993) | async def test_protect_request_in_run_handlers() -> None:
function test_new_request_error_handler (line 2016) | async def test_new_request_error_handler() -> None:
FILE: tests/unit/crawlers/_basic/test_context_pipeline.py
class EnhancedCrawlingContext (line 21) | class EnhancedCrawlingContext(BasicCrawlingContext):
class MoreEnhancedCrawlingContext (line 26) | class MoreEnhancedCrawlingContext(EnhancedCrawlingContext):
function test_calls_consumer_without_middleware (line 30) | async def test_calls_consumer_without_middleware() -> None:
function test_calls_consumers_and_middlewares (line 51) | async def test_calls_consumers_and_middlewares() -> None:
function test_wraps_consumer_errors (line 115) | async def test_wraps_consumer_errors() -> None:
function test_handles_exceptions_in_middleware_initialization (line 135) | async def test_handles_exceptions_in_middleware_initialization() -> None:
function test_handles_exceptions_in_middleware_finalization (line 167) | async def test_handles_exceptions_in_middleware_finalization() -> None:
FILE: tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
function test_basic (line 21) | async def test_basic(server_url: URL, http_client: HttpClient) -> None:
function test_enqueue_links (line 38) | async def test_enqueue_links(redirect_server_url: URL, server_url: URL, ...
function test_enqueue_non_href_links (line 67) | async def test_enqueue_non_href_links(redirect_server_url: URL, server_u...
function test_enqueue_links_selector (line 90) | async def test_enqueue_links_selector(server_url: URL, http_client: Http...
function test_enqueue_links_with_max_crawl (line 108) | async def test_enqueue_links_with_max_crawl(server_url: URL, http_client...
function test_enqueue_links_with_transform_request_function (line 132) | async def test_enqueue_links_with_transform_request_function(server_url:...
function test_handle_blocked_request (line 173) | async def test_handle_blocked_request(server_url: URL, http_client: Http...
function test_default_logger (line 179) | def test_default_logger() -> None:
function test_respect_robots_txt (line 183) | async def test_respect_robots_txt(server_url: URL, http_client: HttpClie...
function test_respect_robots_txt_with_problematic_links (line 203) | async def test_respect_robots_txt_with_problematic_links(server_url: URL...
function test_on_skipped_request (line 239) | async def test_on_skipped_request(server_url: URL, http_client: HttpClie...
function test_extract_links (line 262) | async def test_extract_links(server_url: URL, http_client: HttpClient) -...
function test_extract_non_href_links (line 277) | async def test_extract_non_href_links(server_url: URL, http_client: Http...
function test_enqueue_links_with_rq_param (line 300) | async def test_enqueue_links_with_rq_param(
function test_enqueue_links_requests_with_rq_param (line 337) | async def test_enqueue_links_requests_with_rq_param(
function test_enqueue_links_error_with_multi_params (line 383) | async def test_enqueue_links_error_with_multi_params(
function test_navigation_timeout_on_slow_request (line 396) | async def test_navigation_timeout_on_slow_request(server_url: URL, http_...
function test_navigation_timeout_applies_to_hooks (line 417) | async def test_navigation_timeout_applies_to_hooks(server_url: URL) -> N...
function test_slow_navigation_does_not_count_toward_handler_timeout (line 435) | async def test_slow_navigation_does_not_count_toward_handler_timeout(ser...
function test_enqueue_strategy_after_redirect (line 453) | async def test_enqueue_strategy_after_redirect(server_url: URL, redirect...
function test_enqueue_links_with_limit (line 472) | async def test_enqueue_links_with_limit(server_url: URL, http_client: Ht...
FILE: tests/unit/crawlers/_http/test_http_crawler.py
function mock_request_handler (line 39) | async def mock_request_handler() -> Callable[[HttpCrawlingContext], Awai...
function crawler (line 44) | async def crawler(
function crawler_without_retries (line 51) | async def crawler_without_retries(
function test_fetches_html (line 61) | async def test_fetches_html(
function test_handles_redirects (line 73) | async def test_handles_redirects(crawler: HttpCrawler, mock_request_hand...
function test_handles_client_errors (line 97) | async def test_handles_client_errors(
function test_handles_session_block_errors (line 136) | async def test_handles_session_block_errors(
function test_handles_server_error (line 166) | async def test_handles_server_error(crawler: HttpCrawler, mock_request_h...
function test_stores_cookies (line 173) | async def test_stores_cookies(http_client: HttpClient, server_url: URL) ...
function test_do_not_retry_on_client_errors (line 213) | async def test_do_not_retry_on_client_errors(crawler: HttpCrawler, serve...
function test_http_status_statistics (line 223) | async def test_http_status_statistics(crawler: HttpCrawler, server_url: ...
function test_sending_payload_as_raw_data (line 238) | async def test_sending_payload_as_raw_data(http_client: HttpClient, serv...
function test_sending_payload_as_form_data (line 269) | async def test_sending_payload_as_form_data(http_client: HttpClient, ser...
function test_sending_payload_as_json (line 295) | async def test_sending_payload_as_json(http_client: HttpClient, server_u...
function test_sending_url_query_params (line 322) | async def test_sending_url_query_params(http_client: HttpClient, server_...
function test_http_crawler_pre_navigation_hook_execution (line 344) | async def test_http_crawler_pre_navigation_hook_execution(server_url: UR...
function test_http_crawler_post_navigation_hook_execution (line 361) | async def test_http_crawler_post_navigation_hook_execution(server_url: U...
function test_http_crawler_navigation_hooks_order (line 378) | async def test_http_crawler_navigation_hooks_order(server_url: URL) -> N...
function test_isolation_cookies (line 420) | async def test_isolation_cookies(http_client: HttpClient, server_url: UR...
function test_store_complex_cookies (line 486) | async def test_store_complex_cookies(server_url: URL) -> None:
function test_default_logger (line 585) | def test_default_logger() -> None:
function test_get_snapshot (line 589) | async def test_get_snapshot(server_url: URL) -> None:
function test_error_snapshot_through_statistics (line 606) | async def test_error_snapshot_through_statistics(server_url: URL) -> None:
function test_request_state (line 633) | async def test_request_state(server_url: URL) -> None:
FILE: tests/unit/crawlers/_parsel/test_parsel_crawler.py
function test_basic (line 21) | async def test_basic(server_url: URL, http_client: HttpClient) -> None:
function test_enqueue_links (line 38) | async def test_enqueue_links(redirect_server_url: URL, server_url: URL, ...
function test_enqueue_non_href_links (line 68) | async def test_enqueue_non_href_links(redirect_server_url: URL, server_u...
function test_enqueue_links_with_incompatible_kwargs_raises_error (line 91) | async def test_enqueue_links_with_incompatible_kwargs_raises_error(serve...
function test_enqueue_links_selector (line 110) | async def test_enqueue_links_selector(server_url: URL, http_client: Http...
function test_enqueue_links_with_max_crawl (line 128) | async def test_enqueue_links_with_max_crawl(server_url: URL, http_client...
function test_enqueue_links_with_transform_request_function (line 152) | async def test_enqueue_links_with_transform_request_function(server_url:...
function test_handle_blocked_request (line 192) | async def test_handle_blocked_request(server_url: URL, http_client: Http...
function test_handle_blocked_status_code (line 199) | async def test_handle_blocked_status_code(server_url: URL, http_client: ...
function test_import_error_handled (line 224) | def test_import_error_handled() -> None:
function test_json (line 240) | async def test_json(server_url: URL, http_client: HttpClient) -> None:
function test_xml (line 256) | async def test_xml(server_url: URL, http_client: HttpClient) -> None:
function test_default_logger (line 272) | def test_default_logger() -> None:
function test_respect_robots_txt (line 276) | async def test_respect_robots_txt(server_url: URL, http_client: HttpClie...
function test_respect_robots_txt_with_problematic_links (line 296) | async def test_respect_robots_txt_with_problematic_links(server_url: URL...
function test_on_skipped_request (line 332) | async def test_on_skipped_request(server_url: URL, http_client: HttpClie...
function test_extract_links (line 355) | async def test_extract_links(server_url: URL, http_client: HttpClient) -...
function test_extract_non_href_links (line 370) | async def test_extract_non_href_links(server_url: URL, http_client: Http...
function test_enqueue_links_with_rq_param (line 393) | async def test_enqueue_links_with_rq_param(
function test_enqueue_links_requests_with_rq_param (line 430) | async def test_enqueue_links_requests_with_rq_param(
function test_enqueue_links_error_with_multi_params (line 476) | async def test_enqueue_links_error_with_multi_params(
function test_enqueue_links_with_limit (line 489) | async def test_enqueue_links_with_limit(server_url: URL, http_client: Ht...
FILE: tests/unit/crawlers/_playwright/test_playwright_crawler.py
function test_basic_request (line 66) | async def test_basic_request(method: HttpMethod, path: str, payload: Htt...
function test_enqueue_links (line 83) | async def test_enqueue_links(redirect_server_url: URL, server_url: URL) ...
function test_enqueue_non_href_links (line 111) | async def test_enqueue_non_href_links(redirect_server_url: URL, server_u...
function test_enqueue_links_with_incompatible_kwargs_raises_error (line 133) | async def test_enqueue_links_with_incompatible_kwargs_raises_error(serve...
function test_enqueue_links_with_transform_request_function (line 156) | async def test_enqueue_links_with_transform_request_function(server_url:...
function test_nonexistent_url_invokes_error_handler (line 185) | async def test_nonexistent_url_invokes_error_handler() -> None:
function test_redirect_handling (line 199) | async def test_redirect_handling(server_url: URL, redirect_server_url: U...
function test_chromium_headless_headers (line 233) | async def test_chromium_headless_headers(
function test_firefox_headless_headers (line 266) | async def test_firefox_headless_headers(header_network: dict, server_url...
function test_custom_headers (line 296) | async def test_custom_headers(server_url: URL) -> None:
function test_pre_navigation_hook (line 315) | async def test_pre_navigation_hook() -> None:
function test_proxy_set (line 329) | async def test_proxy_set() -> None:
function test_isolation_cookies (line 362) | async def test_isolation_cookies(*, use_incognito_pages: bool, server_ur...
function test_save_cookies_after_handler_processing (line 448) | async def test_save_cookies_after_handler_processing(server_url: URL) ->...
function test_read_write_cookies (line 475) | async def test_read_write_cookies(server_url: URL) -> None:
function test_custom_fingerprint_uses_generator_options (line 501) | async def test_custom_fingerprint_uses_generator_options(server_url: URL...
function test_custom_fingerprint_matches_header_user_agent (line 536) | async def test_custom_fingerprint_matches_header_user_agent(server_url: ...
function test_ignore_http_error_status_codes (line 556) | async def test_ignore_http_error_status_codes(server_url: URL) -> None:
function test_additional_http_error_status_codes (line 571) | async def test_additional_http_error_status_codes(server_url: URL) -> None:
function test_launch_with_user_data_dir (line 586) | async def test_launch_with_user_data_dir(tmp_path: Path, server_url: URL...
function test_launch_with_user_data_dir_and_fingerprint (line 600) | async def test_launch_with_user_data_dir_and_fingerprint(tmp_path: Path,...
function test_get_snapshot (line 626) | async def test_get_snapshot(server_url: URL) -> None:
function test_error_snapshot_through_statistics (line 647) | async def test_error_snapshot_through_statistics(server_url: URL) -> None:
function test_respect_robots_txt (line 697) | async def test_respect_robots_txt(server_url: URL) -> None:
function test_respect_robots_txt_with_problematic_links (line 717) | async def test_respect_robots_txt_with_problematic_links(server_url: URL...
function test_on_skipped_request (line 752) | async def test_on_skipped_request(server_url: URL) -> None:
function test_send_request (line 775) | async def test_send_request(server_url: URL) -> None:
function test_send_request_with_client (line 807) | async def test_send_request_with_client(server_url: URL) -> None:
function test_passing_configuration (line 828) | async def test_passing_configuration() -> None:
function test_extract_links (line 839) | async def test_extract_links(server_url: URL) -> None:
function test_extract_non_href_links (line 854) | async def test_extract_non_href_links(server_url: URL) -> None:
function test_reduced_logs_from_playwright_navigation_timeout (line 869) | async def test_reduced_logs_from_playwright_navigation_timeout(caplog: p...
function test_enqueue_links_with_rq_param (line 902) | async def test_enqueue_links_with_rq_param(
function test_enqueue_links_requests_with_rq_param (line 939) | async def test_enqueue_links_requests_with_rq_param(
function test_enqueue_links_error_with_multi_params (line 985) | async def test_enqueue_links_error_with_multi_params(
function test_navigation_timeout_on_slow_page_load (line 998) | async def test_navigation_timeout_on_slow_page_load(server_url: URL) -> ...
function test_navigation_timeout_applies_to_hooks (line 1021) | async def test_navigation_timeout_applies_to_hooks(server_url: URL) -> N...
function test_slow_navigation_does_not_count_toward_handler_timeout (line 1039) | async def test_slow_navigation_does_not_count_toward_handler_timeout(ser...
function test_request_state (line 1056) | async def test_request_state(server_url: URL) -> None:
function test_enqueue_links_with_limit (line 1110) | async def test_enqueue_links_with_limit(server_url: URL) -> None:
function test_playwright_crawler_pre_navigation_hook_execution (line 1132) | async def test_playwright_crawler_pre_navigation_hook_execution(server_u...
function test_playwright_crawler_post_navigation_hook_execution (line 1149) | async def test_playwright_crawler_post_navigation_hook_execution(server_...
function test_playwright_navigation_hooks_order (line 1166) | async def test_playwright_navigation_hooks_order(server_url: URL) -> None:
FILE: tests/unit/crawlers/_playwright/test_utils.py
function test_infinite_scroll_on_dynamic_page (line 7) | async def test_infinite_scroll_on_dynamic_page(server_url: URL) -> None:
function test_infinite_scroll_no_page_without_scroll (line 45) | async def test_infinite_scroll_no_page_without_scroll(server_url: URL) -...
function test_double_call_infinite_scroll (line 62) | async def test_double_call_infinite_scroll(server_url: URL) -> None:
function test_block_requests_default (line 81) | async
Condensed preview — 635 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,825K chars).
[
{
"path": ".editorconfig",
"chars": 213,
"preview": "root = true\n\n[*]\nindent_style = space\nindent_size = 4\ncharset = utf-8\ntrim_trailing_whitespace = true\ninsert_final_newli"
},
{
"path": ".github/CODEOWNERS",
"chars": 64,
"preview": "# Documentation codeowner\n\n/docs/*.md @TC-MO\n/docs/*.mdx @TC-MO\n"
},
{
"path": ".github/pull_request_template.md",
"chars": 282,
"preview": "### Description\n\n<!-- The purpose of the PR, list of the changes, ... -->\n\n- TODO\n\n### Issues\n\n<!-- If applicable, refer"
},
{
"path": ".github/workflows/_check_code.yaml",
"chars": 1005,
"preview": "name: Code checks\n\non:\n # Runs when manually triggered from the GitHub UI.\n workflow_dispatch:\n\n # Runs when invoked "
},
{
"path": ".github/workflows/_check_docs.yaml",
"chars": 302,
"preview": "name: Doc checks\n\non:\n # Runs when manually triggered from the GitHub UI.\n workflow_dispatch:\n\n # Runs when invoked b"
},
{
"path": ".github/workflows/_release_docs.yaml",
"chars": 2138,
"preview": "name: Doc release\n\non:\n # Runs when manually triggered from the GitHub UI.\n workflow_dispatch:\n\n # Runs when invoked "
},
{
"path": ".github/workflows/_tests.yaml",
"chars": 593,
"preview": "name: Tests\n\non:\n # Runs when manually triggered from the GitHub UI.\n workflow_dispatch:\n\n # Runs when invoked by ano"
},
{
"path": ".github/workflows/manual_release_stable.yaml",
"chars": 3685,
"preview": "name: Stable release\n\non:\n # Runs when manually triggered from the GitHub UI, with options to specify the type of relea"
},
{
"path": ".github/workflows/on_issue.yaml",
"chars": 595,
"preview": "name: CI (issue)\n\non:\n # Runs when a new issue is opened.\n issues:\n types:\n - opened\n\npermissions:\n contents:"
},
{
"path": ".github/workflows/on_master.yaml",
"chars": 3476,
"preview": "name: CI (master)\n\non:\n push:\n branches:\n - master\n tags-ignore:\n - \"**\" # Ignore all tags to avoid dup"
},
{
"path": ".github/workflows/on_pull_request.yaml",
"chars": 610,
"preview": "name: CI (PR)\n\non:\n # Runs whenever a pull request is opened or updated.\n pull_request:\n\npermissions:\n contents: read"
},
{
"path": ".github/workflows/on_schedule_tests.yaml",
"chars": 1970,
"preview": "name: Scheduled tests\n\non:\n # Runs when manually triggered from the GitHub UI.\n workflow_dispatch:\n\n # Runs on a dail"
},
{
"path": ".gitignore",
"chars": 1007,
"preview": "# AI assistant files\n.agent\n.agents\n.ai\n.aider\n.claude\n.codeium\n.continue\n.copilot\n.cursor\n.gemini\n.llm\n.llms\n.openai\n.s"
},
{
"path": ".markdownlint.yaml",
"chars": 108,
"preview": "default: true\nline-length:\n line_length: 120\nMD007:\n indent: 4\nMD004:\n style: dash\nno-inline-html: false\n"
},
{
"path": ".pre-commit-config.yaml",
"chars": 309,
"preview": "repos:\n - repo: local\n hooks:\n - id: lint-check\n name: Lint check\n entry: uv run poe lint\n "
},
{
"path": ".rules.md",
"chars": 4753,
"preview": "# Coding guidelines\n\nThis file provides guidance to programming agents when working with code in this repository.\n\n## De"
},
{
"path": "CHANGELOG.md",
"chars": 105558,
"preview": "# Changelog\n\nAll notable changes to this project will be documented in this file.\n\n## [1.6.0](https://github.com/apify/c"
},
{
"path": "CONTRIBUTING.md",
"chars": 5650,
"preview": "# Development\n\nHere you'll find a contributing guide to get started with development.\n\n## Environment\n\nFor local develop"
},
{
"path": "LICENSE",
"chars": 11355,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 11186,
"preview": "<h1 align=\"center\">\n <a href=\"https://crawlee.dev\">\n <picture>\n <source media=\"(prefers-color-scheme:"
},
{
"path": "codecov.yaml",
"chars": 370,
"preview": "coverage:\n status:\n project:\n default:\n target: auto\n threshold: 0.10% # tolerate up to 0.10% d"
},
{
"path": "docs/deployment/apify_platform.mdx",
"chars": 14671,
"preview": "---\nid: apify-platform\ntitle: Apify platform\ndescription: Apify platform - large-scale and high-performance web scraping"
},
{
"path": "docs/deployment/aws_lambda.mdx",
"chars": 8800,
"preview": "---\nid: aws-lambda\ntitle: Deploy on AWS Lambda\ndescription: Prepare your crawler to run on AWS Lambda.\n---\n\nimport ApiLi"
},
{
"path": "docs/deployment/code_examples/apify/crawler_as_actor_example.py",
"chars": 839,
"preview": "import asyncio\n\nfrom apify import Actor\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext"
},
{
"path": "docs/deployment/code_examples/apify/get_public_url.py",
"chars": 436,
"preview": "import asyncio\n\nfrom apify import Actor\n\n\nasync def main() -> None:\n async with Actor:\n store = await Actor.op"
},
{
"path": "docs/deployment/code_examples/apify/log_with_config_example.py",
"chars": 518,
"preview": "import asyncio\n\nfrom apify import Actor, Configuration\n\n\nasync def main() -> None:\n # Create a new configuration with"
},
{
"path": "docs/deployment/code_examples/apify/proxy_advanced_example.py",
"chars": 451,
"preview": "import asyncio\n\nfrom apify import Actor\n\n\nasync def main() -> None:\n async with Actor:\n proxy_configuration = "
},
{
"path": "docs/deployment/code_examples/apify/proxy_example.py",
"chars": 762,
"preview": "import asyncio\n\nfrom apify import Actor\n\n\nasync def main() -> None:\n async with Actor:\n # Create a new Apify P"
},
{
"path": "docs/deployment/code_examples/aws/beautifulsoup_crawler_lambda.py",
"chars": 2000,
"preview": "import asyncio\nimport json\nfrom datetime import timedelta\nfrom typing import Any\n\nfrom aws_lambda_powertools.utilities.t"
},
{
"path": "docs/deployment/code_examples/aws/playwright_crawler_lambda.py",
"chars": 2354,
"preview": "import asyncio\nimport json\nfrom datetime import timedelta\nfrom typing import Any\n\nfrom aws_lambda_powertools.utilities.t"
},
{
"path": "docs/deployment/code_examples/aws/playwright_dockerfile",
"chars": 1159,
"preview": "FROM apify/actor-python-playwright:3.14\n\nRUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/*\n\nRUN pip i"
},
{
"path": "docs/deployment/code_examples/google/cloud_run_example.py",
"chars": 1741,
"preview": "import json\nimport os\n\nimport uvicorn\nfrom litestar import Litestar, get\n\nfrom crawlee.crawlers import PlaywrightCrawler"
},
{
"path": "docs/deployment/code_examples/google/google_example.py",
"chars": 1814,
"preview": "import asyncio\nimport json\nfrom datetime import timedelta\n\nimport functions_framework\nfrom flask import Request, Respons"
},
{
"path": "docs/deployment/google_cloud.mdx",
"chars": 2490,
"preview": "---\nid: gcp-cloud-run-functions\ntitle: Cloud Run functions\ndescription: Prepare your crawler to run in Cloud Run functio"
},
{
"path": "docs/deployment/google_cloud_run.mdx",
"chars": 2607,
"preview": "---\nid: gcp-cloud-run\ntitle: Cloud Run\ndescription: Prepare your crawler to run in Cloud Run on Google Cloud Platform.\n-"
},
{
"path": "docs/examples/add_data_to_dataset.mdx",
"chars": 1870,
"preview": "---\nid: add-data-to-dataset\ntitle: Add data to dataset\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport T"
},
{
"path": "docs/examples/beautifulsoup_crawler.mdx",
"chars": 1128,
"preview": "---\nid: beautifulsoup-crawler\ntitle: BeautifulSoup crawler\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimpo"
},
{
"path": "docs/examples/capture_screenshot_using_playwright.mdx",
"chars": 1251,
"preview": "---\nid: capture-screenshots-using-playwright\ntitle: Capture screenshots using Playwright\n---\n\nimport ApiLink from '@site"
},
{
"path": "docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx",
"chars": 1840,
"preview": "---\nid: capturing-page-snapshots-with-error-snapshotter\ntitle: Capturing page snapshots with ErrorSnapshotter\ndescriptio"
},
{
"path": "docs/examples/code_examples/adaptive_playwright_crawler.py",
"chars": 2361,
"preview": "import asyncio\nfrom datetime import timedelta\n\nfrom playwright.async_api import Route\n\nfrom crawlee.crawlers import (\n "
},
{
"path": "docs/examples/code_examples/add_data_to_dataset_bs.py",
"chars": 1016,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/examples/code_examples/add_data_to_dataset_dataset.py",
"chars": 321,
"preview": "import asyncio\n\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n # Open dataset manually using async"
},
{
"path": "docs/examples/code_examples/add_data_to_dataset_pw.py",
"chars": 989,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n "
},
{
"path": "docs/examples/code_examples/beautifulsoup_crawler.py",
"chars": 2364,
"preview": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee.crawlers import (\n BasicCrawlingContext,\n BeautifulSou"
},
{
"path": "docs/examples/code_examples/beautifulsoup_crawler_keep_alive.py",
"chars": 2202,
"preview": "import asyncio\n\nfrom crawlee._types import BasicCrawlingContext\nfrom crawlee.crawlers import BeautifulSoupCrawler\n\n\nasyn"
},
{
"path": "docs/examples/code_examples/beautifulsoup_crawler_stop.py",
"chars": 1676,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/examples/code_examples/capture_screenshot_using_playwright.py",
"chars": 1469,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.storages import K"
},
{
"path": "docs/examples/code_examples/configure_json_logging.py",
"chars": 2945,
"preview": "from __future__ import annotations\n\nimport asyncio\nimport inspect\nimport logging\nimport sys\nfrom typing import TYPE_CHEC"
},
{
"path": "docs/examples/code_examples/crawl_all_links_on_website_bs.py",
"chars": 797,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/examples/code_examples/crawl_all_links_on_website_pw.py",
"chars": 785,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n "
},
{
"path": "docs/examples/code_examples/crawl_multiple_urls_bs.py",
"chars": 687,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/examples/code_examples/crawl_multiple_urls_pw.py",
"chars": 675,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n "
},
{
"path": "docs/examples/code_examples/crawl_specific_links_on_website_bs.py",
"chars": 998,
"preview": "import asyncio\n\nfrom crawlee import Glob\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext"
},
{
"path": "docs/examples/code_examples/crawl_specific_links_on_website_pw.py",
"chars": 986,
"preview": "import asyncio\n\nfrom crawlee import Glob\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasy"
},
{
"path": "docs/examples/code_examples/crawl_website_with_relative_links_all_links.py",
"chars": 927,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/examples/code_examples/crawl_website_with_relative_links_same_domain.py",
"chars": 925,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/examples/code_examples/crawl_website_with_relative_links_same_hostname.py",
"chars": 952,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/examples/code_examples/crawl_website_with_relative_links_same_origin.py",
"chars": 923,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/examples/code_examples/export_entire_dataset_to_file_csv.py",
"chars": 1263,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/examples/code_examples/export_entire_dataset_to_file_json.py",
"chars": 1268,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/examples/code_examples/extract_and_add_specific_links_on_website_bs.py",
"chars": 1355,
"preview": "import asyncio\n\nfrom crawlee import Glob\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext"
},
{
"path": "docs/examples/code_examples/extract_and_add_specific_links_on_website_pw.py",
"chars": 1343,
"preview": "import asyncio\n\nfrom crawlee import Glob\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasy"
},
{
"path": "docs/examples/code_examples/fill_and_submit_web_form_crawler.py",
"chars": 1385,
"preview": "import asyncio\nfrom urllib.parse import urlencode\n\nfrom crawlee import Request\nfrom crawlee.crawlers import HttpCrawler,"
},
{
"path": "docs/examples/code_examples/fill_and_submit_web_form_request.py",
"chars": 791,
"preview": "import asyncio\nfrom urllib.parse import urlencode\n\nfrom crawlee import Request\n\n\nasync def main() -> None:\n # Prepare"
},
{
"path": "docs/examples/code_examples/parsel_crawler.py",
"chars": 1601,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext\n\n# Regex for ide"
},
{
"path": "docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py",
"chars": 1009,
"preview": "import asyncio\nfrom random import choice\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee"
},
{
"path": "docs/examples/code_examples/playwright_block_requests.py",
"chars": 1202,
"preview": "import asyncio\n\nfrom crawlee.crawlers import (\n PlaywrightCrawler,\n PlaywrightCrawlingContext,\n PlaywrightPreNa"
},
{
"path": "docs/examples/code_examples/playwright_crawler.py",
"chars": 2907,
"preview": "import asyncio\n\nfrom crawlee.crawlers import (\n PlaywrightCrawler,\n PlaywrightCrawlingContext,\n PlaywrightPreNa"
},
{
"path": "docs/examples/code_examples/playwright_crawler_with_camoufox.py",
"chars": 2596,
"preview": "import asyncio\n\n# Camoufox is external package and needs to be installed. It is not included in crawlee.\nfrom camoufox "
},
{
"path": "docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py",
"chars": 1025,
"preview": "import asyncio\nfrom random import choice\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom"
},
{
"path": "docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py",
"chars": 1702,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.fingerprint_suite"
},
{
"path": "docs/examples/code_examples/respect_robots_on_skipped_request.py",
"chars": 1156,
"preview": "import asyncio\n\nfrom crawlee import SkippedReason\nfrom crawlee.crawlers import (\n BeautifulSoupCrawler,\n Beautiful"
},
{
"path": "docs/examples/code_examples/respect_robots_txt_file.py",
"chars": 866,
"preview": "import asyncio\n\nfrom crawlee.crawlers import (\n BeautifulSoupCrawler,\n BeautifulSoupCrawlingContext,\n)\n\n\nasync def"
},
{
"path": "docs/examples/code_examples/resuming_paused_crawl.py",
"chars": 1282,
"preview": "import asyncio\n\nfrom crawlee import ConcurrencySettings, service_locator\nfrom crawlee.crawlers import (\n BeautifulSou"
},
{
"path": "docs/examples/code_examples/run_parallel_crawlers.py",
"chars": 3820,
"preview": "import asyncio\n\nfrom crawlee import ConcurrencySettings\nfrom crawlee.crawlers import (\n ParselCrawler,\n ParselCraw"
},
{
"path": "docs/examples/code_examples/using_browser_profiles_chrome.py",
"chars": 1800,
"preview": "import asyncio\nimport shutil\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\n\nfrom crawlee.crawlers imp"
},
{
"path": "docs/examples/code_examples/using_browser_profiles_firefox.py",
"chars": 1241,
"preview": "import asyncio\nfrom pathlib import Path\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n# Re"
},
{
"path": "docs/examples/code_examples/using_sitemap_request_loader.py",
"chars": 3736,
"preview": "import asyncio\nfrom collections.abc import Callable\n\nfrom yarl import URL\n\nfrom crawlee import RequestOptions, RequestTr"
},
{
"path": "docs/examples/crawl_all_links_on_website.mdx",
"chars": 1783,
"preview": "---\nid: crawl-all-links-on-website\ntitle: Crawl all links on website\n---\n\nimport ApiLink from '@site/src/components/ApiL"
},
{
"path": "docs/examples/crawl_multiple_urls.mdx",
"chars": 1210,
"preview": "---\nid: crawl-multiple-urls\ntitle: Crawl multiple URLs\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport T"
},
{
"path": "docs/examples/crawl_specific_links_on_website.mdx",
"chars": 3062,
"preview": "---\nid: crawl-specific-links-on-website\ntitle: Crawl specific links on website\n---\n\nimport ApiLink from '@site/src/compo"
},
{
"path": "docs/examples/crawl_website_with_relative_links.mdx",
"chars": 3267,
"preview": "---\nid: crawl-website-with-relative-links\ntitle: Crawl website with relative links\n---\n\nimport ApiLink from '@site/src/c"
},
{
"path": "docs/examples/crawler_keep_alive.mdx",
"chars": 969,
"preview": "---\nid: crawler-keep-alive\ntitle: Keep a Crawler alive waiting for more requests\n---\n\nimport ApiLink from '@site/src/com"
},
{
"path": "docs/examples/crawler_stop.mdx",
"chars": 1231,
"preview": "---\nid: crawler-stop\ntitle: Stopping a Crawler with stop method\n---\n\nimport ApiLink from '@site/src/components/ApiLink';"
},
{
"path": "docs/examples/export_entire_dataset_to_file.mdx",
"chars": 1498,
"preview": "---\nid: export-entire-dataset-to-file\ntitle: Export entire dataset to file\n---\n\nimport ApiLink from '@site/src/component"
},
{
"path": "docs/examples/fill_and_submit_web_form.mdx",
"chars": 5169,
"preview": "---\nid: fill-and-submit-web-form\ntitle: Fill and submit web form\n---\n\nimport ApiLink from '@site/src/components/ApiLink'"
},
{
"path": "docs/examples/json_logging.mdx",
"chars": 2668,
"preview": "---\nid: configure-json-logging\ntitle: Сonfigure JSON logging\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nim"
},
{
"path": "docs/examples/parsel_crawler.mdx",
"chars": 1225,
"preview": "---\nid: parsel-crawler\ntitle: Parsel crawler\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCod"
},
{
"path": "docs/examples/playwright_crawler.mdx",
"chars": 1310,
"preview": "---\nid: playwright-crawler\ntitle: Playwright crawler\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Run"
},
{
"path": "docs/examples/playwright_crawler_adaptive.mdx",
"chars": 1706,
"preview": "---\nid: adaptive-playwright-crawler\ntitle: Adaptive Playwright crawler\n---\n\nimport ApiLink from '@site/src/components/Ap"
},
{
"path": "docs/examples/playwright_crawler_with_block_requests.mdx",
"chars": 1418,
"preview": "---\nid: playwright-crawler-with-block-requests\ntitle: Playwright crawler with block requests\n---\n\nimport ApiLink from '@"
},
{
"path": "docs/examples/playwright_crawler_with_camoufox.mdx",
"chars": 1914,
"preview": "---\nid: playwright-crawler-with-camoufox\ntitle: Playwright crawler with Camoufox\n---\n\nimport ApiLink from '@site/src/com"
},
{
"path": "docs/examples/playwright_crawler_with_fingerprint_generator.mdx",
"chars": 1362,
"preview": "---\nid: playwright-crawler-with-fingerprint-generator\ntitle: Playwright crawler with fingerprint generator\n---\n\nimport A"
},
{
"path": "docs/examples/respect_robots_txt_file.mdx",
"chars": 1837,
"preview": "---\nid: respect-robots-txt-file\ntitle: Respect robots.txt file\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\n"
},
{
"path": "docs/examples/resuming_paused_crawl.mdx",
"chars": 1431,
"preview": "---\nid: resuming-paused-crawl\ntitle: Resuming a paused crawl\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nim"
},
{
"path": "docs/examples/run_parallel_crawlers.mdx",
"chars": 1643,
"preview": "---\nid: run-parallel-crawlers\ntitle: Run parallel crawlers\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimpo"
},
{
"path": "docs/examples/using_browser_profile.mdx",
"chars": 2103,
"preview": "---\nid: using_browser_profile\ntitle: Using browser profile\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\n\nimp"
},
{
"path": "docs/examples/using_sitemap_request_loader.mdx",
"chars": 1625,
"preview": "---\nid: using-sitemap-request-loader\ntitle: Using sitemap request loader\n---\n\nimport ApiLink from '@site/src/components/"
},
{
"path": "docs/guides/architecture_overview.mdx",
"chars": 22306,
"preview": "---\nid: architecture-overview\ntitle: Architecture overview\ndescription: An overview of the core components of the Crawle"
},
{
"path": "docs/guides/avoid_blocking.mdx",
"chars": 4009,
"preview": "---\nid: avoid-blocking\ntitle: Avoid getting blocked\ndescription: How to avoid getting blocked when scraping\n---\n\nimport "
},
{
"path": "docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py",
"chars": 414,
"preview": "import asyncio\n\nfrom crawlee.fingerprint_suite import (\n DefaultFingerprintGenerator,\n HeaderGeneratorOptions,\n "
},
{
"path": "docs/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py",
"chars": 751,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n "
},
{
"path": "docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py",
"chars": 2073,
"preview": "import asyncio\nimport io\nfrom pathlib import Path\n\nfrom warcio.statusandheaders import StatusAndHeaders\nfrom warcio.warc"
},
{
"path": "docs/guides/code_examples/creating_web_archive/manual_archiving_playwright_crawler.py",
"chars": 2883,
"preview": "import asyncio\nimport io\nimport logging\nfrom functools import partial\nfrom pathlib import Path\n\nfrom playwright.async_ap"
},
{
"path": "docs/guides/code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py",
"chars": 1143,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.proxy_configurati"
},
{
"path": "docs/guides/code_examples/error_handling/change_handle_error_status.py",
"chars": 1694,
"preview": "import asyncio\nimport json\n\nfrom crawlee import HttpHeaders\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContex"
},
{
"path": "docs/guides/code_examples/error_handling/disable_retry.py",
"chars": 1134,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext\nfrom crawlee.errors "
},
{
"path": "docs/guides/code_examples/error_handling/handle_proxy_error.py",
"chars": 1651,
"preview": "import asyncio\n\nfrom crawlee import Request\nfrom crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawling"
},
{
"path": "docs/guides/code_examples/http_clients/parsel_curl_impersonate_example.py",
"chars": 1312,
"preview": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.http_clients import CurlI"
},
{
"path": "docs/guides/code_examples/http_clients/parsel_httpx_example.py",
"chars": 1276,
"preview": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.http_clients import Httpx"
},
{
"path": "docs/guides/code_examples/http_clients/parsel_impit_example.py",
"chars": 1293,
"preview": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.http_clients import Impit"
},
{
"path": "docs/guides/code_examples/http_crawlers/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "docs/guides/code_examples/http_crawlers/beautifulsoup_example.py",
"chars": 1005,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> No"
},
{
"path": "docs/guides/code_examples/http_crawlers/custom_crawler_example.py",
"chars": 0,
"preview": ""
},
{
"path": "docs/guides/code_examples/http_crawlers/http_example.py",
"chars": 1654,
"preview": "import asyncio\nimport re\n\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n\nasync def main() -> None:\n "
},
{
"path": "docs/guides/code_examples/http_crawlers/lexbor_parser.py",
"chars": 2112,
"preview": "import asyncio\n\nfrom pydantic import ValidationError\nfrom selectolax.lexbor import LexborHTMLParser\nfrom yarl import URL"
},
{
"path": "docs/guides/code_examples/http_crawlers/lxml_parser.py",
"chars": 2056,
"preview": "import asyncio\n\nfrom lxml import html\nfrom pydantic import ValidationError\n\nfrom crawlee import Request\nfrom crawlee.cra"
},
{
"path": "docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py",
"chars": 2840,
"preview": "import asyncio\n\nfrom lxml import html\nfrom pydantic import ValidationError\nfrom saxonche import PySaxonProcessor\n\nfrom c"
},
{
"path": "docs/guides/code_examples/http_crawlers/parsel_example.py",
"chars": 978,
"preview": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\n\nasync def main() -> None:\n # Crea"
},
{
"path": "docs/guides/code_examples/http_crawlers/pyquery_parser.py",
"chars": 2078,
"preview": "import asyncio\n\nfrom pydantic import ValidationError\nfrom pyquery import PyQuery\nfrom yarl import URL\n\nfrom crawlee impo"
},
{
"path": "docs/guides/code_examples/http_crawlers/scrapling_parser.py",
"chars": 2519,
"preview": "import asyncio\n\nfrom pydantic import ValidationError\nfrom scrapling.parser import Selector\nfrom yarl import URL\n\nfrom cr"
},
{
"path": "docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py",
"chars": 922,
"preview": "import asyncio\n\nfrom crawlee.crawlers import (\n AdaptivePlaywrightCrawler,\n AdaptivePlaywrightCrawlingContext,\n)\n\n"
},
{
"path": "docs/guides/code_examples/http_crawlers/selectolax_context.py",
"chars": 1247,
"preview": "from dataclasses import dataclass, fields\n\nfrom selectolax.lexbor import LexborHTMLParser\nfrom typing_extensions import "
},
{
"path": "docs/guides/code_examples/http_crawlers/selectolax_crawler.py",
"chars": 1706,
"preview": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom selectolax.lexbor import LexborHTMLParser, Le"
},
{
"path": "docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py",
"chars": 690,
"preview": "import asyncio\n\nfrom .selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler\n\n\nasync def main() -> N"
},
{
"path": "docs/guides/code_examples/http_crawlers/selectolax_parser.py",
"chars": 2036,
"preview": "from __future__ import annotations\n\nimport asyncio\nfrom typing import TYPE_CHECKING\n\nfrom selectolax.lexbor import Lexbo"
},
{
"path": "docs/guides/code_examples/login_crawler/http_login.py",
"chars": 3277,
"preview": "import asyncio\nimport json\nfrom datetime import datetime, timedelta\n\nfrom crawlee import ConcurrencySettings, Request\nfr"
},
{
"path": "docs/guides/code_examples/login_crawler/playwright_login.py",
"chars": 2686,
"preview": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee import ConcurrencySettings, Request\nfrom crawlee.crawlers im"
},
{
"path": "docs/guides/code_examples/playwright_crawler/browser_configuration_example.py",
"chars": 1353,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n "
},
{
"path": "docs/guides/code_examples/playwright_crawler/browser_pool_page_hooks_example.py",
"chars": 2705,
"preview": "from __future__ import annotations\n\nimport asyncio\nimport logging\nfrom typing import TYPE_CHECKING, Any\n\nfrom crawlee.br"
},
{
"path": "docs/guides/code_examples/playwright_crawler/multiple_launch_example.py",
"chars": 1282,
"preview": "import asyncio\n\nfrom crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin\nfrom crawlee.crawlers import Playwrigh"
},
{
"path": "docs/guides/code_examples/playwright_crawler/navigation_hooks_example.py",
"chars": 1418,
"preview": "import asyncio\n\nfrom crawlee.crawlers import (\n PlaywrightCrawler,\n PlaywrightCrawlingContext,\n PlaywrightPostN"
},
{
"path": "docs/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py",
"chars": 1002,
"preview": "import asyncio\n\nfrom crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin\nfrom crawlee.crawlers import Playwrigh"
},
{
"path": "docs/guides/code_examples/playwright_crawler_adaptive/handler.py",
"chars": 665,
"preview": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrigh"
},
{
"path": "docs/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py",
"chars": 505,
"preview": "import asyncio\n\nfrom crawlee.crawlers import AdaptivePlaywrightCrawler\n\n\nasync def main() -> None:\n crawler = Adaptiv"
},
{
"path": "docs/guides/code_examples/playwright_crawler_adaptive/init_parsel.py",
"chars": 498,
"preview": "import asyncio\n\nfrom crawlee.crawlers import AdaptivePlaywrightCrawler\n\n\nasync def main() -> None:\n crawler = Adaptiv"
},
{
"path": "docs/guides/code_examples/playwright_crawler_adaptive/init_prediction.py",
"chars": 2640,
"preview": "import asyncio\n\nfrom crawlee import Request\nfrom crawlee._types import RequestHandlerRunResult\nfrom crawlee.crawlers imp"
},
{
"path": "docs/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py",
"chars": 1269,
"preview": "import asyncio\n\nfrom playwright.async_api import Route\n\nfrom crawlee.crawlers import (\n AdaptivePlaywrightCrawler,\n "
},
{
"path": "docs/guides/code_examples/playwright_crawler_stagehand/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py",
"chars": 3216,
"preview": "from __future__ import annotations\n\nfrom datetime import datetime, timezone\nfrom typing import TYPE_CHECKING, Any, cast\n"
},
{
"path": "docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py",
"chars": 2228,
"preview": "from __future__ import annotations\n\nimport asyncio\nimport os\nfrom typing import cast\n\nfrom stagehand import StagehandCon"
},
{
"path": "docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py",
"chars": 1646,
"preview": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any\n\nfrom stagehand import Stagehand, StagehandPag"
},
{
"path": "docs/guides/code_examples/proxy_management/inspecting_bs_example.py",
"chars": 972,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.proxy_confi"
},
{
"path": "docs/guides/code_examples/proxy_management/inspecting_pw_example.py",
"chars": 960,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.proxy_configurati"
},
{
"path": "docs/guides/code_examples/proxy_management/integration_bs_example.py",
"chars": 1074,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.proxy_confi"
},
{
"path": "docs/guides/code_examples/proxy_management/integration_pw_example.py",
"chars": 1031,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.proxy_configurati"
},
{
"path": "docs/guides/code_examples/proxy_management/quick_start_example.py",
"chars": 592,
"preview": "import asyncio\n\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n proxy_configu"
},
{
"path": "docs/guides/code_examples/proxy_management/session_bs_example.py",
"chars": 561,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler\nfrom crawlee.proxy_configuration import ProxyConfigura"
},
{
"path": "docs/guides/code_examples/proxy_management/session_pw_example.py",
"chars": 555,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler\nfrom crawlee.proxy_configuration import ProxyConfiguratio"
},
{
"path": "docs/guides/code_examples/proxy_management/tiers_bs_example.py",
"chars": 1458,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.proxy_confi"
},
{
"path": "docs/guides/code_examples/proxy_management/tiers_pw_example.py",
"chars": 1446,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.proxy_configurati"
},
{
"path": "docs/guides/code_examples/request_loaders/rl_basic_example.py",
"chars": 769,
"preview": "import asyncio\n\nfrom crawlee.request_loaders import RequestList\n\n\nasync def main() -> None:\n # Open the request list,"
},
{
"path": "docs/guides/code_examples/request_loaders/rl_basic_example_with_persist.py",
"chars": 1498,
"preview": "import asyncio\nimport logging\n\nfrom crawlee import service_locator\nfrom crawlee.request_loaders import RequestList\n\nlogg"
},
{
"path": "docs/guides/code_examples/request_loaders/rl_tandem_example.py",
"chars": 1332,
"preview": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.request_loaders import Re"
},
{
"path": "docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py",
"chars": 1386,
"preview": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.request_loaders import Re"
},
{
"path": "docs/guides/code_examples/request_loaders/sitemap_basic_example.py",
"chars": 991,
"preview": "import asyncio\nimport re\n\nfrom crawlee.http_clients import ImpitHttpClient\nfrom crawlee.request_loaders import SitemapRe"
},
{
"path": "docs/guides/code_examples/request_loaders/sitemap_example_with_persist.py",
"chars": 1594,
"preview": "import asyncio\nimport logging\n\nfrom crawlee import service_locator\nfrom crawlee.http_clients import ImpitHttpClient\nfrom"
},
{
"path": "docs/guides/code_examples/request_loaders/sitemap_tandem_example.py",
"chars": 1710,
"preview": "import asyncio\nimport re\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.http_clients im"
},
{
"path": "docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py",
"chars": 1798,
"preview": "import asyncio\nimport re\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.http_clients im"
},
{
"path": "docs/guides/code_examples/request_router/adaptive_crawler_handlers.py",
"chars": 1842,
"preview": "import asyncio\n\nfrom crawlee import HttpHeaders\nfrom crawlee.crawlers import (\n AdaptivePlaywrightCrawler,\n Adapti"
},
{
"path": "docs/guides/code_examples/request_router/basic_request_handlers.py",
"chars": 3241,
"preview": "import asyncio\n\nfrom crawlee import Request\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawl"
},
{
"path": "docs/guides/code_examples/request_router/custom_router_default_only.py",
"chars": 1293,
"preview": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.router import Router\n\n\nas"
},
{
"path": "docs/guides/code_examples/request_router/error_handler.py",
"chars": 2215,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext\nfrom crawlee.err"
},
{
"path": "docs/guides/code_examples/request_router/failed_request_handler.py",
"chars": 2270,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext\n\n\nasync def main"
},
{
"path": "docs/guides/code_examples/request_router/http_pre_navigation.py",
"chars": 1105,
"preview": "import asyncio\n\nfrom crawlee import HttpHeaders\nfrom crawlee.crawlers import BasicCrawlingContext, ParselCrawler, Parsel"
},
{
"path": "docs/guides/code_examples/request_router/playwright_pre_navigation.py",
"chars": 1597,
"preview": "import asyncio\n\nfrom crawlee.crawlers import (\n PlaywrightCrawler,\n PlaywrightCrawlingContext,\n PlaywrightPreNa"
},
{
"path": "docs/guides/code_examples/request_router/simple_default_handler.py",
"chars": 1083,
"preview": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\n\nasync def main() -> None:\n # Crea"
},
{
"path": "docs/guides/code_examples/running_in_web_server/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "docs/guides/code_examples/running_in_web_server/crawler.py",
"chars": 1854,
"preview": "import asyncio\nfrom collections.abc import AsyncIterator\nfrom contextlib import asynccontextmanager\nfrom typing import T"
},
{
"path": "docs/guides/code_examples/running_in_web_server/server.py",
"chars": 1579,
"preview": "from __future__ import annotations\n\nimport asyncio\nfrom uuid import uuid4\n\nfrom fastapi import FastAPI\nfrom starlette.re"
},
{
"path": "docs/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py",
"chars": 671,
"preview": "import asyncio\n\nfrom crawlee import ConcurrencySettings\nfrom crawlee.crawlers import BeautifulSoupCrawler\n\n\nasync def ma"
},
{
"path": "docs/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py",
"chars": 760,
"preview": "import asyncio\n\nfrom crawlee import ConcurrencySettings\nfrom crawlee.crawlers import BeautifulSoupCrawler\n\n\nasync def ma"
},
{
"path": "docs/guides/code_examples/service_locator/service_conflicts.py",
"chars": 740,
"preview": "import asyncio\n\nfrom crawlee import service_locator\nfrom crawlee.storage_clients import FileSystemStorageClient, MemoryS"
},
{
"path": "docs/guides/code_examples/service_locator/service_crawler_configuration.py",
"chars": 480,
"preview": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee.configuration import Configuration\nfrom crawlee.crawlers imp"
},
{
"path": "docs/guides/code_examples/service_locator/service_crawler_event_manager.py",
"chars": 427,
"preview": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.events import Loc"
},
{
"path": "docs/guides/code_examples/service_locator/service_crawler_storage_client.py",
"chars": 357,
"preview": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import MemoryStorageClient\n\n\nasy"
},
{
"path": "docs/guides/code_examples/service_locator/service_locator_configuration.py",
"chars": 462,
"preview": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee import service_locator\nfrom crawlee.configuration import Con"
},
{
"path": "docs/guides/code_examples/service_locator/service_locator_event_manager.py",
"chars": 409,
"preview": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee import service_locator\nfrom crawlee.events import LocalEvent"
},
{
"path": "docs/guides/code_examples/service_locator/service_locator_storage_client.py",
"chars": 339,
"preview": "import asyncio\n\nfrom crawlee import service_locator\nfrom crawlee.storage_clients import MemoryStorageClient\n\n\nasync def "
},
{
"path": "docs/guides/code_examples/service_locator/service_storage_configuration.py",
"chars": 907,
"preview": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee import service_locator\nfrom crawlee.configuration import Con"
},
{
"path": "docs/guides/code_examples/service_locator/service_storage_storage_client.py",
"chars": 394,
"preview": "import asyncio\n\nfrom crawlee.storage_clients import MemoryStorageClient\nfrom crawlee.storages import Dataset\n\n\nasync def"
},
{
"path": "docs/guides/code_examples/session_management/multi_sessions_http.py",
"chars": 3105,
"preview": "import asyncio\nfrom collections.abc import Callable\nfrom datetime import timedelta\nfrom itertools import count\n\nfrom cra"
},
{
"path": "docs/guides/code_examples/session_management/one_session_http.py",
"chars": 2255,
"preview": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee import ConcurrencySettings, Request\nfrom crawlee.crawlers im"
},
{
"path": "docs/guides/code_examples/session_management/sm_basic.py",
"chars": 1844,
"preview": "import asyncio\nimport re\n\nfrom crawlee.crawlers import BasicCrawler, BasicCrawlingContext\nfrom crawlee.proxy_configurati"
},
{
"path": "docs/guides/code_examples/session_management/sm_beautifulsoup.py",
"chars": 1546,
"preview": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.proxy_confi"
},
{
"path": "docs/guides/code_examples/session_management/sm_http.py",
"chars": 1650,
"preview": "import asyncio\nimport re\n\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\nfrom crawlee.proxy_configuration"
},
{
"path": "docs/guides/code_examples/session_management/sm_parsel.py",
"chars": 1498,
"preview": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.proxy_configuration impor"
},
{
"path": "docs/guides/code_examples/session_management/sm_playwright.py",
"chars": 1499,
"preview": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.proxy_configurati"
},
{
"path": "docs/guides/code_examples/session_management/sm_standalone.py",
"chars": 622,
"preview": "import asyncio\n\nfrom crawlee.sessions import SessionPool\n\n\nasync def main() -> None:\n # Override the default Session "
},
{
"path": "docs/guides/code_examples/storage_clients/custom_storage_client_example.py",
"chars": 1685,
"preview": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom crawlee.storage_clients import StorageClient\n"
},
{
"path": "docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py",
"chars": 276,
"preview": "from crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import FileSystemStorageClient\n\n# Create a new i"
},
{
"path": "docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py",
"chars": 505,
"preview": "from crawlee.configuration import Configuration\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients "
},
{
"path": "docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py",
"chars": 268,
"preview": "from crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import MemoryStorageClient\n\n# Create a new insta"
},
{
"path": "docs/guides/code_examples/storage_clients/redis_storage_client_basic_example.py",
"chars": 433,
"preview": "from crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import RedisStorageClient\n\n# Create a new instan"
},
{
"path": "docs/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py",
"chars": 823,
"preview": "from redis.asyncio import Redis\n\nfrom crawlee.configuration import Configuration\nfrom crawlee.crawlers import ParselCraw"
},
{
"path": "docs/guides/code_examples/storage_clients/registering_storage_clients_example.py",
"chars": 897,
"preview": "import asyncio\n\nfrom crawlee import service_locator\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clie"
},
{
"path": "docs/guides/code_examples/storage_clients/sql_storage_client_basic_example.py",
"chars": 550,
"preview": "from crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import SqlStorageClient\n\n\nasync def main() -> No"
},
{
"path": "docs/guides/code_examples/storage_clients/sql_storage_client_configuration_example.py",
"chars": 1140,
"preview": "from sqlalchemy.ext.asyncio import create_async_engine\n\nfrom crawlee.configuration import Configuration\nfrom crawlee.cra"
},
{
"path": "docs/guides/code_examples/storages/cleaning_do_not_purge_example.py",
"chars": 694,
"preview": "import asyncio\n\nfrom crawlee.configuration import Configuration\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingCo"
},
{
"path": "docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py",
"chars": 525,
"preview": "import asyncio\n\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n # Create storage client with config"
}
]
// ... and 435 more files (download for full content)
About this extraction
This page contains the full source code of the apify/crawlee-python GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 635 files (2.6 MB), approximately 707.4k tokens, and a symbol index with 2556 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.